**OBJECTIVE:**
    
Given a review to determine whether a review is positive or negative.

In [1]:
%matplotlib inline
import sqlite3
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

#### Establishing connection with database

In [2]:
con = sqlite3.connect('./database.sqlite')

#### SQL Query to filter the reviews

In [3]:
filtered=pd.read_sql_query("""
SELECT * FROM Reviews WHERE Score!=3
""",con)

In [4]:
def partition(x):
    if x<3:
        return 'negative'
    return 'positive'

#### Changing the numbers(1,2,3,4,5) to positive or negative reviews

In [5]:
actualScore = filtered['Score']
positiveNegative = actualScore.map(partition)
filtered['Score']=positiveNegative

In [6]:
filtered.shape
filtered.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Data Cleaning: Deduplication

In [7]:
display = pd.read_sql_query("""
SELECT * FROM Reviews WHERE Score!=3 AND UserId="AR5J8UI46CURR" ORDER BY ProductID
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


#### Sorting data according to productID

In [8]:
sorted_=filtered.sort_values('ProductId',axis=0,ascending=True)

In [9]:
final=sorted_.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first',inplace=False)
final.shape

(364173, 10)

#### To check how much %age of data still left

In [10]:
(final['Id'].size*1.0)/(filtered['Id'].size*1.0)*100

69.25890143662969

In [11]:
display = pd.read_sql_query("""
SELECT * FROM Reviews WHERE Score!=3 AND Id=44737 OR Id=64422 ORDER BY ProductID
""",con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [12]:
final = final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [13]:
print(final.shape)
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

### Bag of Words(BOW)

In [14]:
count=CountVectorizer()
final_c=count.fit_transform(final['Text'].values)

In [15]:
type(final_c)

scipy.sparse.csr.csr_matrix

In [16]:
final_c.get_shape()

(364171, 115281)

### Text Preprocessing:  Stemming, stop-word removal and Lemmatization

#### Find sentences containing HTML tags


In [17]:
import re
i=0;
for sent in final['Text'].values:
    if(len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i+=1;

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


#### Regular Expressions

In [18]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop=set(stopwords.words('english'))
sno=nltk.stem.SnowballStemmer('english')

def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr,' ',sentence)
    return cleantext

def cleanpunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return cleaned

print(stop)
print("********************************")
print(sno.stem('tasty'))

{"shan't", 'having', 'you', 'does', 'hadn', 'all', 'be', 'theirs', 'up', 't', 'whom', 'aren', 'what', 'd', "hadn't", 'can', 'do', 'herself', 'it', 'few', 'just', 'he', 'no', "should've", 'my', 'their', 's', 'a', 'haven', 'for', 'under', 'ourselves', 'until', 'hasn', "weren't", 'more', 'about', 'am', "mightn't", "you're", 'll', 'and', 'were', 'by', 'such', "won't", 'myself', "mustn't", 'himself', 'the', 'was', 'ain', "couldn't", 'itself', 'from', "that'll", 'as', 'during', "she's", 'then', "you'd", 'there', 'its', 'yours', 'isn', 'into', 'being', 'some', 'y', 'an', 'in', 'to', 'very', 'than', 'themselves', 'above', 'off', 'only', 've', 'now', "isn't", 'or', 'once', 'these', "wouldn't", 'shan', 'this', 'mightn', 'between', 'if', 'his', 'those', 'shouldn', "it's", 'other', "aren't", 'i', 'doing', 'weren', 'again', 'both', "don't", 'out', 'how', 'too', 'of', 'have', 'same', 'him', 'but', 'we', 'me', 'yourselves', 'her', "hasn't", 'which', 'had', "needn't", 'doesn', 'won', 're', "haven't", 

In [20]:
i=0
str1=' '
final_s=[]
positive_w=[]
negative_w=[]
s=''
for sent in final['Text'].values:
    filtered=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha())&(len(cleaned_words)>2)):
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf-8')
                    filtered.append(s)
                    if(final['Score'].values)[i]=='positive':
                        positive_w.append(s)
                    if(final['Score'].values)[i]=='negative':
                        negative_w.append(s)
                else:
                    continue
            else:
                continue
    str1=b" ".join(filtered)
    final_s.append(str1)
    i+=1

In [21]:
final['CleanedText']=final_s

In [22]:
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,b'witti littl book make son laugh loud recit c...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read sendak book watch realli rosi movi...
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'fun way children learn month year learn poem...


#### store final table to sqlite table for future.

In [23]:
conn=sqlite3.connect('final.sqlite')
c=conn.cursor()
conn.text_factory=str
final.to_sql('Reviews',conn,schema=None,if_exists='replace')

## Bi-Grams and n-Grams

In [24]:
freq_dist_p=nltk.FreqDist(positive_w)
freq_dist_n=nltk.FreqDist(negative_w)
print("Most Common Positive Words: ",freq_dist_p.most_common(20))
print("Most Common Negative Words: ",freq_dist_n.most_common(20))

Most Common Positive Words:  [(b'like', 139429), (b'tast', 129047), (b'good', 112766), (b'flavor', 109624), (b'love', 107357), (b'use', 103888), (b'great', 103870), (b'one', 96726), (b'product', 91033), (b'tri', 86791), (b'tea', 83888), (b'coffe', 78814), (b'make', 75107), (b'get', 72125), (b'food', 64802), (b'would', 55568), (b'time', 55264), (b'buy', 54198), (b'realli', 52715), (b'eat', 52004)]
Most Common Negative Words:  [(b'tast', 34585), (b'like', 32330), (b'product', 28218), (b'one', 20569), (b'flavor', 19575), (b'would', 17972), (b'tri', 17753), (b'use', 15302), (b'good', 15041), (b'coffe', 14716), (b'get', 13786), (b'buy', 13752), (b'order', 12871), (b'food', 12754), (b'dont', 11877), (b'tea', 11665), (b'even', 11085), (b'box', 10844), (b'amazon', 10073), (b'make', 9840)]


In [25]:
count_vect=CountVectorizer(ngram_range=(1,2))#1 gram, 2 gram
final_bigram_c=count_vect.fit_transform(final['Text'].values)

In [26]:
final_bigram_c.get_shape()

(364171, 2910192)

## Tf-Idf

In [27]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(final['Text'].values)

In [28]:
final_tf_idf.get_shape()

(364171, 2910192)

In [29]:
features=tf_idf_vect.get_feature_names()
len(features)

2910192

In [30]:
features[100000:100010]

['ales until',
 'ales ve',
 'ales would',
 'ales you',
 'alessandra',
 'alessandra ambrosia',
 'alessi',
 'alessi added',
 'alessi also',
 'alessi and']

In [31]:
# convert a row in sparsematrix to a numpy array
print(final_tf_idf[3,:].toarray()[0])

[0. 0. 0. ... 0. 0. 0.]


In [32]:
def top_tfidf_feats(row, features, top_n=25):
    topn_ids=np.argsort(row)[::-1][:top_n]
    top_feats=[(features[i], row[i])]
    df = pd.DataFrame(top_feats)
    df.columns=['feature','tfidf']
    return df

top_tfidf = top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features,25)

In [33]:
top_tfidf

Unnamed: 0,feature,tfidf
0,bored one,0.0


### Word2Vect


In [36]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

In [40]:
import gensim
i=0
list_sent=[]
for sent in final['Text'].values:
    filtered_s=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_w in cleanpunc(w).split():
            if(cleaned_w.isalpha()):
                filtered_s.append(cleaned_w.lower())
            else:
                continue
    list_sent.append(filtered_s)

In [41]:
print(final['Text'].values[0])
print('----------------------------------')
print(list_sent[0])

this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college
----------------------------------
['this', 'witty', 'little', 'book', 'makes', 'my', 'son', 'laugh', 'at', 'loud', 'i', 'recite', 'it', 'in', 'the', 'car', 'as', 'were', 'driving', 'along', 'and', 'he', 'always', 'can', 'sing', 'the', 'refrain', 'hes', 'learned', 'about', 'whales', 'india', 'drooping', 'i', 'love', 'all', 'the', 'new', 'words', 'this', 'book', 'introduces', 'and', 'the', 'silliness', 'of', 'it', 'all', 'this', 'is', 'a', 'classic', 'book', 'i', 'am', 'willing', 'to', 'bet', 'my', 'son', 'will', 'still', 'be', 'able', 'to', 'recite', 'from', 'memory', 'when', 'he', 'is', 'in', 'college']


In [42]:
w2v_model=gensim.models.Word2Vec(list_sent,min_count=5,size=50,workers=4)

In [43]:
words=list(w2v_model.wv.vocab)
print(len(words))

33783


In [44]:
w2v_model.wv.most_similar('tasty')

[('tastey', 0.8986448645591736),
 ('yummy', 0.8659712076187134),
 ('satisfying', 0.8476494550704956),
 ('delicious', 0.8186174035072327),
 ('filling', 0.8180630207061768),
 ('tasteful', 0.8161405920982361),
 ('flavorful', 0.8038270473480225),
 ('addicting', 0.7637023329734802),
 ('nutritious', 0.762311577796936),
 ('versatile', 0.7530726194381714)]

In [45]:
w2v_model.wv.most_similar('like')

[('resemble', 0.7276737689971924),
 ('dislike', 0.661806583404541),
 ('mean', 0.6591792106628418),
 ('prefer', 0.6584094166755676),
 ('overpower', 0.6121314764022827),
 ('enjoy', 0.6037275195121765),
 ('think', 0.6002272367477417),
 ('miss', 0.5884632468223572),
 ('overwhelm', 0.5825260281562805),
 ('fake', 0.5701082348823547)]

In [47]:
count_feat=count_vect.get_feature_names()
count_feat.index('like')
print(count_feat[64055])

activity great


### Average w2v, TFIDF-w2v

#### compute average word2vec for each review

In [48]:
sent_vectors=[];
for sent in list_sent:
    sent_vect=np.zeros(50)
    count=0;
    for word in sent:
        try:
            vec = w2v_model.wv[word]
            sent_vect += vec
            count += 1
        except:
            pass
    sent_vect/=count
    sent_vectors.append(sent_vect)
print(len(sent_vectors))
print(len(sent_vectors[0]))

  if sys.path[0] == '':


364171
50
