In [1]:
## Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sqlite3    ## SQL Interface
import pickle     ## Used to save your data - Converts objects to byte stream and vice versa

from sklearn.feature_extraction.text import CountVectorizer  ## BOW Model
from sklearn.feature_extraction.text import TfidfVectorizer  ## TFIDF Model

from sklearn.manifold import TSNE    ## To visualize high dimensional data

## Modules to perform Text Preprocessing
import re
import nltk
from nltk.corpus import stopwords

import gensim    ## To build Word2Vec model



In [2]:
conn = sqlite3.connect('final.sqlite')
final = pd.read_sql_query("""SELECT * FROM Reviews""", conn)
conn.close()
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText,CleanedNoStem
0,540125,B002W08W56,A2STPOZAT37RNE,Betty Baker,1,1,1,1300320000,"Wonderful, but surprised to find this size can...",Great beans....the best texture hands down. T...,great bean best textur hand seem great tast do...,great beans best texture hands seem great tast...
1,31763,B000UXWQMC,A115Y8R40DCT3I,Guy,0,0,1,1345161600,better than expected,I usually buy whole pitted olives and when I b...,usual buy whole pit oliv bought good wasnt who...,usually buy whole pitted olives bought good wa...
2,509504,B000P09RJA,A2BR8LGC8N6XQD,dj2u,0,4,0,1278374400,Wrong size bars sent,The first shipment was just as ordered. This ...,first shipment order last shipment not bar wen...,first shipment ordered last shipment not bars ...
3,27728,B000K8ESBY,A1RYTIMIWPDJ4O,"Bill Ison ""Bill""",0,0,1,1256169600,Coffee flavoring,"<a href=""http://www.amazon.com/gp/product/B000...",flavour creation coffe flavor tablet french va...,flavour creations coffee flavoring tablets fre...
4,128657,B001AVJT0K,AJ6SZ4YAPOOO7,Mwebi,0,0,1,1348099200,They all come running,"I have the pickiest cats, they hate 99/100 kin...",pickiest cat hate kind food better vari otherw...,pickiest cats hate kinds food better vary othe...


In [3]:
stop = set(stopwords.words('english')) #set of stopwords
lst = ['won', 'nor', 'not', 'against']
for word in lst:
    stop.remove(word)
print(stop)

{'should', 'themselves', 'her', 'there', 'did', 'herself', 'aren', 'you', 'between', 'until', 'been', 'both', 'couldn', 'had', 'my', 'll', 'doesn', 'through', 'wouldn', 'them', 'which', 'their', 'ours', 'yourselves', 'who', 'into', 'before', 'other', 've', 'its', 'down', 'more', 'here', 'yours', 'does', 'has', 'only', 'it', 'me', 'in', 'his', 'over', 'while', 'that', 'when', 'having', 'mustn', 'isn', 'doing', 'whom', 'but', 'mightn', 'ain', 'each', 'o', 'shan', 'itself', 'of', 'during', 'shouldn', 'be', 'him', 'again', 'under', 'out', 'or', 'how', 'further', 'he', 'same', 'once', 'weren', 'yourself', 'theirs', 'your', 'by', 'why', 'were', 'are', 'from', 'they', 'himself', 'and', 'haven', 'y', 'an', 'a', 'hers', 'than', 'very', 'what', 'needn', 'm', 'myself', 'own', 'for', 're', 'all', 'if', 'few', 'ourselves', 'where', 'about', 'such', 'do', 'this', 'can', 'those', 'is', 'hadn', 'hasn', 'below', 'now', 'so', 'as', 'being', 'will', 'up', 'she', 'too', 'because', 'after', 'i', 'was', 'to

In [4]:
sno = nltk.stem.SnowballStemmer('english')

In [5]:
def cleanhtml(sentence):
    '''This function removes all the html tags in the given sentence'''
    cleanr = re.compile('<.*?>')    ## find the index of the html tags
    cleantext = re.sub(cleanr, ' ', sentence)  ## Substitute <space> in place of any html tag
    return cleantext

def cleanpunc(sentence):
    '''This function cleans all the punctuation or special characters from a given sentence'''
    cleaned = re.sub(r'[?|@|!|^|%|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

In [11]:
def preprocessing(series):
    '''The function takes a Pandas Series object containing text in all the cells
       And performs following Preprocessing steps on each cell:
       1. Clean text from html tags
       2. Clean text from punctuations and special characters
       3. Retain only non-numeric Latin characters with lenght > 2
       4. Remove stopwords from the sentence
       5. Apply stemming to all the words in the sentence
       
       Return values:
       1. final_string - List of cleaned sentences
       2. list_of_sent - List of lists which can be used as input to the W2V model'''
    
    i = 0
    str1=" "
    final_string = []    ## This list will contain cleaned sentences
    list_of_sent = []    ## This is a list of lists used as input to the W2V model at a later stage
    
    
    for sent in series.values:
        ## 
        filtered_sent = []
        sent = cleanhtml(sent)    ## Clean the HTML tags
        sent = cleanpunc(sent)    ## Clean the punctuations and special characters
        ## Sentences are cleaned and words are handled individually
        for cleaned_words in sent.split():
            ## Only consider non-numeric words with length at least 3
            if((cleaned_words.isalpha()) and (len(cleaned_words) > 2)):
                ## Only consider words which are not stopwords and convert them to lowet case
                if(cleaned_words.lower() not in stop):
                    ## Apply snowball stemmer and add them to the filtered_sent list
                    #s = (sno.stem(cleaned_words.lower()))#.encode('utf-8')
                    s = cleaned_words.lower()
                    filtered_sent.append(s)    ## This contains all the cleaned words for a sentence
        ## Below list is a list of lists used as input to W2V model later
        list_of_sent.append(filtered_sent)
        ## Join back all the words belonging to the same sentence
        str1 = " ".join(filtered_sent)
        ## Finally add the cleaned sentence in the below list
        final_string.append(str1)
        #print(i)
        i += 1
    return final_string, list_of_sent

In [12]:
final_string, list_of_sent = preprocessing(final['Text'])

In [13]:
len(final_string)

364171

In [14]:
len(list_of_sent)

364171

In [16]:
final['CleanedNoStem']=final_string

In [21]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText,CleanedNoStem
0,540125,B002W08W56,A2STPOZAT37RNE,Betty Baker,1,1,1,1300320000,"Wonderful, but surprised to find this size can...",Great beans....the best texture hands down. T...,great bean best textur hand seem great tast do...,great beans best texture hands seem great tast...
1,31763,B000UXWQMC,A115Y8R40DCT3I,Guy,0,0,1,1345161600,better than expected,I usually buy whole pitted olives and when I b...,usual buy whole pit oliv bought good wasnt who...,usually buy whole pitted olives bought good wa...
2,509504,B000P09RJA,A2BR8LGC8N6XQD,dj2u,0,4,0,1278374400,Wrong size bars sent,The first shipment was just as ordered. This ...,first shipment order last shipment not bar wen...,first shipment ordered last shipment not bars ...
3,27728,B000K8ESBY,A1RYTIMIWPDJ4O,"Bill Ison ""Bill""",0,0,1,1256169600,Coffee flavoring,"<a href=""http://www.amazon.com/gp/product/B000...",flavour creation coffe flavor tablet french va...,flavour creations coffee flavoring tablets fre...
4,128657,B001AVJT0K,AJ6SZ4YAPOOO7,Mwebi,0,0,1,1348099200,They all come running,"I have the pickiest cats, they hate 99/100 kin...",pickiest cat hate kind food better vari otherw...,pickiest cats hate kinds food better vary othe...


In [22]:
conn = sqlite3.connect('final.sqlite')
c=conn.cursor()
final.to_sql('Reviews', conn, if_exists='replace', index = False)
conn.close()

#### Write list_of_sent

In [24]:
with open('list_of_sent_nostem_input_to_w2v.pkl', 'wb') as pickle_file:
    pickle.dump(list_of_sent, pickle_file)

#### Read list_of_sent

In [3]:
with open('list_of_sent_nostem_input_to_w2v.pkl', 'rb') as pickle_file:
    list_of_sent = pickle.load(pickle_file)

#### Read Google W2V model

In [5]:
with open('google_word2vec_model', 'rb') as pickle_file:
    google_w2v = pickle.load(pickle_file)

In [28]:
type(google_w2v)

dict

In [29]:
len(google_w2v)

46603

In [5]:
word = 'trying'
google_w2v[word]

array([ 0.4375    ,  0.25195312,  0.15332031, -0.01312256, -0.07714844,
        0.00747681,  0.16113281, -0.03393555, -0.17285156, -0.22460938,
       -0.02160645, -0.01831055, -0.23242188, -0.24511719, -0.33007812,
       -0.19042969, -0.05908203,  0.16992188,  0.19433594, -0.12890625,
       -0.13867188,  0.05419922,  0.03564453, -0.31640625, -0.03881836,
       -0.18847656, -0.08154297,  0.25585938,  0.01940918, -0.00349426,
        0.30664062,  0.07568359,  0.15136719, -0.27148438,  0.02380371,
        0.16796875,  0.17382812, -0.05297852,  0.19726562,  0.03759766,
       -0.05200195, -0.171875  ,  0.04150391, -0.24121094, -0.18359375,
        0.07910156, -0.02587891,  0.15625   ,  0.03198242, -0.02978516,
       -0.18847656,  0.08447266,  0.0255127 ,  0.05859375,  0.09765625,
        0.19238281,  0.03833008, -0.08398438,  0.21582031,  0.02600098,
        0.07958984,  0.3203125 , -0.19433594, -0.12890625,  0.14355469,
        0.08886719,  0.14257812,  0.25195312,  0.03112793, -0.27

#### Avg W2V

In [6]:
def calc_avg_w2v(list_of_sent, w2v_model):
    sent_vectors = []
    for sent in list_of_sent:
        sent_vec = np.zeros(300)
        cnt_words = 0
        for word in sent:
            try:
                vec = w2v_model[word]
                sent_vec += vec
                cnt_words += 1
            except:
                pass
        sent_vec /= cnt_words
        sent_vectors.append(sent_vec)
    return sent_vectors

In [7]:
sent_vectors = calc_avg_w2v(list_of_sent, google_w2v)

  del sys.path[0]


In [8]:
with open('avg_w2v.pkl', 'wb') as pickle_file:
    pickle.dump(sent_vectors, pickle_file)

In [9]:
len(sent_vectors)

364171

In [10]:
len(sent_vectors[0])

300

In [11]:
type(sent_vectors)

list

#### TFIDF Avg W2V

In [4]:
tf_idf_vect = TfidfVectorizer()
tf_idf = tf_idf_vect.fit_transform(final['CleanedNoStem'].values)
tfidf_feat = tf_idf_vect.get_feature_names()
tf_idf.shape

(364171, 102974)

In [11]:
def calc_tfidf_avg_w2v(list_of_sent, w2v_model, tf_idf, tfidf_feat, start_row):
    tfidf_sent_vectors = []
    for sent in list_of_sent:
        sent_vec = np.zeros(300)
        weighted_sum = 0
        for word in sent:
            try:
                vec = w2v_model[word]
                tfidf = tf_idf[start_row, tfidf_feat.index(word)]
                sent_vec += vec*tfidf
                weighted_sum += tfidf
            except:
                pass
        sent_vec /= weighted_sum
        print(start_row, weighted_sum)
        tfidf_sent_vectors.append(sent_vec)
        start_row += 1
    return tfidf_sent_vectors

In [23]:
tfidf_sent_vectors = calc_tfidf_avg_w2v(list_of_sent[:100], google_w2v, tf_idf, tfidf_feat, 0)

0 7.05824484167
1 4.29534165439
2 3.15120498183
3 3.34949036076
4 5.90668317375
5 3.33076784704
6 3.77942915256
7 2.48461900791
8 4.21546345129
9 6.96560725258
10 3.93934246931
11 3.84195775195
12 4.98158353998
13 1.58615224732
14 3.4877487538
15 5.88402226711
16 8.9535468818
17 3.59974036299
18 4.25337415165
19 9.44081055633
20 7.26330532951
21 3.0750549686
22 11.6694406683
23 3.61914919876
24 5.39481795571
25 1.52897695443
26 2.32254104369
27 11.4175071738
28 6.31762490448
29 2.19051568024
30 4.27018640742
31 15.4152430309
32 7.85542812272
33 2.19833610892
34 3.59112968878
35 3.7971775038
36 5.7191392331
37 3.01143468736
38 2.93456570865
39 4.25989606053
40 1.14385403151
41 5.19366216375
42 3.19059518485
43 3.66991837376
44 5.24235573987
45 3.67440347387
46 2.31787998968
47 7.06735126407
48 2.64584687892
49 1.97596748595
50 3.13610345881
51 4.64506040729
52 7.49827390805
53 3.6904498765
54 4.44422904449
55 4.13094397305
56 2.53648013992
57 3.12483581466
58 1.99670307963
59 6.02322502