# Word2Vec on Amazon Food review Dataset

Word2Vec is a machine learning algorithm that can be used to convert text data into vectors.
The Amazon Fine Food Reviews dataset is available on Kaggle (https://www.kaggle.com/snap/amazon-fine-food-reviews).

In this IPython notebook, I have performed the following steps:
    1. Loading data and assigning polarity
    2. Data cleaning by removing duplicate entries and invalid information
    3. Sort the data and sample it.
    4. Data Preprocessing:
        a. removing stop words
        b. removing punctuation and HTML tags, if any
        c. stemming
        d. converting all words to lower case
    5. Split data into train and test.
    6. Vectorize reviews using Word2Vec. Save this data.
   

1.Load data and assign polarity to reviews

In [1]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Open the SQLite file that holds the Reviews table.
# NOTE(review): this is an absolute, machine-specific path — point it at your local copy.
con = sqlite3.connect(r'C:\Users\Admin\Downloads\database.sqlite')


# Load only the reviews that can be labelled positive or negative:
# rows with the neutral score of 3 are excluded by the query itself.
review_data = pd.read_sql_query(
    sql="""
SELECT *
FROM Reviews
WHERE Score != 3""",
    con=con,
)


# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    """Turn a numeric review score into a binary polarity label.

    Returns 0 (negative) when `x` is below 3 and 1 (positive) otherwise.
    """
    return 0 if x < 3 else 1

# Replace the raw star rating in `Score` with the 0/1 polarity label from `partition`
review_data['Score'] = review_data['Score'].apply(partition)

2.Data Cleaning

In [2]:
# Remove duplicate reviews: rows that share the same user, profile name,
# timestamp and text are treated as one review and only the first is kept.
# `subset` is passed as a list because pandas documents it as a column label
# or a sequence of labels; a set is unordered and is not a sequence.
review_data = review_data.drop_duplicates(
    subset=['UserId', 'ProfileName', 'Time', 'Text'],
    keep='first',
)

In [3]:
# Drop invalid rows: keep a review only when its HelpfulnessNumerator does not
# exceed its HelpfulnessDenominator.
has_valid_helpfulness = review_data['HelpfulnessNumerator'] <= review_data['HelpfulnessDenominator']
cleaned_data = review_data[has_valid_helpfulness]

3.Data sampling

In [4]:
# Order the reviews newest-first and keep the first 100,000 of them.
# The sort returns a new frame instead of mutating in place, and the sample is
# an explicit copy: later cells add a column to `sampled_data` and re-sort it,
# which must not happen on a slice of `cleaned_data` (SettingWithCopyWarning).
cleaned_data = cleaned_data.sort_values('Time', ascending=False)
#sampled_data = cleaned_data.sample(frac=0.275,random_state=1), time series split function can also be used.
sampled_data = cleaned_data.iloc[:100000].copy()

4.Data preprocessing

In [5]:
import re
import string
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english')) # English stop words as a set; the cleaning loops test each token for membership in it
sno = nltk.stem.SnowballStemmer('english') # English Snowball stemmer; its .stem() is applied to every token that is kept

def cleanhtml(sentence):
    """Replace every HTML-style tag in `sentence` with a single space.

    A tag is the shortest run of characters that starts at '<' and ends at the
    next '>' on the same line. Text without such a pair is returned unchanged.
    """
    return re.sub('<.*?>', ' ', sentence)
def cleanpunc(sentence):
    """Remove or blank out punctuation in `sentence`.

    Question marks, pipes, exclamation marks, apostrophes, double quotes and
    hash signs are deleted outright. Full stops, commas, parentheses and
    forward slashes are each replaced by one space.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)

In [6]:
 i=0
str1=' '
final_string=[]
s=''
for sent in sampled_data['Text'].values:
    filtered_sentence=[]
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                else:
                    continue
            else:
                continue 

    str1 = b" ".join(filtered_sentence) #final string of cleaned words 
    final_string.append(str1)

In [7]:
# Attach the cleaned text as a new column. `assign` returns a new frame, so the
# column is never written into a frame that may be a slice of `cleaned_data`
# (the situation pandas reports as SettingWithCopyWarning).
sampled_data = sampled_data.assign(CleanedText=final_string)

5.Split data into train and test

In [8]:
#We use 70% of data for training and 30% of data for test
import math
# Time-based split: order reviews oldest-first, train on the first 70% and
# test on the remaining 30%.
sampled_data = sampled_data.sort_values('Time', ascending=True)

# Single boundary shared by train and test. The test slices previously started
# at ceil(n * .3), which made the test set the last 70% of the rows and let it
# overlap the training rows.
# NOTE(review): any later positional split of per-row features must use this
# same 70% boundary to stay aligned with y_train / y_test.
split_idx = math.ceil(len(sampled_data) * .7)

X_train = sampled_data[:split_idx]
X_test = sampled_data[split_idx:]
y_train = sampled_data['Score'][:split_idx]
y_test = sampled_data['Score'][split_idx:]

6.Convert text to vector

Word2Vec

In [9]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim

In [10]:
# Tokenise the training reviews for Word2Vec: strip HTML tags and punctuation,
# then keep lower-cased alphabetic words that are not English stop words.
# (No stemming and no minimum length here, unlike the CleanedText column.)
list_of_sent_train = []
for sent in X_train['Text'].values:
    filtered_sentence = [
        cleaned_word.lower()
        for w in cleanhtml(sent).split()
        for cleaned_word in cleanpunc(w).split()
        if cleaned_word.isalpha() and cleaned_word.lower() not in stop
    ]
    list_of_sent_train.append(filtered_sentence)

In [11]:
# Fit the Word2Vec embedding on the tokenised training reviews only.
# NOTE(review): `size` is the pre-4.0 gensim name for the vector dimension
# (gensim 4 calls it `vector_size`) — confirm against the installed version.
w2v_model = gensim.models.Word2Vec(
    sentences=list_of_sent_train,
    size=50,
    min_count=5,
    workers=2,
)

In [12]:
# Tokenise every sampled review (train and test) the same way as the training
# sentences: strip HTML tags and punctuation, then keep lower-cased alphabetic
# words that are not English stop words.
list_of_sent = []
for sent in sampled_data['Text'].values:
    filtered_sentence = [
        cleaned_word.lower()
        for w in cleanhtml(sent).split()
        for cleaned_word in cleanpunc(w).split()
        if cleaned_word.isalpha() and cleaned_word.lower() not in stop
    ]
    list_of_sent.append(filtered_sentence)

In [13]:
# Average Word2Vec: represent each review by the mean of the vectors of its
# tokens that are present in the trained model's vocabulary.
vector_dim = w2v_model.vector_size  # dimension the model was trained with

sent_vectors = []
for sent in list_of_sent:
    sent_vec = np.zeros(vector_dim)
    cnt_words = 0
    for word in sent:
        try:
            sent_vec += w2v_model.wv[word]
        except KeyError:
            # the word has no vector in the model (e.g. dropped by min_count)
            continue
        cnt_words += 1
    # A review with no in-vocabulary word keeps the zero vector; dividing by a
    # zero count would fill the vector with NaN.
    if cnt_words:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)

In [14]:
# Split the review vectors at the 70% boundary: first 70% train, last 30% test.
# The test slice previously started at ceil(n * .3), which made it the last 70%
# of the rows and let it overlap the training rows.
split_idx = math.ceil(len(sampled_data) * .7)
X_train = sent_vectors[:split_idx]  #final_w2v_count
X_test = sent_vectors[split_idx:]  #final_w2v_count

# Labels are re-sliced at the same boundary so that features and labels stay
# aligned row for row.
y_train = sampled_data['Score'][:split_idx]
y_test = sampled_data['Score'][split_idx:]

In [15]:
import hickle as hkl 
# Persist the averaged Word2Vec features with hickle ...
for features, target_path in ((X_train, 'avgw2v_train.hkl'), (X_test, 'avgw2v_test.hkl')):
    hkl.dump(features, target_path)

# ... and the matching labels as header-bearing, index-free CSV files.
for labels, target_path in ((y_train, 'y_train'), (y_test, 'y_test')):
    labels.to_csv(target_path, encoding='utf-8', index=False, header=True)

In [None]:
# import hickle as hkl 
# import pandas as pd
# X_train = hkl.load( 'avgw2v_train.hkl' )
# X_test = hkl.load( 'avgw2v_test.hkl' )
# y_train=pd.read_csv("y_train") 
# y_test=pd.read_csv("y_test")

In [16]:
# Sanity check of the learned embedding: list the words most similar to 'good'.
w2v_model.wv.most_similar('good')

[('great', 0.8194330930709839),
 ('decent', 0.7578660249710083),
 ('awesome', 0.752179741859436),
 ('fantastic', 0.7468140125274658),
 ('nice', 0.6891762018203735),
 ('excellent', 0.6756437420845032),
 ('amazing', 0.6694375276565552),
 ('wonderful', 0.6658943891525269),
 ('terrific', 0.6564239263534546),
 ('fabulous', 0.6526327133178711)]

In [18]:
# import hickle as hkl 
# import pandas as pd
# X_train = hkl.load( 'avgw2v_train.hkl' )
# X_test = hkl.load( 'avgw2v_test.hkl' )
# y_train=pd.read_csv("y_train") 
# y_test=pd.read_csv("y_test")

In [17]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train, y_train)
# brute_pred = knn.predict(X_test)