In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import matplotlib.cm as cm #color plot
import matplotlib
import seaborn as sns
import csv
from math import *

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.decomposition import LatentDirichletAllocation,TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import re
import unicodedata


                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org
  ipython-dev@scipy.org""")
  from numpy.core.umath_tests import inner1d


### Reading in the training csv data as a df

In [2]:
reviews_train = pd.read_csv('train.csv')
reviews_train = reviews_train.drop(columns=['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'unixReviewTime'])
reviews_train

Unnamed: 0,overall,reviewText,summary,categoryName
0,1,I bought this because my audio cord for the ca...,didn't help,CellPhone
1,1,"Arrived ok, but within a week started to reboo...",Coby Kyros 8 inch tablet,Electronics
2,5,Came on time and had a super long cord and had...,Awesome Awesome Awesome surge protector,Electronics
3,3,The Waterboys of the 80's are tough to discern...,Mike Scott in a weary land,CDVinyl
4,5,My laptop only have a micro hdmi output and th...,"Good quality, work as expected",Electronics
5,5,"Excellent fit, finish, and quality at a reason...",Excellent fit and finish.,Electronics
6,5,This has a fold-out plug for charging at a wal...,"Works great, batteries are not as pictured tho",Electronics
7,5,I've been using this case for 6 months now and...,Perfect for my N7,Electronics
8,4,I have an older xbox with the 20 gb hard drive...,Purchased for xbox storage,Electronics
9,4,This case fits my daughter's S-2110 tablet per...,it fits!!!!!!,Electronics


### Reading in the testing csv data as a df

In [3]:
reviews_test = pd.read_csv('test.csv')

reviews_test = reviews_test[['reviewText', 'summary', 'categoryName']]

reviews_test


Unnamed: 0,reviewText,summary,categoryName
0,This is black metal by definition with the evi...,pure black metal,CDVinyl
1,"Another classic, great for the family movie by...",My Neighbor Tortoro,MovieTV
2,I'm a big proponent for LG televisions because...,LG Television for the bedroom.,Electronics
3,Rock 'n Roll lost a true guitar legend when Al...,Alvin Lee......a true guitar legend,CDVinyl
4,I know I might stir the pot with this review. ...,One Eyed Jacks Blu-Ray Review,MovieTV
5,This product was shipped normal and arrived sl...,Seller stepped up,Electronics
6,"It works for ipad 3 too. Great protector, no c...",works for ipad 3 too,Electronics
7,Bought these to save some money on batteries. ...,Can't Charge Me,Electronics
8,it fits well and is great for taking a netbook...,is OK for price,Electronics
9,"I have seen Holmes in many guises, most of whi...",Holmes as Doyle would know him,MovieTV


### Assign each review into three categories then only pick out the reviews with 45 words or more

In [4]:

cat = ['low','neutral','high']
def cat_y(y):
    if y<=2.0:
        return cat[0]
    elif y>=4.0:
        return cat[2]
    else:
        return cat[1]

reviews_train = reviews_train[reviews_train['reviewText'].notnull()] # Makes sure the reviews are not NaN (empty cells)
reviews_train = reviews_train[reviews_train['reviewText'].apply(lambda x: len(x.split())>=45)] #splits the data into those with more than 45 word reviews
reviews_train['bucket'] = reviews_train['overall'].apply(cat_y) # apply the category sorting


#dropped rows having NaN values
reviews_train = reviews_train.dropna()

#re-index the dataframe
reviews_train = reviews_train.reset_index()
del reviews_train['index']

reviews_train

#shows the dataframe with the new category rankings and reviews that are more than 45 characters long

Unnamed: 0,overall,reviewText,summary,categoryName,bucket
0,5,Came on time and had a super long cord and had...,Awesome Awesome Awesome surge protector,Electronics,high
1,3,The Waterboys of the 80's are tough to discern...,Mike Scott in a weary land,CDVinyl,neutral
2,5,I've been using this case for 6 months now and...,Perfect for my N7,Electronics,high
3,4,I have an older xbox with the 20 gb hard drive...,Purchased for xbox storage,Electronics,high
4,4,the story is fast paced without being obnoxiou...,one of the better ones...,MovieTV,high
5,4,The movie arrived as promised and in excellent...,Romeo Must Die....again?,MovieTV,high
6,5,A must have if you are cloning laptop hard dri...,Excellent piece of gear,Electronics,high
7,5,"For the price vs performance, I really don't k...",Works and is simple and price is right,Electronics,high
8,4,Summary - Like a funny version of any Star Tre...,Funny satire on a bunch of space franchises,MovieTV,high
9,5,You can tell from a lot of review titles that ...,"Yes, it works!",Electronics,high


In [5]:
reviews_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20583 entries, 0 to 20582
Data columns (total 5 columns):
overall         20583 non-null int64
reviewText      20583 non-null object
summary         20583 non-null object
categoryName    20583 non-null object
bucket          20583 non-null object
dtypes: int64(1), object(4)
memory usage: 804.1+ KB


In [6]:
reviews_train.describe()

Unnamed: 0,overall
count,20583.0
mean,4.066511
std,1.232317
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [7]:
info = pd.pivot_table(reviews_train,index=['categoryName'],values=['overall', 'reviewText'],
               columns=[],aggfunc=[np.sum, np.mean],fill_value=0)

print(info.head())


info_test = pd.pivot_table(reviews_test,index=['categoryName'],
               columns=[],aggfunc=[np.sum],fill_value=0)

print(info_test.head())


                 sum      mean
             overall   overall
categoryName                  
CDVinyl        11660  4.388408
CellPhone       5791  3.955601
Electronics    39447  4.033023
MovieTV        26803  4.011825
                                                            sum  \
                                                     reviewText   
categoryName                                                      
CDVinyl       This is black metal by definition with the evi...   
CellPhone     This is a really great product that does exact...   
Electronics   I'm a big proponent for LG televisions because...   
MovieTV       Another classic, great for the family movie by...   
VideoGames    These headphones are great but nothing is perf...   

                                                                 
                                                        summary  
categoryName                                                     
CDVinyl       pure black metalAlvin Lee......a t

### Getting rid of contractions

In [8]:
#Getting rid of contractions
cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "i've": "I have",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text, c_re=c_re):
    text = text.lower()
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text.lower())


all_reviews = reviews_train['reviewText']

all_clean_train_reviews = []

for i in range(0,20583):
    all_clean_train_reviews.append(expandContractions(all_reviews[i]))
    
reviews_train['reviewText_clean'] = all_clean_train_reviews

reviews_train['reviewText_clean']


0        came on time and had a super long cord and had...
1        the waterboys of the 80's are tough to discern...
2        I have been using this case for 6 months now a...
3        i have an older xbox with the 20 gb hard drive...
4        the story is fast paced without being obnoxiou...
5        the movie arrived as promised and in excellent...
6        a must have if you are cloning laptop hard dri...
7        for the price vs performance, i really do not ...
8        summary - like a funny version of any star tre...
9        you can tell from a lot of review titles that ...
10       i thought that this movie was pretty good. it ...
11       i live about 35 miles due west of chicago and ...
12       i fell for the hype.when deutsche grammophon a...
13       i bought this set of dvds when i heard that th...
14       this is basically the middle in the riddick sa...
15       arrived on time . this was a very good dvd for...
16       i always love diana's music. she has something.

In [9]:
all_reviews_test = reviews_test['reviewText']

all_clean_test_reviews = []

for i in range(0,1839):
    all_clean_test_reviews.append(expandContractions(all_reviews_test[i]))
    
reviews_test['reviewText_clean'] = all_clean_test_reviews

reviews_test['reviewText_clean']

0       this is black metal by definition with the evi...
1       another classic, great for the family movie by...
2       i'm a big proponent for lg televisions because...
3       rock 'n roll lost a true guitar legend when al...
4       i know i might stir the pot with this review. ...
5       this product was shipped normal and arrived sl...
6       it works for ipad 3 too. great protector, no c...
7       bought these to save some money on batteries. ...
8       it fits well and is great for taking a netbook...
9       i have seen holmes in many guises, most of whi...
10      i love everything about this camera except one...
11      it is cute, it has a nice color and attractive...
12      i could list a whole string of adjectives, but...
13      yuck frontman daniel blumberg made a curious d...
14      a handy item. when i put a long hdmi cable ins...
15      i first saw ryan adams in the 1990s with whisk...
16      i really like that it charges 2 devices at one...
17      cute a

In [10]:
# add number of characters column
reviews_train["nb_chars"] = reviews_train["reviewText"].apply(lambda x: len(x))

# add number of words column
reviews_train["nb_words"] = reviews_train["reviewText"].apply(lambda x: len(x.split(" ")))

In [11]:
# return the wordnet object value corresponding to the POS tag

import string
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    #stemming
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
reviews_train['reviewText_clean'] = reviews_train['reviewText_clean'].apply(lambda x: clean_text(x))


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\there\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\there\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\there\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# clean testing data
reviews_test['reviewText_clean'] = reviews_test['reviewText_clean'].apply(lambda x: clean_text(x))


In [13]:
reviews_train['reviewText_clean']

0        come time super long cord room plug everyth or...
1        waterboy tough discern fan big music celtic fo...
2        use case month still absolut love still sign w...
3        old xbox gb hard drive get full need purchas a...
4        stori fast pace without obnoxi know get jason ...
5        movi arriv promis excel condition.....lov movi...
6        must clone laptop hard drive work straight box...
7        price perform realli know el say purchas two r...
8        summari like funni version star trek episod ti...
9        tell lot review titl effect clean come somewha...
10       think movi pretti good good actor good directo...
11       live mile due west chicago abl get normal netw...
12       fell hype.when deutsch grammophon announc rele...
13       buy set dvd heard cw go remak seri modern tele...
14       basic middl riddick saga anim chronicl survivo...
15       arriv time good dvd grandchild love danceto so...
16       alway love diana' music someth differ offer al.

In [14]:
reviews_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20583 entries, 0 to 20582
Data columns (total 8 columns):
overall             20583 non-null int64
reviewText          20583 non-null object
summary             20583 non-null object
categoryName        20583 non-null object
bucket              20583 non-null object
reviewText_clean    20583 non-null object
nb_chars            20583 non-null int64
nb_words            20583 non-null int64
dtypes: int64(3), object(5)
memory usage: 1.3+ MB


### Vectorizing text with gensim

In [15]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import word2vec
from gensim.test.utils import get_tmpfile, common_texts
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Therese code for gensim

# note: reviewText_clean is the cleaned text after applying the clean_text function

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews_train["reviewText_clean"].apply(lambda x: x.split(" ")))]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector data
doc2vec_df = reviews_train["reviewText_clean"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
reviews_train = pd.concat([reviews_train, doc2vec_df], axis=1)

In [None]:
# gensim for the test data

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews_test["reviewText_clean"].apply(lambda x: x.split(" ")))]

# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector data
doc2vec_df_test = reviews_test["reviewText_clean"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec_df_test.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df_test.columns]
reviews_test = pd.concat([reviews_test, doc2vec_df_test], axis=1)

In [None]:
reviews_train

In [None]:
# add tf-idfs columns for training data
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(reviews_train["reviewText_clean"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.index = reviews_train.index
reviews_df = pd.concat([reviews_train, tfidf_df], axis=1)

In [None]:
# add tf-idfs columns for testing data

tfidf = TfidfVectorizer(min_df = 10)
tfidf_result_test = tfidf.fit_transform(reviews_test["reviewText_clean"]).toarray()
tfidf_df_test = pd.DataFrame(tfidf_result_test, columns = tfidf.get_feature_names())
tfidf_df_test.columns = ["word_" + str(x) for x in tfidf_df_test.columns]
tfidf_df_test.index = reviews_test.index
reviews_df = pd.concat([reviews_test, tfidf_df_test], axis=1)

In [None]:
# add sentiment anaylsis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
reviews_train["sentiments"] = reviews_train["reviewText_clean"].apply(lambda x: sid.polarity_scores(x))
reviews_train = pd.concat([reviews_train.drop(['sentiments'], axis=1), reviews_train['sentiments'].apply(pd.Series)], axis=1)

reviews_train

In [None]:
reviews_test["sentiments"] = reviews_test["reviewText_clean"].apply(lambda x: sid.polarity_scores(x))
reviews_test = pd.concat([reviews_test.drop(['sentiments'], axis=1), reviews_test['sentiments'].apply(pd.Series)], axis=1)

reviews_test

### NLTK

In [None]:
all_reviews = reviews_train['reviewText_clean']
all_sent_values = []
all_sentiments = []

all_reviews_test = reviews_test['reviewText_clean']
all_sent_values_test = []
all_sentiments_test = []



from nltk.sentiment.vader import SentimentIntensityAnalyzer
def sentiment_value(paragraph):
    analyser = SentimentIntensityAnalyzer()
    result = analyser.polarity_scores(paragraph)
    score = result['compound']
    return round(score, 4)

import nltk
nltk.download('vader_lexicon')

### Intervals
[ -1, -0.5) : 1, Very Negative

[-0.5, 0) : 2, Negative

[0] : 3, Neutral

(0, 0.5) : 4, Positive

[0.5, 1] : 5, Very Positive


In [None]:
all_reviews

In [None]:
all_reviews_test

In [None]:
for i in range(0,20368):
    all_sent_values.append(sentiment_value(all_reviews[i])) 
    
for i in range(0,1839):
    all_sent_values_test.append(sentiment_value(all_reviews_test[i])) 
    

In [None]:
len(all_sent_values)

len(all_sent_values_test)


In [None]:
# add raw score to dataframe
reviews_train['sentiment'] = all_sent_values

reviews_test['sentiment'] = all_sent_values_test

In [None]:
all_sent_values # ratings scaled from -1 to 1

In [None]:
# this changes the scale from -1 to 1 to a 1-5 scale. IGNORE FOR NOW

SENTIMENT_VALUE = []
SENTIMENT = []
for i in range(0,20368):
    sent = all_sent_values[i]
    if (sent<=1 and sent>=0.5):
        SENTIMENT.append('Very Positive')
        SENTIMENT_VALUE.append(5)
    elif (sent<0.5 and sent>0):
        SENTIMENT.append('Positive')
        SENTIMENT_VALUE.append(4)
    elif (sent==0):
        SENTIMENT.append('Neutral')
        SENTIMENT_VALUE.append(3)
    elif (sent<0 and sent>=-0.5):
        SENTIMENT.append('Negative')
        SENTIMENT_VALUE.append(2)
    else:
        SENTIMENT.append('Very Negative')
        SENTIMENT_VALUE.append(1)
        
reviews_train['SENTIMENT_VALUE'] = SENTIMENT_VALUE
reviews_train['SENTIMENT'] = SENTIMENT

In [None]:
reviews_train
#SENTIMENT_VALUE is the predicted rating --> disregard
#SENTIMENT is the verbal rating --> disregard


In [None]:
# NOTES

# for summary column, find frequency and use that to help predict the score

# calculate the frequency, then assign scores to groups of words --> then use this as a feature

# use product category as another feature

# *** use the outcome of the nlkt and utlize that as a feature along with the other given features in a logistic regression or other model

# basically use the nlkt as a model to generate a feature to be included in another model

# KNN can only be used as a continuous variable

# potential models: logistic model, decision tree, baseline logit model, Bayes classification, Bayes discriminant



### Natural word processing of the summary column data

In [None]:
all_summary = reviews_train['summary']
all_sum_values = []
all_sum_sentiments = []

for i in range(0,20368):
    all_sum_values.append(sentiment_value(all_summary[i])) 
    
    
all_summary_test = reviews_test['summary']
all_sum_values_test = []
all_sum_sentiments_test = []

for i in range(0,1839):
    all_sum_values_test.append(sentiment_value(all_summary_test[i]))
    

# add raw score to dataframe
reviews_train['sentiment_summary'] = all_sum_values

reviews_test['sentiment_summary'] = all_sum_values_test


In [None]:
# Disregard scaling from 1-5 for now

SUMMARY_SENT_VALUE = []
SUMMARY_SENT = []
for i in range(0,20368):
    sent = all_sum_values[i]
    if (sent<=1 and sent>=0.5):
        SUMMARY_SENT.append('Very Positive')
        SUMMARY_SENT_VALUE.append(5)
    elif (sent<0.5 and sent>0):
        SUMMARY_SENT.append('Positive')
        SUMMARY_SENT_VALUE.append(4)
    elif (sent==0):
        SUMMARY_SENT.append('Neutral')
        SUMMARY_SENT_VALUE.append(3)
    elif (sent<0 and sent>=-0.5):
        SUMMARY_SENT.append('Negative')
        SUMMARY_SENT_VALUE.append(2)
    else:
        SUMMARY_SENT.append('Very Negative')
        SUMMARY_SENT_VALUE.append(1)
        
reviews_train['SUMMARY_SENT_VALUE'] = SUMMARY_SENT_VALUE
reviews_train['SUMMARY_SENT'] = SUMMARY_SENT

In [None]:
# Disregard for now

SUMMARY_SENT_VALUE_2 = []
SUMMARY_SENT_2 = []

for i in range(0,20368):
    sent = reviews_train['compound'][i]
    if (sent<=1 and sent>=0.5):
        SUMMARY_SENT_2.append('Very Positive')
        SUMMARY_SENT_VALUE_2.append(5)
    elif (sent<0.5 and sent>0):
        SUMMARY_SENT_2.append('Positive')
        SUMMARY_SENT_VALUE_2.append(4)
    elif (sent==0):
        SUMMARY_SENT_2.append('Neutral')
        SUMMARY_SENT_VALUE_2.append(3)
    elif (sent<0 and sent>=-0.5):
        SUMMARY_SENT_2.append('Negative')
        SUMMARY_SENT_VALUE_2.append(2)
    else:
        SUMMARY_SENT_2.append('Very Negative')
        SUMMARY_SENT_VALUE_2.append(1)
        
reviews_train['SUMMARY_SENT_VALUE'] = SUMMARY_SENT_VALUE_2
reviews_train['SUMMARY_SENT'] = SUMMARY_SENT_2

In [None]:
# one hot encoding of product categories for training data

Electronics = []

CDVinyl = []

MovieTV = []

CellPhone= []

VideoGames = []

for i in range(0,20368):
    categories = reviews_train['categoryName'][i]
    
    if (categories=='Electronics'):
        Electronics.append(1)
    else: 
        Electronics.append(0)
    
    if (categories == 'CDVinyl'):
        CDVinyl.append(1)
    else: 
        CDVinyl.append(0)
    
    if (categories == 'MovieTV'):
        MovieTV.append(1)
    else: 
        MovieTV.append(0)
    
    if (categories == 'CellPhone'):
        CellPhone.append(1)
    else: 
        CellPhone.append(0)
    
    if (categories == 'VideoGames'):
        VideoGames.append(1)
    else: 
        VideoGames.append(0)
        
reviews_train['Electronics'] = Electronics
reviews_train['CDVinyl'] = CDVinyl
reviews_train['MovieTV'] = MovieTV
reviews_train['CellPhone'] = CellPhone
reviews_train['VideoGames'] = VideoGames


reviews_train

In [None]:
# one hot encoding of product categories for testing data

Electronics_ = []

CDVinyl_ = []

MovieTV_ = []

CellPhone_ = []

VideoGames_ = []

for i in range(0,1839):
    categories_test = reviews_test['categoryName'][i]
    
    if (categories_test=='Electronics'):
        Electronics_.append(1)
    else: 
        Electronics_.append(0)
    
    if (categories_test == 'CDVinyl'):
        CDVinyl_.append(1)
    else: 
        CDVinyl_.append(0)
    
    if (categories_test == 'MovieTV'):
        MovieTV_.append(1)
    else: 
        MovieTV_.append(0)

    if (categories_test == 'CellPhone'):
        CellPhone_.append(1)
    else: 
        CellPhone_.append(0)
        
    if (categories_test == 'VideoGames'):
        VideoGames_.append(1)
    else:
        VideoGames_.append(0)
        
reviews_test['Electronics'] = Electronics_
reviews_test['CDVinyl'] = CDVinyl_
reviews_test['MovieTV'] = MovieTV_
reviews_test['CellPhone'] = CellPhone_
reviews_test['VideoGames'] = VideoGames_


reviews_test


In [None]:
reviews_train.info()

## TRAINING AND FITTING MODELS

In [None]:
reviews_train

# variables: 'sentiment', 'sentiment_summary','neg', 'neu', 'pos', 'compound', 
# Electronics', 'CDVinyl', 'MovieTV', 'CellPhone', 'VideoGames', 

train_y = reviews_train['overall']

#train_x = reviews_train[['sentiment', 'sentiment_summary','neg', 'neu', 'pos']] # 0.52739 and 0.52719

#train_x = reviews_train[['sentiment', 'sentiment_summary','neg', 'neu', 'pos', 'compound']] # 0.52744 and 0.52719

train_x = reviews_train[['sentiment', 'sentiment_summary','neg', 'neu', 'pos', 'compound', 'Electronics', 'CDVinyl', 'MovieTV', 'CellPhone', 'VideoGames']] 
# 0.52867 and 0.52940

# train_x = reviews_train[['sentiment', 'sentiment_summary', 'compound', 'Electronics', 'CDVinyl', 'MovieTV', 'CellPhone', 'VideoGames']] # 0.52867 and 0.52940
# 0.52935 and 0.52911


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y,
                                                    test_size = 0.3, random_state = 0)

# I've split up the overall training data set into a training and testing portion
# so I am able to compare results and accuracy


In [None]:
#test_x = reviews_test[['sentiment', 'sentiment_summary','neg', 'neu', 'pos', 'compound', 'Electronics', 'CDVinyl', 'MovieTV', 'CellPhone', 'VideoGames']]

### Logistic regression

In [None]:
# some packages and their uses

import pandas as pd 
import numpy as np
from sklearn import linear_model
from sklearn import metrics


In [None]:
# Train multi-classification model with logistic regression
lr = linear_model.LogisticRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

results_lr = pd.DataFrame({'Actual':y_test,'Predicted': y_pred_lr}) #prints out predicted ratings

results_lr


In [None]:
# Train multinomial logistic regression model
mul_lr = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)

y_pred_mul_lr = mul_lr.predict(X_test)

results_mul_lr = pd.DataFrame({'Actual':y_test,'Predicted': y_pred_mul_lr}) #prints out predicted ratings

results_mul_lr

In [None]:
metrics.accuracy_score(y_test, lr.predict(X_test))

In [None]:
metrics.accuracy_score(y_test, mul_lr.predict(X_test))

### Decision Tree

In [None]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)


metrics.accuracy_score(y_test, clf.predict(X_test))

y_pred_clf = clf.predict(X_test)

results_clf = pd.DataFrame({'Actual':y_test,'Predicted': y_pred_clf}) #prints out predicted ratings

results_clf

In [None]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_clf))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_clf))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_clf)))  

In [None]:
from sklearn.tree import DecisionTreeRegressor  

regressor = DecisionTreeRegressor()  
regressor.fit(X_train, y_train) 

y_pred = regressor.predict(X_test)

results_tree_regr = pd.DataFrame({'Actual':y_test,'Predicted': y_pred}) #prints out predicted ratings

results_tree_regr


In [None]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))  

### Decision tree with Gini Index

In [None]:
from sklearn.metrics import confusion_matrix 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

clf_gini = DecisionTreeClassifier(criterion = "gini", 
            random_state = 100,max_depth=3, min_samples_leaf=5) 

clf_gini.fit(X_train, y_train) 

y_pred_gini = regressor.predict(X_test)

results_gini = pd.DataFrame({'Actual':y_test,'Predicted': y_pred_gini}) #prints out predicted ratings

results_gini

In [None]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_gini))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_gini))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_gini)))  

### Decision tree with enthropy

In [None]:
clf_entropy = DecisionTreeClassifier( 
            criterion = "entropy", random_state = 100, 
            max_depth = 3, min_samples_leaf = 5) 
  
     
clf_entropy.fit(X_train, y_train) 

y_pred_enthropy = regressor.predict(X_test)

results_enthropy = pd.DataFrame({'Actual':y_test,'Predicted': y_pred_enthropy}) #prints out predicted ratings

results_enthropy


In [None]:
from sklearn import metrics  
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_enthropy))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_enthropy))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_enthropy))) 


In [None]:
#Naive Bayes on different vectors
#Notes: pick which model you want to use since we did TF IDF, Ngram and other vectors

# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_count, y_train, X_test_count )
print "NB, Count Vectors: ", accuracy

# Naive Bayes on Word Level TF IDF Vectors already implemented
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf)
print "NB, WordLevel TF-IDF: ", accuracy

# Naive Bayes on Ngram Level TF IDF Vectors already implented
accuracy = train_model(naive_bayes.MultinomialNB(),  X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print "NB, N-Gram Vectors: ", accuracy

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(),  X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars')
print "NB, CharLevel Vectors: ", accuracy

In [None]:
#SVM  Model
#see variables

# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(),  xtrain_tfidf_ngram, y_train, X_test_tfidf_ngram)
print "SVM, N-Gram Vectors: ", accuracy

In [None]:
#Bagging Model
#see variables please

# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(),  X_train, y_train, X_test)
print "RF, Count Vectors: ", accuracy

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(),  X_train_tfidf, y_train, X_test_tfidf)
print "RF, WordLevel TF-IDF: ", accuracy

In [None]:
# Shallow Nueral Network
#see variables  please

def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

classifier = create_model_architecture('xtrain_tfidf_ngram.shape[1]')
accuracy = train_model(classifier, X_train_tfidf_ngram, y_train, X_test_tfidf_ngram, is_neural_net=True)
print "NN, Ngram Level TF IDF Vectors",  accuracy