In [1]:
from nltk.corpus import stopwords
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

In [582]:
df = pd.read_table('amazon_alexa.tsv', sep = '\t')
df

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


Lexicon Based method using VADER

In [583]:
lexicon = df.iloc[:, 3:5]
lexicon

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1
...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",1
3146,"Listening to music, searching locations, check...",1
3147,"I do love these things, i have them running my...",1
3148,Only complaint I have is that the sound qualit...,1


Make the strings all lowercase

In [584]:
lexicon['verified_reviews'] = lexicon['verified_reviews'].str.lower()

Removing punctuations except !

In [585]:
lexicon['verified_reviews'] = lexicon['verified_reviews'].str.replace(r'[^a-zA-Z0-9! ]+', '', regex=True)


Removing Stop Words

In [586]:
stop = stopwords.words('english')

In [587]:
lexicon['verified_reviews'] = lexicon['verified_reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

Lemmatization

In [588]:
lemmatizer = WordNetLemmatizer()

In [589]:
def lemmatizer_text(text):
    tokens = word_tokenize(text)

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(lemmatized_tokens)

In [590]:
lexicon['verified_reviews'] = lexicon['verified_reviews'].apply(lemmatizer_text)
lexicon

Unnamed: 0,verified_reviews,feedback
0,love echo !,1
1,loved it !,1
2,sometimes playing game answer question correct...,1
3,lot fun thing 4 yr old learns dinosaur control...,1
4,music,1
...,...,...
3145,perfect kid adult everyone between ! !,1
3146,listening music searching location checking ti...,1
3147,love thing running entire home tv light thermo...,1
3148,complaint sound quality isnt great mostly use ...,1


Applying VADER

In [591]:
sia = SentimentIntensityAnalyzer()

In [592]:
def compound_sentiment_score(text):
    text = sia.polarity_scores(text)
    return text['compound']

In [593]:
lexicon['compound_sentiment_score'] = lexicon['verified_reviews'].apply(compound_sentiment_score)
lexicon

Unnamed: 0,verified_reviews,feedback,compound_sentiment_score
0,love echo !,1,0.6696
1,loved it !,1,0.6360
2,sometimes playing game answer question correct...,1,0.0516
3,lot fun thing 4 yr old learns dinosaur control...,1,0.9169
4,music,1,0.0000
...,...,...,...
3145,perfect kid adult everyone between ! !,1,0.6467
3146,listening music searching location checking ti...,1,0.0000
3147,love thing running entire home tv light thermo...,1,0.9648
3148,complaint sound quality isnt great mostly use ...,1,0.5432


In [594]:
threshold = 0
lexicon['predicted_feedback'] = lexicon['compound_sentiment_score'].apply(lambda x: 1 if x >= threshold else 0)
lexicon

Unnamed: 0,verified_reviews,feedback,compound_sentiment_score,predicted_feedback
0,love echo !,1,0.6696,1
1,loved it !,1,0.6360,1
2,sometimes playing game answer question correct...,1,0.0516,1
3,lot fun thing 4 yr old learns dinosaur control...,1,0.9169,1
4,music,1,0.0000,1
...,...,...,...,...
3145,perfect kid adult everyone between ! !,1,0.6467,1
3146,listening music searching location checking ti...,1,0.0000,1
3147,love thing running entire home tv light thermo...,1,0.9648,1
3148,complaint sound quality isnt great mostly use ...,1,0.5432,1


In [595]:
accuracy = accuracy_score(lexicon['feedback'], lexicon['predicted_feedback'])
accuracy

0.912063492063492

Combining Lexicon method with machine learning

In [596]:
hybrid = lexicon[['verified_reviews', 'compound_sentiment_score', 'feedback']].copy()

In [597]:
hybrid

Unnamed: 0,verified_reviews,compound_sentiment_score,feedback
0,love echo !,0.6696,1
1,loved it !,0.6360,1
2,sometimes playing game answer question correct...,0.0516,1
3,lot fun thing 4 yr old learns dinosaur control...,0.9169,1
4,music,0.0000,1
...,...,...,...
3145,perfect kid adult everyone between ! !,0.6467,1
3146,listening music searching location checking ti...,0.0000,1
3147,love thing running entire home tv light thermo...,0.9648,1
3148,complaint sound quality isnt great mostly use ...,0.5432,1


Splitting the data

In [598]:
x = hybrid['verified_reviews']
y = hybrid['feedback']

In [599]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 100)
x_train

930                                             liked 100
1866                                     great new friend
1235                                 easy set use thought
2759    love it ! personally prefer spotify music grea...
2563                                   everything big guy
                              ...                        
1930             great device grandkids ball setting me !
79                                        love everything
1859                    great product fast shipping thank
2840                        small size work great sunroom
1544    love it ! easy setup drop daughter soninlaw li...
Name: verified_reviews, Length: 2520, dtype: object

Vectorize the reviews which would turn the text into numerical values and combine it with compound_sentiment_score 

In [600]:
vectorizer = TfidfVectorizer(min_df = 1)

In [601]:
x_train_vectorizer = vectorizer.fit_transform(x_train)
x_test_vectorizer = vectorizer.transform(x_test)
print(x_train_vectorizer)

  (0, 2)	0.7267654862396732
  (0, 1905)	0.6868856731733536
  (1, 1354)	0.7488881912086198
  (1, 2157)	0.5612286082053525
  (1, 1461)	0.3524044925936168
  (2, 3285)	0.6639410805772201
  (2, 3469)	0.40615451747966813
  (2, 2861)	0.4521297922566836
  (2, 1072)	0.4356597300723805
  (3, 1518)	0.20487687272234972
  (3, 2120)	0.1574111842703991
  (3, 384)	0.1559788838280952
  (3, 3543)	0.19725585669942855
  (3, 3091)	0.15209519578208577
  (3, 1029)	0.1574111842703991
  (3, 1939)	0.16815240307243523
  (3, 409)	0.20487687272234972
  (3, 1020)	0.11856629483728709
  (3, 566)	0.20487687272234972
  (3, 1960)	0.11789559812452666
  (3, 1529)	0.19725585669942855
  (3, 3621)	0.1913445374243557
  (3, 3628)	0.07566018616753512
  (3, 1790)	0.1913445374243557
  (3, 659)	0.19725585669942855
  :	:
  (2516, 1965)	0.4698325336136934
  (2517, 2880)	0.6079346248975462
  (2517, 3255)	0.5084049450538952
  (2517, 1265)	0.4813217565837088
  (2517, 2502)	0.30260381961936506
  (2517, 1461)	0.2206812150736201
  (2518, 

In [602]:
train_scores = hybrid.loc[x_train.index, 'compound_sentiment_score']
test_scores = hybrid.loc[x_test.index, 'compound_sentiment_score']

train_scores_array = np.array(train_scores).reshape(-1, 1)
test_scores_array = np.array(test_scores).reshape(-1, 1)

x_train_combined = hstack([x_train_vectorizer, train_scores_array])
x_test_combined = hstack([x_test_vectorizer, test_scores_array])

In [603]:
model = LogisticRegression()
model.fit(x_train_combined, y_train)

LogisticRegression()

Accuracy on the Lexicon Based Approach with Machine Learning

In [604]:
y_pred = model.predict(x_test_combined)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9301587301587302

Logistic Regression model from the original dataframe

In [605]:
x = df['verified_reviews']
y = df['feedback']

In [606]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)
x_train

930                                            Liked 100%
1866                         Is great having a new friend
1235    It was easy set up.  I use it more than I thou...
2759    Love it! I personally prefer Spotify music, so...
2563                 It does everything  the big guy does
                              ...                        
1930    Great device, my grandkids had a ball setting ...
79                          I love it. It does everything
1859           Great product and fast shipping. Thank you
2840               Small size works great in our sunroom.
1544    Love it! Easy setup and I can ‘Drop in’ on my ...
Name: verified_reviews, Length: 2520, dtype: object

In [607]:
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

In [608]:
model = LogisticRegression()
model.fit(x_train_vectorized, y_train)

LogisticRegression()

In [609]:
y_pred = model.predict(x_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9158730158730158