# Sentiment Analysis using Term frequency-inverse document frequency 

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

#### loading data: 

In [28]:
df=pd.read_csv('c:/1Python/textdata/Amazon_Unlocked_Mobile.csv')
len(df)

413840

In [29]:
#df=a.sample(frac=0.1, random_state=50)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


## Preprocessing 

#### Removing missing values and neutral ratings  

In [30]:
df.dropna(inplace=True)

In [31]:
df=df[df['Rating']!=3]
len(df)

308277

Positive vs. negative ratings

In [32]:
df['PosOrNeg']=np.where(df['Rating']<3,0,1)
df[['Rating', 'PosOrNeg']].head()

Unnamed: 0,Rating,PosOrNeg
0,5,1
1,4,1
2,5,1
3,4,1
4,4,1


In [33]:
print('Positive rated: ', '{:.2f}'.format(df['PosOrNeg'].mean()*100,), ' %', sep='')

Positive rated: 74.83 %


In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(df['Reviews'], df['PosOrNeg'])

In [35]:
X_train.shape

(231207,)

#### Count Vectorizer 

In [36]:
vect=TfidfVectorizer(min_df=5).fit(X_train)

In [37]:
len(vect.get_feature_names())

18031

In [39]:
vect.get_feature_names()[::2000]

['00',
 'aviso',
 'contenido',
 'errors',
 'hi',
 'mariana',
 'plastics',
 'saves',
 'thatbut',
 'zf2']

making a matrix (scipy sparse row format)

In [40]:
X_vect=vect.transform(X_train)

In [41]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_vect, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
from sklearn.metrics import roc_auc_score

test_vect=vect.transform(X_test)
predictions=lr.predict(test_vect)
print('AUC score (test set): ', roc_auc_score(predictions, y_test), sep='')
print('Accuracy (test_set): {:.2f}'.format((lr.score(test_vect, y_test))*100), ' %', sep='')

AUC score (test set): 0.937095574031449
Accuracy (test_set): 94.93 %


##### What are the most popular negative and positive words:

In [48]:
words=np.array(vect.get_feature_names())

In [49]:
coef=lr.coef_[0].argsort()
print ('negative words: \n', words[coef[:10]])
print('\npositive words:\n', words[coef[-10:]])

negative words: 
 ['not' 'worst' 'useless' 'disappointed' 'waste' 'poor' 'horrible'
 'terrible' 'return' 'returning']

positive words:
 ['perfectly' 'best' 'loves' 'easy' 'awesome' 'perfect' 'amazing'
 'excellent' 'great' 'love']
