# Sentiment Analysis (the simple one)

In [4]:
import pandas as pd
import numpy as np

#### loading data: 

In [5]:
a=pd.read_csv('c:/1Python/textdata/Amazon_Unlocked_Mobile.csv')
len(a)

413840

In [6]:
df=a.sample(frac=0.1, random_state=10)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


## Preprocessing 

#### Removing missing values and neutral ratings  

In [7]:
df.dropna(inplace=True)

In [8]:
df=df[df['Rating']!=3]
len(df)

30737

Positive vs. negative ratings

In [9]:
df['PosOrNeg']=np.where(df['Rating']<3,0,1)
df[['Rating', 'PosOrNeg']].head()

Unnamed: 0,Rating,PosOrNeg
34377,1,0
248521,5,1
167661,1,0
73287,5,1
277158,5,1


In [10]:
print('Positive rated: ', '{:.2f}'.format(df['PosOrNeg'].mean()*100,), ' %', sep='')

Positive rated: 74.72 %


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(df['Reviews'], df['PosOrNeg'])

In [12]:
X_train.shape

(23052,)

#### Count Vectorizer 

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

In [14]:
len(vect.get_feature_names())

28939

In [15]:
vect.get_feature_names()[::5000]

['00', 'carefully', 'guarantee', 'music and', 'replaced it', 'to data']

making a matrix (scipy sparse row format)

In [16]:
X_vect=vect.transform(X_train)

In [17]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_vect, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
from sklearn.metrics import roc_auc_score

test_vect=vect.transform(X_test)
predictions=lr.predict(test_vect)
print('AUC score: ', roc_auc_score(predictions, y_test), sep='')
print('Accuracy: ', lr.score(test_vect, y_test))

AUC score: 0.927275166411344
Accuracy:  0.943786597267404


##### What are the most popular negative and positive words:

In [19]:
words=np.array(vect.get_feature_names())

In [20]:
coef=lr.coef_[0].argsort()
print ('negative words: \n', words[coef[:10]])
print('\npositive words:\n', words[coef[-10:]])

negative words: 
 ['no good' 'junk' 'poor' 'broken' 'slow' 'terrible' 'not good' 'worst'
 'sucks' 'garbage']

positive words:
 ['good' 'not bad' 'awesome' 'no problems' 'love' 'great' 'perfect'
 'excelent' 'excellent' 'excelente']
