In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

In [3]:
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') in the
# following cells reads from it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/z4hid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Show the English stop-word list that stemming() later filters out of each review.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Data Processing

In [5]:
# Load the review data; both paths are relative to the notebook's working directory.
# NOTE(review): in the cells visible here only test_data's shape is inspected --
# the model is evaluated on a hold-out split of train_data instead.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [6]:
train_data.shape

(2520, 5)

In [7]:
test_data.shape

(630, 5)

In [8]:
train_data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,30-Jul-18,Configuration: Fire TV Stick,Just the perfect cost effective solution. Tha...,1
1,5,30-Jul-18,Configuration: Fire TV Stick,I purchased the Fire Stick to be able to watch...,1
2,2,30-Jul-18,Configuration: Fire TV Stick,"""I’m very unhappy with this Firestick, every t...",0
3,4,30-Jul-18,White Dot,"""Handy if you don't expect much out of it much...",1
4,4,29-Jul-18,Charcoal Fabric,"""Good quality, great sound quality""",1


In [9]:
train_data.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    1
feedback            0
dtype: int64

In [10]:
# Remove every row that has a missing value in any column (the null counts
# above show one missing 'verified_reviews'). This mutates train_data in place.
train_data.dropna(inplace=True)

In [11]:
train_data.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    0
feedback            0
dtype: int64

In [12]:
# Check the class balance of the target. In the recorded run label 1 outnumbers
# label 0 by roughly 12 to 1, so plain accuracy will look high even for a model
# that always predicts 1.
train_data['feedback'].value_counts()

feedback
1    2321
0     198
Name: count, dtype: int64

## Stemming

Stemming is the process of reducing a word to its root form (its stem) by stripping suffixes, so that inflected variants of a word map to the same token.

Example: acting, acted, acts ==> act (the Porter stemmer is rule-based, so related nouns such as actor and actress are left unchanged).

In [13]:
# One stemmer instance, created once and reused by stemming() for every word.
port_stem = PorterStemmer()

In [14]:
# Compiled once; matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')
# Filled on first use so the stop-word corpus is read once, not once per word.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def stemming(text):
    """Normalise a review into a space-joined string of stemmed tokens.

    Non-letter characters become spaces, the text is lower-cased and split on
    whitespace, English stop words are dropped, and each remaining word is
    passed through the shared ``port_stem`` stemmer.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    stop_words = _english_stop_words()
    tokens = _NON_LETTERS.sub(' ', text).lower().split()
    # Membership is tested against a set, so each lookup is constant time.
    return ' '.join(port_stem.stem(token) for token in tokens if token not in stop_words)

In [15]:
# Add a 'stemmed_content' column holding the cleaned, stemmed form of each review.
train_data['stemmed_content'] = train_data['verified_reviews'].apply(stemming)

In [16]:
train_data

Unnamed: 0,rating,date,variation,verified_reviews,feedback,stemmed_content
0,5,30-Jul-18,Configuration: Fire TV Stick,Just the perfect cost effective solution. Tha...,1,perfect cost effect solut thank amazon
1,5,30-Jul-18,Configuration: Fire TV Stick,I purchased the Fire Stick to be able to watch...,1,purchas fire stick abl watch movi conveni love...
2,2,30-Jul-18,Configuration: Fire TV Stick,"""I’m very unhappy with this Firestick, every t...",0,unhappi firestick everi time chang program cha...
3,4,30-Jul-18,White Dot,"""Handy if you don't expect much out of it much...",1,handi expect much much dumber assist phone
4,4,29-Jul-18,Charcoal Fabric,"""Good quality, great sound quality""",1,good qualiti great sound qualiti
...,...,...,...,...,...,...
2515,5,30-Jul-18,Black Dot,"""I love this technology. I'm older, but this i...",1,love technolog older easi set use dot bedroom ...
2516,5,30-Jul-18,White Spot,Love my Echo Spot!,1,love echo spot
2517,5,29-Jul-18,Black Spot,Hands free control,1,hand free control
2518,5,23-Jul-18,Black Spot,It has replaced my clock and Google Mini on my...,1,replac clock googl mini nightstand outstand li...


In [17]:
print(train_data['stemmed_content'])

0                  perfect cost effect solut thank amazon
1       purchas fire stick abl watch movi conveni love...
2       unhappi firestick everi time chang program cha...
3              handi expect much much dumber assist phone
4                        good qualiti great sound qualiti
                              ...                        
2515    love technolog older easi set use dot bedroom ...
2516                                       love echo spot
2517                                    hand free control
2518    replac clock googl mini nightstand outstand li...
2519                                             easi set
Name: stemmed_content, Length: 2519, dtype: object


In [18]:
train_data['feedback']

0       1
1       1
2       0
3       1
4       1
       ..
2515    1
2516    1
2517    1
2518    1
2519    1
Name: feedback, Length: 2519, dtype: int64

In [35]:
train_data['stemmed_content'].values

array(['perfect cost effect solut thank amazon',
       'purchas fire stick abl watch movi conveni love varieti genr also enjoy give abil watch tv show awesom',
       'unhappi firestick everi time chang program channel within program long delay roku work fine quick',
       ..., 'hand free control',
       'replac clock googl mini nightstand outstand littl devic hope updat give simpl face font color option clock',
       'easi set'], dtype=object)

In [36]:
train_data['stemmed_content']

0                  perfect cost effect solut thank amazon
1       purchas fire stick abl watch movi conveni love...
2       unhappi firestick everi time chang program cha...
3              handi expect much much dumber assist phone
4                        good qualiti great sound qualiti
                              ...                        
2515    love technolog older easi set use dot bedroom ...
2516                                       love echo spot
2517                                    hand free control
2518    replac clock googl mini nightstand outstand li...
2519                                             easi set
Name: stemmed_content, Length: 2519, dtype: object

In [19]:
# Model inputs as NumPy arrays: stemmed review text is the feature,
# the 'feedback' flag is the target.
X = train_data['stemmed_content'].to_numpy()
Y = train_data['feedback'].to_numpy()

In [20]:
X

array(['perfect cost effect solut thank amazon',
       'purchas fire stick abl watch movi conveni love varieti genr also enjoy give abil watch tv show awesom',
       'unhappi firestick everi time chang program channel within program long delay roku work fine quick',
       ..., 'hand free control',
       'replac clock googl mini nightstand outstand littl devic hope updat give simpl face font color option clock',
       'easi set'], dtype=object)

In [21]:
Y

array([1, 1, 0, ..., 1, 1, 1])

## Splitting Data

In [22]:
# Hold out 20% of the rows for evaluation. Stratifying on Y keeps the label
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [24]:
X_train.shape

(2015,)

In [25]:
X_test.shape

(504,)

In [26]:
# Convert text to numerical data: the TF-IDF vocabulary and weights are learned
# from the training split only and then reused for the test split, so no
# test-set statistics reach the model.
# NOTE(review): X_train / X_test are rebound from raw strings to the vectorised
# matrices here. Re-running this cell on its own would pass matrices, not text,
# to the vectorizer -- re-run the train_test_split cell first.

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [29]:
print(X_train) # X_train

  (0, 685)	0.4026296860749002
  (0, 690)	0.6690970970445966
  (0, 645)	0.2431730572761003
  (0, 738)	0.5753827203111989
  (1, 645)	0.30422558139378236
  (1, 1370)	0.5104370005889041
  (1, 2239)	0.70048055252814
  (1, 613)	0.39525670087335607
  (2, 1148)	0.39670750291110257
  (2, 966)	0.7154174620331645
  (2, 874)	0.3057643397572471
  (2, 1219)	0.4871439014140258
  (3, 914)	1.0
  (4, 645)	0.3287289945532186
  (4, 1228)	0.5527677926826177
  (4, 1526)	0.4449066648208739
  (4, 1335)	0.36133361236585587
  (4, 1747)	0.5078199441587781
  (5, 914)	0.17333658405747393
  (5, 2110)	0.23973994987263683
  (5, 895)	0.2359783423641524
  (5, 1941)	0.20861654946089714
  (5, 716)	0.284399632143079
  (5, 736)	0.3105231795792092
  (5, 1201)	0.3637302412373467
  :	:
  (2012, 760)	0.24654008991748683
  (2012, 141)	0.2777219123084682
  (2012, 1882)	0.18551823927549013
  (2012, 1495)	0.1911554240821941
  (2012, 1225)	0.1729566039558981
  (2012, 28)	0.2002450260546203
  (2012, 888)	0.16766479798256106
  (2012,

In [30]:
print(X_test) # X_test

  (0, 576)	0.5594578249956028
  (0, 914)	0.23136982393691874
  (0, 1111)	0.4345864185622294
  (0, 1149)	0.6140559193139729
  (0, 1335)	0.2598940541438279
  (1, 516)	0.667861690801501
  (1, 914)	0.6165969888757298
  (1, 1590)	0.4168559886449317
  (2, 2)	0.35834644684574535
  (2, 28)	0.42657184511836654
  (2, 65)	0.2352762051018016
  (2, 99)	0.38151635302281695
  (2, 1747)	0.6643067175569093
  (2, 2249)	0.2177412447910843
  (3, 642)	0.8593457381002069
  (3, 2110)	0.5113950551276486
  (4, 6)	0.7215376634547428
  (4, 914)	0.2647861909276092
  (4, 1590)	0.3580222135041918
  (4, 2215)	0.5301808822919561
  (5, 864)	0.7381730988456666
  (5, 878)	0.619945465311366
  (5, 1228)	0.26602273620963274
  (6, 30)	0.1694146562928196
  (6, 110)	0.17048812522101578
  :	:
  (497, 1631)	0.4125561684432945
  (497, 1710)	0.4125561684432945
  (498, 65)	0.21862733600752582
  (498, 641)	0.2516058376074602
  (498, 962)	0.4600380093142591
  (498, 1659)	0.4072785449251193
  (498, 1825)	0.36898440475593286
  (498, 2

# Train Model

In [31]:
# Logistic-regression classifier with the solver's iteration cap set to 1000.
# NOTE(review): label 0 is only about 8% of the rows (see the value_counts
# output); consider class_weight='balanced' if recall on label 0 matters.
model = LogisticRegression(max_iter=1000)

In [32]:
# Fit on the vectorised training split only; the test split stays unseen.
model.fit(X_train, Y_train)

# Evaluate

In [33]:
# Accuracy on the rows the model was trained on. accuracy_score expects
# (y_true, y_pred), so the true labels go first.
# NOTE(review): the labels are heavily skewed towards 1 (see the value_counts
# output), so compare this figure with the majority-label share before
# reading it as good performance.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  0.9235732009925558


In [34]:
# Accuracy on the held-out split. accuracy_score expects (y_true, y_pred),
# so the true labels go first.
# NOTE(review): the labels are heavily skewed towards 1 (see the value_counts
# output), so compare this figure with the majority-label share before
# reading it as good performance.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of test data :  0.9246031746031746
