In [2]:
import pandas as pd
import numpy as np
import string
import sklearn
import datetime as dt
import matplotlib.pyplot as plt
import itertools

In [3]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [5]:
data = pd.read_excel('Ph_data.xlsx')
data.head()

Unnamed: 0,Phone Name,Customer Name,Review Title,Rating,Review
0,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Rupa Ray,Best Budget Phone In India,5.0 out of 5 stars,"Brilliant camera, huge battery life and bril..."
1,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Cv Vineeth,Value For Money !! Go for it,4.0 out of 5 stars,Review after 3 Day..1. Battery :- 5/5Two day...
2,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Rakesh s.,❤️❤️❤️❤️,5.0 out of 5 stars,As a title it is obviously a monsterAnd very...
3,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Ganapam Venkateswara Reddy,"More than 5star,best budget mobile in India.",5.0 out of 5 stars,"I love this one, and this is best in class, ..."
4,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Ravi Kumar,Battery life is not upto mark,3.0 out of 5 stars,I am writing my review after using it for 6 ...


In [6]:
len(data) - len(data.dropna())


0

In [7]:
data = data.dropna()


In [8]:
data['Rating'] = [string[:1] for string in data['Rating']]
data["Rating"] = pd.to_numeric(data["Rating"])
data


Unnamed: 0,Phone Name,Customer Name,Review Title,Rating,Review
0,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Rupa Ray,Best Budget Phone In India,5,"Brilliant camera, huge battery life and bril..."
1,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Cv Vineeth,Value For Money !! Go for it,4,Review after 3 Day..1. Battery :- 5/5Two day...
2,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Rakesh s.,❤️❤️❤️❤️,5,As a title it is obviously a monsterAnd very...
3,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Ganapam Venkateswara Reddy,"More than 5star,best budget mobile in India.",5,"I love this one, and this is best in class, ..."
4,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Ravi Kumar,Battery life is not upto mark,3,I am writing my review after using it for 6 ...
...,...,...,...,...,...
284,Apple iPhone 11 (64GB) - Purple,Amazon Customer,Worst Experience Ever.!,1,My Phone is Producing Too Much Heat Even Did...
285,Apple iPhone 11 (64GB) - Purple,Satyapal singh,iPhone 11,1,"Defective product,got heat up within 5 minut..."
286,Apple iPhone 11 (64GB) - Purple,Gurmeet singh,Defective Iphone 11,1,The product i got was defective . The face i...
287,Apple iPhone 11 (64GB) - Purple,Ghar Sayan,Best iPhone ever | Super Night Mode | Super Ca...,5,"In my opinion, don’t think much about the p..."


In [9]:
reviews = data['Review']

In [10]:
for i in range(0,len(data)-1):
    if type(data.iloc[i]['Review']) != str:
        data.iloc[i]['Review'] = str(data.iloc[i]['Review'])

In [11]:
stops = stopwords.words('english')

def tokenize(text):
    tokenized = word_tokenize(text)
    no_punc = []
    for review in tokenized:
        line = "".join(char for char in review if char not in string.punctuation)
        no_punc.append(line)
    tokens = lemmatize(no_punc)
    return tokens


def lemmatize(tokens):
    lmtzr = WordNetLemmatizer()
    lemma = [lmtzr.lemmatize(t) for t in tokens]
    return lemma

In [12]:
reviews = reviews.apply(lambda x: tokenize(x))


In [13]:
reviews

0      [Brilliant, camera, , huge, battery, life, and...
1      [Review, after, 3, Day, , 1, , Battery, , , 55...
2      [As, a, title, it, is, obviously, a, monsterAn...
3      [I, love, this, one, , and, this, is, best, in...
4      [I, am, writing, my, review, after, using, it,...
                             ...                        
284    [My, Phone, is, Producing, Too, Much, Heat, Ev...
285    [Defective, product, , got, heat, up, within, ...
286    [The, product, i, got, wa, defective, , The, f...
287    [In, my, opinion, , don, ’, t, think, much, ab...
288    [Bought, the, mobile, from, appario, retail, l...
Name: Review, Length: 289, dtype: object

In [14]:
def transform_sentiment(x):
    if x <= 2:
        return 'Negative'
    else:
        return 'Positive'

data['Sentiment'] = data['Rating'].apply(lambda x: transform_sentiment(x))

In [15]:
data['Phone Name'].nunique()

39

In [16]:
data

Unnamed: 0,Phone Name,Customer Name,Review Title,Rating,Review,Sentiment
0,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Rupa Ray,Best Budget Phone In India,5,"Brilliant camera, huge battery life and bril...",Positive
1,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Cv Vineeth,Value For Money !! Go for it,4,Review after 3 Day..1. Battery :- 5/5Two day...,Positive
2,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Rakesh s.,❤️❤️❤️❤️,5,As a title it is obviously a monsterAnd very...,Positive
3,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Ganapam Venkateswara Reddy,"More than 5star,best budget mobile in India.",5,"I love this one, and this is best in class, ...",Positive
4,"Samsung Galaxy M21 (Midnight Blue, 4GB RAM, 64...",Ravi Kumar,Battery life is not upto mark,3,I am writing my review after using it for 6 ...,Positive
...,...,...,...,...,...,...
284,Apple iPhone 11 (64GB) - Purple,Amazon Customer,Worst Experience Ever.!,1,My Phone is Producing Too Much Heat Even Did...,Negative
285,Apple iPhone 11 (64GB) - Purple,Satyapal singh,iPhone 11,1,"Defective product,got heat up within 5 minut...",Negative
286,Apple iPhone 11 (64GB) - Purple,Gurmeet singh,Defective Iphone 11,1,The product i got was defective . The face i...,Negative
287,Apple iPhone 11 (64GB) - Purple,Ghar Sayan,Best iPhone ever | Super Night Mode | Super Ca...,5,"In my opinion, don’t think much about the p...",Positive


In [20]:
x_train, x_test, y_train, y_test = train_test_split(data.Review, data.Sentiment, random_state=0)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

x_train shape: (216,)
y_train shape: (216,)
x_test shape: (73,)
y_test shape: (73,)


In [23]:

vectorizer = CountVectorizer(min_df=5).fit(x_train)
X_train = vectorizer.transform(x_train)
feature_names = vectorizer.get_feature_names()
scores = cross_val_score(LogisticRegression(max_iter = 1000), X_train, y_train, cv=5)

In [33]:
logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
X_test = vectorizer.transform(x_test)
log_y_pred = logreg.predict(X_test)

In [34]:
log_y_pred

array(['Positive', 'Negative', 'Negative', 'Positive', 'Negative',
       'Positive', 'Negative', 'Negative', 'Positive', 'Positive',
       'Negative', 'Positive', 'Positive', 'Negative', 'Negative',
       'Negative', 'Positive', 'Negative', 'Positive', 'Negative',
       'Positive', 'Negative', 'Positive', 'Positive', 'Positive',
       'Positive', 'Negative', 'Negative', 'Negative', 'Negative',
       'Positive', 'Positive', 'Positive', 'Negative', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Negative',
       'Positive', 'Positive', 'Positive', 'Positive', 'Negative',
       'Negative', 'Positive', 'Negative', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Negative', 'Positive',
       'Positive', 'Negative', 'Negative', 'Negative', 'Positive',
       'Positive', 'Positive', 'Positive', 'Negative', 'Negative',
       'Positive', 'Negative', 'Positive', 'Positive', 'Positive',
       'Negative', 'Positive', 'Positive'], dtype=object)

In [35]:
logreg_score = accuracy_score(y_test, log_y_pred)
print("Accuracy:", logreg_score)


Accuracy: 0.9315068493150684


In [36]:
print("Training set score:", logreg.score(X_train, y_train))
print("Test set score:", logreg.score(X_test, y_test))


Training set score: 0.9583333333333334
Test set score: 0.9315068493150684


In [37]:
log_cfm = confusion_matrix(y_test, log_y_pred)
print("Confusion matrix:")
print(log_cfm, end='\n\n')

print(np.array([['TN', 'FP'],[ 'FN' , 'TP']]))

Confusion matrix:
[[26  2]
 [ 3 42]]

[['TN' 'FP']
 ['FN' 'TP']]


In [38]:
print("Training set score: ", logreg.score(X_train, y_train))
print("Test set score:", logreg.score(X_test, y_test))


Training set score:  0.9583333333333334
Test set score: 0.9315068493150684


In [39]:
log_y_pred_prob = logreg.predict_proba(X_test)

In [40]:
log_y_pred_prob

array([[0.15711265, 0.84288735],
       [0.88326959, 0.11673041],
       [0.54538865, 0.45461135],
       [0.25545002, 0.74454998],
       [0.74941717, 0.25058283],
       [0.126427  , 0.873573  ],
       [0.69185633, 0.30814367],
       [0.66441502, 0.33558498],
       [0.22868782, 0.77131218],
       [0.19141601, 0.80858399],
       [0.66441502, 0.33558498],
       [0.16088774, 0.83911226],
       [0.05323196, 0.94676804],
       [0.78611539, 0.21388461],
       [0.66761704, 0.33238296],
       [0.74941717, 0.25058283],
       [0.16403899, 0.83596101],
       [0.98596644, 0.01403356],
       [0.17808487, 0.82191513],
       [0.72218994, 0.27781006],
       [0.10240622, 0.89759378],
       [0.95442364, 0.04557636],
       [0.17808487, 0.82191513],
       [0.33743569, 0.66256431],
       [0.05323196, 0.94676804],
       [0.47102951, 0.52897049],
       [0.93240866, 0.06759134],
       [0.8356768 , 0.1643232 ],
       [0.66441502, 0.33558498],
       [0.90039602, 0.09960398],
       [0.

In [None]:
log_y_pred