**Mounting my drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


**Importing dataset**

In [0]:
import pandas as pd
df=pd.read_csv('/content/drive/My Drive/Colab Notebooks/Task 2/consumer_reviews (1).csv')

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

**Dropping unwanted columns**

In [0]:
df.drop(['sub-categories','primary-categories','title'],axis = 1,inplace = True)

In [5]:

df

Unnamed: 0,rating,reviews
0,3,I order 3 of them and one of the item is bad q...
1,4,Bulk is always the less expensive way to go fo...
2,5,Well they are not Duracell but for the price i...
3,5,Seem to work as well as name brand batteries a...
4,5,These batteries are very long lasting the pric...
5,5,Bought a lot of batteries for Christmas and th...
6,5,ive not had any problame with these batteries ...
7,5,Well if you are looking for cheap non-recharge...
8,3,These do not hold the amount of high power jui...
9,4,AmazonBasics AA AAA batteries have done well b...


In [6]:
df['reviews'][5]

"Bought a lot of batteries for Christmas and the AmazonBasics Cell have been good. I haven't noticed a difference between the brand name batteries and the Amazon Basic brand. Just a lot easier to purchase and have arrive at the house and have on hand. Will buy again."

In [7]:
df.groupby('rating').describe()

Unnamed: 0_level_0,reviews,reviews,reviews,reviews
Unnamed: 0_level_1,count,unique,top,freq
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,965,741,Bought this mostly as a backup.and to read a f...,4
2,616,458,Slow processor equates to slow internet browsi...,3
3,1206,822,These are great tablets for the kids and the p...,6
4,5648,3465,good,12
5,19897,12807,good,48





**Funtion for removing unwanted words from reviews**

In [0]:
def process_reviews(unprocessed):
    rmvpunc = [char for char in unprocessed if char not in string.punctuation]
    rmvpunc = ''.join(rmvpunc)
    return [word for word in rmvpunc.split() if word.lower() not in stopwords.words('english')]

In [9]:
 >>> import nltk
 >>> nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True






**Example**

In [10]:

df['reviews'].head(10).apply(process_reviews)

0    [order, 3, one, item, bad, quality, missing, b...
1    [Bulk, always, less, expensive, way, go, produ...
2                       [Well, Duracell, price, happy]
3    [Seem, work, well, name, brand, batteries, muc...
4             [batteries, long, lasting, price, great]
5    [Bought, lot, batteries, Christmas, AmazonBasi...
6    [ive, problame, batteries, ordered, past, plea...
7    [Well, looking, cheap, nonrechargeable, batter...
8    [hold, amount, high, power, juice, like, energ...
9    [AmazonBasics, AA, AAA, batteries, done, well,...
Name: reviews, dtype: object





**Converting text to word count vectors with CountVectorizer**

In [0]:
cvect_transformer = CountVectorizer(analyzer=process_reviews).fit(df['reviews'])

In [12]:

print(len(cvect_transformer.vocabulary_))

15281


In [13]:
message4 = df['reviews'][5]
print(message4)

Bought a lot of batteries for Christmas and the AmazonBasics Cell have been good. I haven't noticed a difference between the brand name batteries and the Amazon Basic brand. Just a lot easier to purchase and have arrive at the house and have on hand. Will buy again.


In [14]:

bow4 = cvect_transformer.transform([message4])
print(bow4)
print(bow4.shape)

  (0, 745)	1
  (0, 750)	1
  (0, 932)	1
  (0, 1018)	1
  (0, 1142)	1
  (0, 1177)	1
  (0, 4641)	1
  (0, 4893)	2
  (0, 5266)	2
  (0, 5425)	1
  (0, 6690)	1
  (0, 7076)	1
  (0, 8242)	1
  (0, 8466)	1
  (0, 8551)	1
  (0, 8769)	1
  (0, 9797)	2
  (0, 10395)	1
  (0, 10617)	1
  (0, 11748)	1
(1, 15281)


In [0]:
reviews_bow = cvect_transformer.transform(df['reviews'])

In [16]:
print(reviews_bow.shape)

(28332, 15281)


**Converting text to word frequency using tfidf**

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(reviews_bow)

In [18]:
reviews_bow[1:2]

<1x15281 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [19]:
reviews_tfidf = tfidf_transformer.transform(reviews_bow)
print(reviews_tfidf.shape)

(28332, 15281)


In [20]:
print(tfidf_transformer.idf_[cvect_transformer.vocabulary_['good']])
print(tfidf_transformer.idf_[cvect_transformer.vocabulary_['worst']])

2.95548536938675
7.588220835899611


In [21]:
reviews_tfidf.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [0]:
dftran=pd.DataFrame(data=reviews_tfidf.toarray())

In [23]:
dftran

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,15241,15242,15243,15244,15245,15246,15247,15248,15249,15250,15251,15252,15253,15254,15255,15256,15257,15258,15259,15260,15261,15262,15263,15264,15265,15266,15267,15268,15269,15270,15271,15272,15273,15274,15275,15276,15277,15278,15279,15280
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
df['rating'] = df['rating'].astype(object)


**DIviding the ratings into positive negative or neutral**

In [0]:
n=len(df.index)
for i in range(n):
  if df['rating'][i] == 1 or df['rating'][i] == 2:
    df['rating'][i] = "negative"
  if df['rating'][i] == 3:
    df['rating'][i] = "neutral"
  if df['rating'][i] == 4 or df['rating'][i] == 5:
    df['rating'][i] = "positive"
    

# Traning and testing in the same consumer review dataset

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reviews_tfidf, df['rating'], test_size=0.25, random_state = 0)

In [27]:
y_train.describe()

count        21249
unique           3
top       positive
freq         19153
Name: rating, dtype: object

In [0]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)

In [0]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [30]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
predictions = clf.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
pscore = metrics.accuracy_score(y_test, predictions)
print("Accuracy =",pscore)

[[   3    0  386]
 [   1    1  300]
 [   3    2 6387]]
              precision    recall  f1-score   support

    negative       0.43      0.01      0.02       389
     neutral       0.33      0.00      0.01       302
    positive       0.90      1.00      0.95      6392

    accuracy                           0.90      7083
   macro avg       0.55      0.34      0.32      7083
weighted avg       0.85      0.90      0.86      7083

Accuracy = 0.902301284766342


In [31]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, random_state=None)
for train_index, test_index in skf.split(reviews_tfidf, df['rating']): 
    X_train, X_test = reviews_tfidf[train_index], reviews_tfidf[test_index] 
    y_train, y_test = df['rating'][train_index], df['rating'][test_index]
    
    clf1 = MultinomialNB().fit(X_train, y_train)
    predictions1 = clf1.predict(X_test)
    cm=confusion_matrix(y_test,predictions1)
    print(cm)
    print(classification_report(y_test,predictions1))
    precision = np.diag(cm) / np.sum(cm, axis = 0)
    print(precision)

[[   6    0  153]
 [   0    0  121]
 [   2    1 2552]]
              precision    recall  f1-score   support

    negative       0.75      0.04      0.07       159
     neutral       0.00      0.00      0.00       121
    positive       0.90      1.00      0.95      2555

    accuracy                           0.90      2835
   macro avg       0.55      0.35      0.34      2835
weighted avg       0.86      0.90      0.86      2835

[0.75       0.         0.90304317]
[[   7    0  151]
 [   0    0  121]
 [   1    1 2553]]
              precision    recall  f1-score   support

    negative       0.88      0.04      0.08       158
     neutral       0.00      0.00      0.00       121
    positive       0.90      1.00      0.95      2555

    accuracy                           0.90      2834
   macro avg       0.59      0.35      0.34      2834
weighted avg       0.86      0.90      0.86      2834

[0.875      0.         0.90371681]
[[   6    0  152]
 [   2    0  119]
 [   7    0 2548]]
   

  'precision', 'predicted', average, warn_for)
  del sys.path[0]
  'precision', 'predicted', average, warn_for)
  del sys.path[0]


              precision    recall  f1-score   support

    negative       0.22      0.01      0.02       158
     neutral       0.00      0.00      0.00       121
    positive       0.90      1.00      0.95      2555

    accuracy                           0.90      2834
   macro avg       0.38      0.34      0.32      2834
weighted avg       0.83      0.90      0.86      2834

[0.22222222 0.         0.9029745 ]
[[   5    0  153]
 [   0    0  121]
 [   0    1 2553]]
              precision    recall  f1-score   support

    negative       1.00      0.03      0.06       158
     neutral       0.00      0.00      0.00       121
    positive       0.90      1.00      0.95      2554

    accuracy                           0.90      2833
   macro avg       0.63      0.34      0.34      2833
weighted avg       0.87      0.90      0.86      2833

[1.         0.         0.90307747]
[[   5    0  153]
 [   0    0  120]
 [   2    0 2552]]
              precision    recall  f1-score   support

   

  'precision', 'predicted', average, warn_for)
  del sys.path[0]
  'precision', 'predicted', average, warn_for)
  del sys.path[0]
  'precision', 'predicted', average, warn_for)


[[   2    0  156]
 [   1    0  119]
 [   0    0 2554]]
              precision    recall  f1-score   support

    negative       0.67      0.01      0.02       158
     neutral       0.00      0.00      0.00       120
    positive       0.90      1.00      0.95      2554

    accuracy                           0.90      2832
   macro avg       0.52      0.34      0.32      2832
weighted avg       0.85      0.90      0.86      2832

[0.66666667        nan 0.90279251]
[[   1    0  157]
 [   0    0  120]
 [   2    0 2552]]
              precision    recall  f1-score   support

    negative       0.33      0.01      0.01       158
     neutral       0.00      0.00      0.00       120
    positive       0.90      1.00      0.95      2554

    accuracy                           0.90      2832
   macro avg       0.41      0.34      0.32      2832
weighted avg       0.83      0.90      0.86      2832

[0.33333333        nan 0.90208554]
[[   2    0  156]
 [   0    0  120]
 [   2    1 2551]]
   

  del sys.path[0]


In [0]:
review_model = MultinomialNB().fit(reviews_tfidf, df['rating'])

In [33]:
input1=["the best product i ever used"]
input2=["terrible worst battery ever used"]
print(review_model.predict(cvect_transformer.transform(input1)))
print(review_model.predict(cvect_transformer.transform(input2)))

['positive']
['negative']


**For API**

In [0]:
import pickle
name="picklenewfile"
pickle.dump(review_model,open(name,'wb'))

# Importing the validation dataset for testing

In [0]:
val2=pd.read_csv('/content/drive/My Drive/Colab Notebooks/Task 2/validation.csv')
val2.drop(['reviews.title'],axis=1,inplace=True)

# For binary classification (Positive or Negative)

**Importing the consumer review dataset for training**

In [0]:
df2=pd.read_csv('/content/drive/My Drive/Colab Notebooks/Task 2/consumer_reviews (1).csv')

In [0]:
df2.drop(['sub-categories','primary-categories','title'],axis = 1,inplace = True)

**Filling null values**

In [0]:
val2['reviews.rating']=val2['reviews.rating'].ffill(axis = 0)

In [0]:
val2['reviews.rating']=val2['reviews.rating'].astype(object)

**Converting ratings as positive or negative of validation set**

In [0]:
n1=len(val2.index)
for i in range(n1):
  if val2['reviews.rating'][i] == 1 or val2['reviews.rating'][i] == 2 or val2['reviews.rating'][i] == 3:
    val2['reviews.rating'][i] = "negative"
  if val2['reviews.rating'][i] == 4 or val2['reviews.rating'][i] == 5:
    val2['reviews.rating'][i] = "positive"

In [0]:
val2['reviews.rating']=val2['reviews.rating'].astype(str)

In [42]:
val2.groupby('reviews.rating').describe()

Unnamed: 0_level_0,reviews.text,reviews.text,reviews.text,reviews.text
Unnamed: 0_level_1,count,unique,top,freq
reviews.rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
negative,1343,1343,Nothing but problems after a month. Luckily I ...,1
positive,18657,18657,"Love it, easy to operate. Very reasonable pric...",1


In [43]:
df2.groupby('rating').describe()

Unnamed: 0_level_0,reviews,reviews,reviews,reviews
Unnamed: 0_level_1,count,unique,top,freq
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,965,741,Bought this mostly as a backup.and to read a f...,4
2,616,458,Slow processor equates to slow internet browsi...,3
3,1206,822,These are great tablets for the kids and the p...,6
4,5648,3465,good,12
5,19897,12807,good,48


In [44]:
val2['reviews.rating'].isnull().sum()

0

**Converting ratings as positive or negative**

In [45]:
n=len(df2.index)
for i in range(n):
  if df2['rating'][i] == 1 or df2['rating'][i] == 2 or df2['rating'][i] == 3:
    df2['rating'][i] = "negative"
  if df2['rating'][i] == 4 or df2['rating'][i] == 5:
    df2['rating'][i] = "positive"
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


**Creating a pipeline which will preprocess the reviews into bag of words then token integer counts then into tfidf vectors and will train it into naive bayes classifier in a single line of code**

In [0]:
from sklearn.pipeline import Pipeline
pipeline1 = Pipeline([
    ('bow', CountVectorizer(analyzer=process_reviews)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

**Distributing the Consumer review dataset into train and validation datset into test**

In [0]:
xtrain2 = df2['reviews']
ytrain2 = df2['rating']
xtest2 = val2['reviews.text']
ytest2 = val2['reviews.rating']

**Training the consumer review dataset**

In [48]:
pipeline1.fit(xtrain2,ytrain2)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function process_reviews at 0x7fd612034598>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

**Predicting the validation dataset ratings**

In [0]:
predictions2 = pipeline1.predict(xtest2)

**Calculating the accuracy of validation dataset**

In [50]:
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(confusion_matrix(ytest2,predictions2))
print(classification_report(ytest2,predictions2))
pscore = metrics.accuracy_score(ytest2, predictions2)
print("Accuracy =",pscore*100)

[[   19  1324]
 [    8 18649]]
              precision    recall  f1-score   support

    negative       0.70      0.01      0.03      1343
    positive       0.93      1.00      0.97     18657

    accuracy                           0.93     20000
   macro avg       0.82      0.51      0.50     20000
weighted avg       0.92      0.93      0.90     20000

Accuracy = 93.34


# **Accuracy = 93.34%**