In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('amazon_cells_labelled.txt',delimiter ='\t',names=['message','output'])

In [3]:
df

Unnamed: 0,message,output
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
...,...,...
995,The screen does get smudged easily because it ...,0
996,What a piece of junk.. I lose more calls on th...,0
997,Item Does Not Match Picture.,0
998,The only thing that disappoint me is the infra...,0


In [4]:
# remove punctuations
df.replace("[^a-z A-Z]"," ",regex=True,inplace=True)

In [5]:
df.head()

Unnamed: 0,message,output
0,So there is no way for me to plug it in here i...,0
1,Good case Excellent value,1
2,Great for the jawbone,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great,1


In [6]:
# convert all upper case to lower case
df['message']=df['message'].str.lower()

In [7]:
df.head()

Unnamed: 0,message,output
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


In [8]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varshab1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
ps=PorterStemmer()
for i in range(len(df['message'])):
    review=df.values[i][0]
    review=review.split()
    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    df.values[i][0]=review

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [11]:
# implement bag of words
cv=CountVectorizer(ngram_range=(2,2))
x=cv.fit_transform(df['message'])

In [12]:
x

<1000x5984 sparse matrix of type '<class 'numpy.int64'>'
	with 8578 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,df['output'])

In [14]:
x_train

<750x5984 sparse matrix of type '<class 'numpy.int64'>'
	with 6627 stored elements in Compressed Sparse Row format>

In [15]:
x_test

<250x5984 sparse matrix of type '<class 'numpy.int64'>'
	with 1951 stored elements in Compressed Sparse Row format>

In [16]:
# implement random forest algorithm
rfc=RandomForestClassifier(n_estimators=200,criterion='entropy')
rfc.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
y_pred=rfc.predict(x_test)

In [18]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [19]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[114  15]
 [ 50  71]]
0.74
              precision    recall  f1-score   support

           0       0.70      0.88      0.78       129
           1       0.83      0.59      0.69       121

    accuracy                           0.74       250
   macro avg       0.76      0.74      0.73       250
weighted avg       0.76      0.74      0.73       250



In [21]:
x_train=x_train.toarray()
x_test=x_test.toarray()
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(x_train,y_train).predict(x_test)

In [22]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[ 62  67]
 [ 15 106]]
0.672
              precision    recall  f1-score   support

           0       0.81      0.48      0.60       129
           1       0.61      0.88      0.72       121

    accuracy                           0.67       250
   macro avg       0.71      0.68      0.66       250
weighted avg       0.71      0.67      0.66       250



In [None]:
#################Passive Aggreasive classifier for text classification

In [26]:
from sklearn.linear_model import PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(x_train,y_train)
y_pred=pac.predict(x_test)


In [27]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[80 49]
 [24 97]]
0.708
              precision    recall  f1-score   support

           0       0.77      0.62      0.69       129
           1       0.66      0.80      0.73       121

    accuracy                           0.71       250
   macro avg       0.72      0.71      0.71       250
weighted avg       0.72      0.71      0.71       250

