In [None]:
import pandas as pd
import numpy as np

from nltk.stem.snowball import SnowballStemmer

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Load the cleaned review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./datasets/cleaned_reviews.csv')

# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV — confirm whether it should be dropped.
df

Unnamed: 0,text,label
0,product is good but price is to high after pur...,2
1,not gudpoor cloth,0
2,did not show correct reading even once,0
3,4 star for short power chord thanks flipkart f...,2
4,issue is mop road rotatingits off worst qualit...,0
...,...,...
17365,very bad device,0
17366,product is gooddelivery person is polite alsot...,1
17367,value for money average,1
17368,beautiful,1


In [None]:
# Features are the raw review text; targets are the integer class labels.
X, Y = df["text"], df["label"]

In [None]:
# Sanity check: features and labels should have the same number of rows.
print(f'X: {X.shape}', f'Y: {Y.shape}', sep='\n')

X: (17370,)
Y: (17370,)


In [None]:
# Spot-check ten random reviews (no random_state is passed, so the rows differ on every run).
X.sample(10)

8144      its nice product in resonable price working good
14501                       prize is high but nice product
59       a bit of heating issues at the ultra setting a...
15105                                           bad coltey
12944    in add video it is rotational but in actual it...
15544                                         not valuable
176      soundbar is very nice clear voice good bass bu...
5487     it is just like a normal sock it doesnt do you...
6790     most cheapest product i have ever received nev...
12302    it looks simple design nothing is special in t...
Name: text, dtype: object

In [None]:
# Spot-check ten random labels; this draw is independent of the text sample,
# so the row indices do not line up with it.
Y.sample(10)

3035     0
2617     0
3691     0
15555    2
1909     0
10193    0
6159     0
7070     2
6220     2
3340     1
Name: label, dtype: int64

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four splits, in the order they were returned.
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((13896,), (3474,), (13896,), (3474,))

In [None]:
# Bag-of-words features: learn the vocabulary from the training text only and
# encode each training review as a sparse row of token counts.
vectorizer = CountVectorizer()

feature_vector = vectorizer.fit_transform(x_train)

feature_vector

<13896x12871 sparse matrix of type '<class 'numpy.int64'>'
	with 147339 stored elements in Compressed Sparse Row format>

In [None]:
# Materialise the sparse counts as a dense array for GaussianNB, which does not
# accept sparse input. toarray() returns a plain ndarray; todense() would return
# the legacy np.matrix type instead.
# NOTE: going by the shape reported for feature_vector (13896 x 12871, int64),
# the dense copy needs roughly 1.4 GB of memory.
X_dense = feature_vector.toarray()

X_dense

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Densifying must not change the shape: one row per training review,
# one column per vocabulary term.
X_dense.shape

(13896, 12871)

In [None]:
def summarize_classification(y_test, y_pred):
    """Print a short classification report for a set of predictions.

    Reports the number of evaluated samples, the count and fraction of
    correct predictions, and the support-weighted precision and recall.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        The metrics are printed rather than returned.
    """
    # Compute every metric up front so nothing is printed if one of them fails.
    n_correct = accuracy_score(y_test, y_pred, normalize=False)
    frac_correct = accuracy_score(y_test, y_pred, normalize=True)
    weighted_precision = precision_score(y_test, y_pred, average='weighted')
    weighted_recall = recall_score(y_test, y_pred, average='weighted')

    report_rows = (
        ("Length of testing data: ", len(y_test)),
        ("accuracy_count : ", n_correct),
        ("accuracy_score : ", frac_correct),
        ("precision_score : ", weighted_precision),
        ("recall_score : ", weighted_recall),
    )
    for caption, value in report_rows:
        print(caption, value)

In [None]:
# Gaussian Naive Bayes classifier, constructed with its default hyperparameters.
clf = GaussianNB()

clf

In [None]:
# np.asarray guarantees a plain ndarray even when X_dense is an np.matrix.
clf.fit(np.asarray(X_dense), y_train)

In [None]:
# Encode the held-out text with the vocabulary learned on the training set
# (transform only, no refit), then densify it for GaussianNB. toarray() gives a
# plain ndarray rather than the legacy np.matrix returned by todense().
feature_vector_test = vectorizer.transform(x_test)

X_dense_test = feature_vector_test.toarray()

X_dense_test.shape

(3474, 12871)

In [None]:
# Predict a class label for every held-out review.
y_pred = clf.predict(np.asarray(X_dense_test))

y_pred

array([0, 1, 1, ..., 1, 1, 0])

In [None]:
# Baseline scores for the manual vectorise -> densify -> fit workflow.
# NOTE(review): the recorded run shows accuracy of about 0.40; the labels seen in
# this notebook are 0/1/2, so that is a weak baseline — worth checking the class
# balance and whether GaussianNB is a sensible fit for sparse count features.
summarize_classification(y_test, y_pred)

Length of testing data:  3474
accuracy_count :  1391
accuracy_score :  0.40040299366724236
precision_score :  0.4678685235364118
recall_score :  0.40040299366724236


### Convert operations to a pipeline

In [None]:
# Chain vectorisation, densification and the classifier so that a single
# fit/predict call runs every step on the raw text.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    # GaussianNB does not accept sparse input. toarray() yields a plain ndarray
    # directly, instead of building an np.matrix via todense() and converting it.
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), validate=False)),
    ('clf', GaussianNB())
])

pipeline

In [None]:
# Fit every pipeline step, starting from the raw training text.
pipeline.fit(x_train, y_train)

In [None]:
# The pipeline applies the fitted vectoriser and densification to the test text
# before the classifier predicts.
y_pred = pipeline.predict(x_test)

y_pred

array([0, 1, 1, ..., 1, 1, 0])

In [None]:
# Expected to reproduce the manual workflow's scores, since the pipeline runs the same steps.
summarize_classification(y_test, y_pred)

Length of testing data:  3474
accuracy_count :  1391
accuracy_score :  0.40040299366724236
precision_score :  0.4678685235364118
recall_score :  0.40040299366724236


### Stopword removal

In [None]:
# Same pipeline as the baseline, except the vectoriser drops scikit-learn's
# built-in English stop words before counting.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    # GaussianNB does not accept sparse input. toarray() yields a plain ndarray
    # directly, instead of building an np.matrix via todense() and converting it.
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), validate=False)),
    ('clf', GaussianNB())
])

pipeline

In [None]:
# Refit from scratch with the stop-word-filtered vocabulary.
pipeline.fit(x_train, y_train)

In [None]:
# Predictions from the stop-word-filtered pipeline (overwrites the earlier y_pred).
y_pred = pipeline.predict(x_test)

y_pred

array([0, 1, 1, ..., 1, 1, 0])

In [None]:
# Scores with stop words removed, for comparison against the baseline pipeline.
summarize_classification(y_test, y_pred)

Length of testing data:  3474
accuracy_count :  1384
accuracy_score :  0.3983880253310305
precision_score :  0.4643989151333965
recall_score :  0.3983880253310305


### Stemming + Count Vectorization

In [None]:
# Shared helpers, built once and reused by every call to ``stemmed_words``.
analyzer = CountVectorizer().build_analyzer()
stemmer = SnowballStemmer('english')


def stemmed_words(doc):
    """Tokenise ``doc`` and lazily yield the stem of every token.

    The document is split into tokens by the analyzer of a default
    ``CountVectorizer``; each token is then reduced to its stem with the
    English Snowball stemmer.

    Parameters
    ----------
    doc : str
        A single raw document.

    Returns
    -------
    generator
        The stemmed tokens, in the order the analyzer produced them.
    """
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

In [None]:
# Same pipeline shape as before, but the vectoriser counts stemmed tokens:
# passing a callable as `analyzer` replaces CountVectorizer's own tokenisation
# with `stemmed_words`.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(analyzer = stemmed_words)),
    # GaussianNB does not accept sparse input. toarray() yields a plain ndarray
    # directly, instead of building an np.matrix via todense() and converting it.
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), validate=False)),
    ('clf', GaussianNB())
])

pipeline

In [None]:
# Fit on the training text and predict the held-out reviews in one chained call;
# Pipeline.fit returns the fitted pipeline itself.
y_pred = pipeline.fit(x_train, y_train).predict(x_test)

y_pred

array([0, 1, 1, ..., 1, 1, 0])

In [None]:
# Scores with stemmed tokens, for comparison against the two earlier pipelines.
summarize_classification(y_test, y_pred)

Length of testing data:  3474
accuracy_count :  1377
accuracy_score :  0.3963730569948187
precision_score :  0.47457291405399143
recall_score :  0.3963730569948187
