In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

In [4]:
# Load the labelled sentences. Column names are supplied here, so every line of
# the CSV (including the first) is read as a data row.
column_names = ["message", "review"]
data = pd.read_csv("text_classification.csv", header=None, names=column_names)
data.head()

Unnamed: 0,message,review
0,I love this sandwich,pos
1,this is an amazing place,pos
2,I feel very good about these beers,pos
3,this is my best work,pos
4,what an awesome view,pos


In [5]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(18, 2)

In [9]:
# Encode the sentiment labels as integers for the classifier: pos -> 1, neg -> 0.
label_to_num = {"pos": 1, "neg": 0}
encoded_labels = data["review"].map(label_to_num)

# Series.map yields NaN for every value that is not a key of the mapping.
# Fail here, naming the offending labels, instead of letting NaN targets
# travel on to the train/test split and the model.
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_labels = data.loc[unmapped, "review"].unique().tolist()
    raise ValueError(f"Unrecognised review labels (expected 'pos' or 'neg'): {bad_labels}")

# Every label mapped, so the column can be stored as plain integers.
data["labelnum"] = encoded_labels.astype("int64")
data.head(7)

Unnamed: 0,message,review,labelnum
0,I love this sandwich,pos,1
1,this is an amazing place,pos,1
2,I feel very good about these beers,pos,1
3,this is my best work,pos,1
4,what an awesome view,pos,1
5,I do not like this restaurant,neg,0
6,I am tired of this stuff,neg,0


In [16]:
# Features are the raw sentences; targets are the numeric sentiment labels.
X, Y = data["message"], data["labelnum"]

# Preview the first rows of each series, separated by a blank line.
print(X.head(), Y.head(), sep=" \n\n ")

0                  I love this sandwich
1              this is an amazing place
2    I feel very good about these beers
3                  this is my best work
4                  what an awesome view
Name: message, dtype: object 

 0    1
1    1
2    1
3    1
4    1
Name: labelnum, dtype: int64


# Splitting the data into train and test data


In [31]:
# Hold out 20% of the sentences for evaluation; the fixed random_state keeps
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

# Report how many samples ended up in each split.
print(
    f"Total no.of training data samples : {x_train.shape} \n"
    f"Total no.of testing data samples : {x_test.shape}"
)

Total no.of training data samples : (14,) 
Total no.of testing data samples : (4,)


# Extracting words (tokens) from the text documents

In [37]:
# Bag-of-words encoder: the vocabulary is learned from the training sentences
# only, and the test sentences are then encoded with that same vocabulary.
cv = CountVectorizer()
x_train_training = cv.fit_transform(x_train)  # fit vocabulary + encode training data
x_test_trans = cv.transform(x_test)           # encode testing data with the fitted vocabulary

# Show the learned vocabulary under a heading.
print(
    "\nThe words or tokens in the text documents \n",
    cv.get_feature_names_out(),
    sep="\n",
)


CountVectorizer()

The words or tokens in the text documents 

['am' 'an' 'and' 'awesome' 'bad' 'best' 'boss' 'can' 'dance' 'deal' 'do'
 'enemy' 'fun' 'good' 'have' 'he' 'horrible' 'house' 'is' 'like'
 'locality' 'love' 'my' 'not' 'of' 'place' 'restaurant' 'sandwich' 'sick'
 'stay' 'stuff' 'sworn' 'that' 'this' 'tired' 'to' 'today' 'tomorrow'
 'view' 'we' 'went' 'what' 'will' 'with' 'work']


In [39]:
# Run help(cv) for a fuller description of CountVectorizer.

In [33]:
# Dense view of the training count matrix: one row per training sentence and
# one column per vocabulary token.
token_counts = x_train_training.toarray()
token_names = cv.get_feature_names_out()
df = pd.DataFrame(data=token_counts, columns=token_names)
print(df)

    am  an  and  awesome  bad  best  boss  can  dance  deal  ...  to  today  \
0    0   1    0        1    0     0     0    0      0     0  ...   0      0   
1    1   0    1        0    0     0     0    0      0     0  ...   0      0   
2    1   0    0        0    0     0     0    0      0     0  ...   0      0   
3    0   0    0        0    0     0     0    1      0     1  ...   0      0   
4    0   0    0        0    0     0     0    0      1     0  ...   1      0   
5    0   0    0        0    0     0     1    0      0     0  ...   0      0   
6    0   0    0        0    0     0     0    0      0     0  ...   1      1   
7    0   0    0        0    0     0     0    0      0     0  ...   0      0   
8    0   0    0        0    0     0     0    0      0     0  ...   0      0   
9    0   0    0        0    0     0     0    0      0     0  ...   0      0   
10   0   0    0        0    1     0     0    0      0     0  ...   1      0   
11   0   0    0        0    0     0     0    0      

# Training a Naive Bayes (NB) classifier on the training data

In [34]:
# Fit a multinomial Naive Bayes model on the training token counts.
clf = MultinomialNB()
clf.fit(x_train_training, y_train)

# Predict sentiment labels for the held-out sentences.
predicted = clf.predict(x_test_trans)

## Accuracy score, confusion matrix, precision score, and recall score

In [35]:
# Each entry pairs the printed heading with the metric computed on the
# held-out labels versus the model's predictions.
metric_reports = (
    ("\nAccuracy of classifier is :", accuracy_score),
    ("\nConfusion Matrix :", confusion_matrix),
    ("\nPrecision score is :", precision_score),
    ("\nRecall score is :", recall_score),
)

for heading, metric in metric_reports:
    print(heading, metric(y_test, predicted))


Accuracy of classifier is : 1.0

Confusion Matrix : [[1 0]
 [0 3]]

Precision score is : 1.0

Recall score is : 1.0
