In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [20]:
dataset = pd.read_csv('../data/train.tsv', sep='\t', header=0)
dataset.info()
dataset.Sentiment.value_counts()
print(dataset['Phrase'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB
0         A series of escapades demonstrating the adage ...
1         A series of escapades demonstrating the adage ...
2                                                  A series
3                                                         A
4                                                    series
                                ...                        
156055                                            Hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avun

In [21]:
from nltk.stem.porter import PorterStemmer

# Convert text to lowercase
dataset = dataset.dropna()
dataset['Phrase'] = dataset['Phrase'].str.strip().str.lower()

# Use English stemmer.
# stemmer = PorterStemmer()

# dataset['unstemmed'] = dataset['Phrase'].str.split()
# dataset['stemmedPhrase'] = dataset['unstemmed'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
# dataset = dataset.drop(columns=['unstemmed']) # Get rid of the unstemmed column.
# dataset['Phrase'] = dataset['stemmedPhrase'].apply(lambda x: " ".join(x)) # Stem every word.
dataset.info()


token = RegexpTokenizer(r'[a-zA-Z]+')
cv = CountVectorizer(stop_words='english',ngram_range = (3,3),tokenizer = token.tokenize)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 6.0+ MB


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Split into training and testing data
x = dataset['Phrase']
y = dataset['Sentiment']

x, x_test, y, y_test = train_test_split(x,y, stratify=y, test_size=0.25, random_state=5)
vec = CountVectorizer(stop_words='english',ngram_range = (1,2),tokenizer = token.tokenize)
x = vec.fit_transform(x)
x_test = vec.transform(x_test)

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x, y)
from sklearn import metrics
predicted = model.predict(x_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)
print(str('Accuracy {:04.2f}'.format(accuracy_score*100))+'%')

report_mnb_reg = classification_report(y_test, predicted, output_dict=True)
print("This is the report for MultinomialNB with CountVectorizer\n")
print(report_mnb_reg)

Accuracy 61.35%
This is the report for MultinomialNB with CountVectorizer

{'0': {'precision': 0.39909297052154197, 'recall': 0.39819004524886875, 'f1-score': 0.39864099660249147, 'support': 1768}, '1': {'precision': 0.48560830860534127, 'recall': 0.4800528014080375, 'f1-score': 0.48281457442100606, 'support': 6818}, '2': {'precision': 0.7267499504263335, 'recall': 0.7368685599396834, 'f1-score': 0.731774278084209, 'support': 19895}, '3': {'precision': 0.529440688383528, 'recall': 0.5232021379980564, 'f1-score': 0.526302926620639, 'support': 8232}, '4': {'precision': 0.45054446460980035, 'recall': 0.4313640312771503, 'f1-score': 0.44074567243675095, 'support': 2302}, 'accuracy': 0.6135332564398308, 'macro avg': {'precision': 0.518287276509309, 'recall': 0.5139355151743592, 'f1-score': 0.5160556896330193, 'support': 39015}, 'weighted avg': {'precision': 0.6118332228114871, 'recall': 0.6135332564398308, 'f1-score': 0.6126463806938731, 'support': 39015}}


In [14]:
import joblib
import pickle

# Save model
joblib.dump(model, '../model/bayes_multinomial_model.pkl')

file = open('../temp/vec_bayes', 'wb')
pickle.dump(vec, file)
file.close()

In [15]:
model.predict(vec.transform(['The Movie is AWesome']))

array([4])

In [17]:
model.predict(vec.transform(['The Movie is Not that good']))

array([3])

In [18]:
model.predict(vec.transform(['The smell is bad']))

array([1])

In [16]:
from sklearn.naive_bayes import BernoulliNB
model_bnb = BernoulliNB()

model_bnb.fit(x, y)
predicted_bnb = model_bnb.predict(x_test)
accuracy_score_bnb = metrics.accuracy_score(predicted_bnb, y_test)
print(str('{:04.2f}'.format(accuracy_score_bnb*100))+'%')
report_bnb = classification_report(y_test, predicted_bnb, output_dict=True)
print("This is the report for BernoulliNB\n")
print(report_bnb)

60.71%
This is the report for BernoulliNB

{'0': {'precision': 0.44457409568261375, 'recall': 0.2154977375565611, 'f1-score': 0.2902857142857143, 'support': 1768}, '1': {'precision': 0.5188916876574308, 'recall': 0.36256966852449396, 'f1-score': 0.42686927991711276, 'support': 6818}, '2': {'precision': 0.6451382358472822, 'recall': 0.8620758984669515, 'f1-score': 0.7379948364888124, 'support': 19895}, '3': {'precision': 0.5558505930253143, 'recall': 0.3814382896015549, 'f1-score': 0.4524169728405734, 'support': 8232}, '4': {'precision': 0.4672413793103448, 'recall': 0.23544743701129453, 'f1-score': 0.31311380704794917, 'support': 2302}, 'accuracy': 0.6070998333974112, 'macro avg': {'precision': 0.5263391983045972, 'recall': 0.4114058062321712, 'f1-score': 0.44413612211603243, 'support': 39015}, 'weighted avg': {'precision': 0.5846517356594567, 'recall': 0.6070998333974112, 'f1-score': 0.5780113205055727, 'support': 39015}}


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
text_count_2 = tfidf.fit_transform(dataset['Phrase'])
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(text_count_2, dataset['Sentiment'],test_size=0.25,random_state=5)

MNB = MultinomialNB()
MNB.fit(x_train_tfidf, y_train_tfidf)
predicted_mnb = MNB.predict(x_test_tfidf)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_test_tfidf), y_test_tfidf)
print('accuracy_score_mnb = '+str('{:4.2f}'.format(accuracy_score_mnb*100))+'%')

report_bnb = classification_report(y_test, predicted_mnb, output_dict=True)
print("This is the report for MultinomialNB using TfidfVectorizer\n")
print(report_bnb)

accuracy_score_mnb = 57.95%
This is the report for MultinomialNB using TfidfVectorizer

{'0': {'precision': 0.057692307692307696, 'recall': 0.003393665158371041, 'f1-score': 0.00641025641025641, 'support': 1768}, '1': {'precision': 0.1711193777477173, 'recall': 0.07421531240833089, 'f1-score': 0.10352941176470588, 'support': 6818}, '2': {'precision': 0.5100819052004868, 'recall': 0.7794420708720784, 'f1-score': 0.6166295530459679, 'support': 19895}, '3': {'precision': 0.21150278293135436, 'recall': 0.13848396501457727, 'f1-score': 0.16737630303920129, 'support': 8232}, '4': {'precision': 0.0736196319018405, 'recall': 0.005212858384013901, 'f1-score': 0.00973630831643002, 'support': 2302}, 'accuracy': 0.4401127771369986, 'macro avg': {'precision': 0.20480320109474132, 'recall': 0.2001495743674743, 'f1-score': 0.18073636651531227, 'support': 39015}, 'weighted avg': {'precision': 0.3415951486140348, 'recall': 0.4401127771369986, 'f1-score': 0.36871194486627595, 'support': 39015}}
