In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [15]:
# Load the tab-separated training data; the first row holds the column names.
dataset = pd.read_csv('./data/train.tsv', sep='\t', header=0)

# Column names, dtypes and non-null counts.
dataset.info()
# value_counts() only returns a Series; as a bare expression in the middle of a
# cell its result is discarded, so print it to actually see the label counts.
print(dataset['Sentiment'].value_counts())
# Preview of the raw phrase text.
print(dataset['Phrase'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB
0         A series of escapades demonstrating the adage ...
1         A series of escapades demonstrating the adage ...
2                                                  A series
3                                                         A
4                                                    series
                                ...                        
156055                                            Hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avun

In [68]:
from nltk.stem.porter import PorterStemmer

# Normalise the text: trim surrounding whitespace and lowercase every phrase.
dataset['Phrase'] = dataset['Phrase'].str.strip().str.lower()

# Stemming experiment (currently disabled; kept for reference).
# stemmer = PorterStemmer()

# dataset['unstemmed'] = dataset['Phrase'].str.split()
# dataset['stemmedPhrase'] = dataset['unstemmed'].apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
# dataset = dataset.drop(columns=['unstemmed']) # Get rid of the unstemmed column.
# dataset['stemmedPhrase'] = dataset['stemmedPhrase'].apply(lambda x: " ".join(x)) # Re-join the stemmed words.
dataset.info()


# Tokens are runs of ASCII letters; digits and punctuation are dropped.
token = RegexpTokenizer(r'[a-zA-Z]+')

# Bigram bag-of-words vectoriser with English stop words removed.
# token_pattern=None: the default regex is ignored once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn warn about the unused
# parameter.
cv = CountVectorizer(
    stop_words='english',
    ngram_range=(2, 2),
    tokenizer=token.tokenize,
    token_pattern=None,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   PhraseId       156060 non-null  int64 
 1   SentenceId     156060 non-null  int64 
 2   Phrase         156060 non-null  object
 3   Sentiment      156060 non-null  int64 
 4   stemmedPhrase  156060 non-null  object
dtypes: int64(3), object(2)
memory usage: 6.0+ MB


In [83]:
from nltk.stem import WordNetLemmatizer
 
# Single WordNet lemmatizer instance, used by lemmatize_text below.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Split ``text`` on whitespace and lemmatize each piece.

    Returns a list holding one lemmatized string per whitespace-separated
    token, in the order the tokens appear in ``text``.
    """
    lemmas = []
    for word in text.split():
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


# Preview the lemmatized phrases; the result is printed only, not stored.
print(dataset['Phrase'].apply(lemmatize_text))

0         [a, s, e, r, i, e, s, o, f, e, s, c, a, p, a, ...
1         [a, s, e, r, i, e, s, o, f, e, s, c, a, p, a, ...
2                                     [a, s, e, r, i, e, s]
3                                                       [a]
4                                        [s, e, r, i, e, s]
                                ...                        
156055                             [h, e, a, r, s, t, ', s]
156056    [f, o, r, c, e, d, a, v, u, n, c, u, l, a, r, ...
156057    [a, v, u, n, c, u, l, a, r, c, h, o, r, t, l, ...
156058                          [a, v, u, n, c, u, l, a, r]
156059                             [c, h, o, r, t, l, e, s]
Name: Phrase, Length: 156060, dtype: object


In [59]:
from sklearn.model_selection import train_test_split
# Normalise the phrases: trim surrounding whitespace and lowercase.
dataset['Phrase'] = dataset['Phrase'].str.strip().str.lower()

# Split into training and testing data
x = dataset['Phrase']
y = dataset['Sentiment']

# stratify=y splits in a stratified fashion using the labels; the fixed
# random_state makes the split repeatable.
x, x_test, y, y_test = train_test_split(x, y, stratify=y, test_size=0.25, random_state=5)

# Unigram bag-of-words features with English stop words removed.
# token_pattern=None: the default regex is ignored once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn warn about the unused
# parameter.
vec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
    token_pattern=None,
)
# The vocabulary is learned from the training split only; the test split is
# encoded with that same vocabulary.
x = vec.fit_transform(x)
x_test = vec.transform(x_test)

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x, y)
from sklearn import metrics
predicted = model.predict(x_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
accuracy_score = metrics.accuracy_score(y_test, predicted)
print('{:04.2f}%'.format(accuracy_score * 100))

60.93%


In [65]:
import os

import joblib

# Save model
MODEL_PATH = './model/bayes_multinomial_model.pkl'
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
# NOTE(review): only the classifier is saved here. Predicting on raw text in a
# fresh process also needs the fitted `vec`; consider persisting it as well.
joblib.dump(model, MODEL_PATH)

  (0, 7079)	1


In [67]:
# Smoke test: encode one raw phrase with `vec` and ask `model` for its
# predicted label. The phrase is passed as a one-element list.
model.predict(vec.transform(['what a joke']))

ValueError: Expected 2D array, got 1D array instead:
array=['what a joke'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [60]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes, trained and evaluated on the feature matrices and
# labels currently held in x / y / x_test / y_test.
model_bnb = BernoulliNB()
model_bnb.fit(x, y)

predicted_bnb = model_bnb.predict(x_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
accuracy_score_bnb = metrics.accuracy_score(y_test, predicted_bnb)
print('{:04.2f}%'.format(accuracy_score_bnb * 100))

60.72%


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

tfidf = TfidfVectorizer()
# NOTE(review): the vectoriser is fitted on every phrase before the split, so
# IDF weights are computed with the held-out rows included. Consider splitting
# the raw text first and fitting on the training part only.
text_count_2 = tfidf.fit_transform(dataset['Phrase'])
# NOTE(review): this split is not stratified, unlike the count-vector split
# earlier in the notebook, so the two accuracies use different test rows.
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    text_count_2, dataset['Sentiment'], test_size=0.25, random_state=5
)

# `MNB` was referenced without being defined anywhere in the notebook, which
# raises NameError on a fresh kernel; create the classifier here.
MNB = MultinomialNB()
MNB.fit(x_train_tfidf, y_train_tfidf)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
accuracy_score_mnb = metrics.accuracy_score(y_test_tfidf, MNB.predict(x_test_tfidf))
print('accuracy_score_mnb = {:4.2f}%'.format(accuracy_score_mnb * 100))

accuracy_score_mnb = 58.53%
