In [508]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
alt.data_transformers.enable('csv')

# Code for hiding seaborn warnings
import warnings
warnings.filterwarnings("ignore")
from sklearn.pipeline import Pipeline

In [509]:
df_text = pd.read_csv('/Users/Ellina/Desktop/DS552/HW/HW5/ps5_tweets_text.csv', sep=',',error_bad_lines=False,encoding='utf-8')
df_labels = pd.read_csv('ps5_tweets_labels.csv',sep=',',error_bad_lines=False,encoding='utf-8')
df_numbers = pd.read_csv('ps5_tweets_labels_as_numbers.csv',sep=',',error_bad_lines=False,encoding='utf-8')



In [510]:
df_text


Unnamed: 0,Id,Tweet
0,0,https://t.co/UpjxfOgQs8\r\r\n\r\r\nGaisss! Ple...
1,1,@mygovindia Today just after a week of lockdow...
2,2,Tuskys partners with Amref to provide on groun...
3,3,@chrissyteigen are u doing ur own grocery shop...
4,4,UK Critical Care Nurse Cries at Empty SuperMar...
...,...,...
37036,37036,Minnesota classifies grocery store workers as ...
37037,37037,US Senator @ewarren has asked for information ...
37038,37038,Just commented on @thejournal_ie: Poll: Are yo...
37039,37039,My wife got laid off yesterday because the sma...


In [511]:
df_labels


Unnamed: 0,Id,Sentiment
0,0,Extremely Positive
1,1,Negative
2,2,Neutral
3,3,Negative
4,4,Extremely Negative
...,...,...
37036,37036,Negative
37037,37037,Negative
37038,37038,Extremely Negative
37039,37039,Neutral


In [512]:
df_numbers


Unnamed: 0,Id,Label
0,0,4
1,1,1
2,2,2
3,3,1
4,4,0
...,...,...
37036,37036,1
37037,37037,1
37038,37038,0
37039,37039,2


In [513]:
print(df_text.shape)
print(df_labels.shape)
print(df_numbers.shape)

(37041, 2)
(37041, 2)
(37041, 2)


# 1. Data Preparing

In [514]:
#check some statistic 
print(df_labels['Sentiment'].value_counts())
print(df_numbers['Label'].value_counts())

Positive              10282
Negative               8930
Neutral                6930
Extremely Positive     5953
Extremely Negative     4946
Name: Sentiment, dtype: int64
3    10282
1     8930
2     6930
4     5953
0     4946
Name: Label, dtype: int64


In [515]:
df_text = df_text[['Tweet']]
df_text

Unnamed: 0,Tweet
0,https://t.co/UpjxfOgQs8\r\r\n\r\r\nGaisss! Ple...
1,@mygovindia Today just after a week of lockdow...
2,Tuskys partners with Amref to provide on groun...
3,@chrissyteigen are u doing ur own grocery shop...
4,UK Critical Care Nurse Cries at Empty SuperMar...
...,...
37036,Minnesota classifies grocery store workers as ...
37037,US Senator @ewarren has asked for information ...
37038,Just commented on @thejournal_ie: Poll: Are yo...
37039,My wife got laid off yesterday because the sma...


In [516]:
df_labels = df_labels[['Sentiment']]
df_labels

Unnamed: 0,Sentiment
0,Extremely Positive
1,Negative
2,Neutral
3,Negative
4,Extremely Negative
...,...
37036,Negative
37037,Negative
37038,Extremely Negative
37039,Neutral


In [517]:
df_numbers = df_numbers[['Label']]
df_numbers

Unnamed: 0,Label
0,4
1,1
2,2
3,1
4,0
...,...
37036,1
37037,1
37038,0
37039,2


In [518]:
df = pd.concat([df_text, df_labels, df_numbers], axis=1)
df

Unnamed: 0,Tweet,Sentiment,Label
0,https://t.co/UpjxfOgQs8\r\r\n\r\r\nGaisss! Ple...,Extremely Positive,4
1,@mygovindia Today just after a week of lockdow...,Negative,1
2,Tuskys partners with Amref to provide on groun...,Neutral,2
3,@chrissyteigen are u doing ur own grocery shop...,Negative,1
4,UK Critical Care Nurse Cries at Empty SuperMar...,Extremely Negative,0
...,...,...,...
37036,Minnesota classifies grocery store workers as ...,Negative,1
37037,US Senator @ewarren has asked for information ...,Negative,1
37038,Just commented on @thejournal_ie: Poll: Are yo...,Extremely Negative,0
37039,My wife got laid off yesterday because the sma...,Neutral,2


# 2. Exploratory Data Analysis

### Number of tweets in each sentiment

In [519]:
#Code Reference: https://github.com/miguelfzafra/Latest-News-Classifier/blob/master/0.%20Latest%20News%20Classifier/02.%20Exploratory%20Data%20Analysis/02.%20Exploratory%20Data%20Analysis.ipynb


In [520]:
bars = alt.Chart(df).mark_bar(size=50).encode(
    x=alt.X("Sentiment"),
    y=alt.Y("count():Q", axis=alt.Axis(title='Number of Tweets')),
    tooltip=[alt.Tooltip('count()', title='Number of Tweets'), 'Sentiment'],
    color='Sentiment'

)

text = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text='count()'
)

(bars + text).interactive().properties(
    height=300, 
    width=700,
    title = "Number of Tweets in each sentiment",
)

### Percentage of tweets in each sentiment

In [521]:
df['id'] = 1
df2 = pd.DataFrame(df.groupby('Sentiment').count()['id']).reset_index()

bars = alt.Chart(df2).mark_bar(size=50).encode(
    x=alt.X('Sentiment'),
    y=alt.Y('PercentOfTotal:Q', axis=alt.Axis(format='.0%', title='% of Tweets')),
    color='Sentiment'
).transform_window(
    TotalArticles='sum(id)',
    frame=[None, None]
).transform_calculate(
    PercentOfTotal="datum.id / datum.TotalArticles"
)

text = bars.mark_text(
    align='center',
    baseline='bottom',
    #dx=5  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text=alt.Text('PercentOfTotal:Q', format='.1%')
)

(bars + text).interactive().properties(
    height=300, 
    width=700,
    title = "% of tweets in each sentiment",
)

# 3. Data Preprocessing

- 1.Remove special character, URL, retweet
- 2.Lowercase
- 3.Puncation Signs
- 4.Possessive Pronouns 
- 5.Optional : Try Lemmatizer (WordNet Lemmatizer).

In [522]:
import nltk
import tensorflow as tf
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('wordnet')
#Reference: https://github.com/miguelfzafra/Latest-News-Classifier/blob/master/0.%20Latest%20News%20Classifier/03.%20Feature%20Engineering/03.%20Feature%20Engineering.ipynb
#Reference: https://machinelearningmastery.com/clean-text-machine-learning-python/

[nltk_data] Downloading package punkt to /Users/Ellina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Ellina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [523]:
df.head(10)


Unnamed: 0,Tweet,Sentiment,Label,id
0,https://t.co/UpjxfOgQs8\r\r\n\r\r\nGaisss! Ple...,Extremely Positive,4,1
1,@mygovindia Today just after a week of lockdow...,Negative,1,1
2,Tuskys partners with Amref to provide on groun...,Neutral,2,1
3,@chrissyteigen are u doing ur own grocery shop...,Negative,1,1
4,UK Critical Care Nurse Cries at Empty SuperMar...,Extremely Negative,0,1
5,@ymxr6 Makes my heart ache its the elderly tha...,Extremely Negative,0,1
6,COVID-19 wrecks aluminium prices and input cos...,Neutral,2,1
7,February Home Prices Increased by 4.1 Percent ...,Positive,3,1
8,Want advice on avoiding scams related to #COVI...,Extremely Negative,0,1
9,@dailyecho @BBCWatchdog @BBCNews @dailymail an...,Negative,1,1


### 3.1 Remove special character, URL, retweet

In [524]:
# 2.1 Remove special character, URL, retweet
df['Tweet_change_1'] = df['Tweet'].str.replace("\r", " ")
df['Tweet_change_1'] = df['Tweet_change_1'].str.replace("\n", " ")
df['Tweet_change_1'] = df['Tweet_change_1'].str.replace("\\", " ")
df['Tweet_change_1'] = df['Tweet_change_1'].str.replace("    ", " ")
df['Tweet_change_1'] = df['Tweet_change_1'].str.replace("''","")
df['Tweet_change_1'] = df['Tweet_change_1'].str.replace('http\S+', " ") #Remove URL
df['Tweet_change_1'] = df['Tweet_change_1'].str.replace('@\S+', " ") #Remove retweet
df['Tweet_change_1'] = df['Tweet_change_1'].str.replace('#\S+', " ") #Remove retweet


df['Tweet_change_1'].head()

0        Gaisss! Please read this,and please limit ...
1      Today just after a week of lockdown lot of c...
2    Tuskys partners with Amref to provide on groun...
3      are u doing ur own grocery shopping now like...
4    UK Critical Care Nurse Cries at Empty SuperMar...
Name: Tweet_change_1, dtype: object

### 3.2 Lowercase

In [525]:
# 2.2 Lowercase 
df['Tweet_change_2'] = df['Tweet_change_1'].str.lower()

df['Tweet_change_2'].head()


0        gaisss! please read this,and please limit ...
1      today just after a week of lockdown lot of c...
2    tuskys partners with amref to provide on groun...
3      are u doing ur own grocery shopping now like...
4    uk critical care nurse cries at empty supermar...
Name: Tweet_change_2, dtype: object

### 3.3 Puncation Signs

In [526]:
# 2.3 Puncation signs

df['Tweet_change_3'] = df['Tweet_change_2'].str.replace("?", " ")

df['Tweet_change_3'] = df['Tweet_change_2']

punctuation_signs = list("?.:!,;@#\//\|-\)\(\}\{\[\]\"$")

for punct_sign in punctuation_signs:
    df['Tweet_change_3'] = df['Tweet_change_3'].str.replace(punct_sign, " ")
    df['Tweet_change_3'] = df['Tweet_change_3'].str.replace(punct_sign, " ")


In [527]:
df['Tweet_change_3'].head()

0        gaisss  please read this and please limit ...
1      today just after a week of lockdown lot of c...
2    tuskys partners with amref to provide on groun...
3      are u doing ur own grocery shopping now like...
4    uk critical care nurse cries at empty supermar...
Name: Tweet_change_3, dtype: object

### 3.4 Possessive Pronouns 


In [528]:
# 2.4 Possessive Pronouns 
df['Tweet_change_4'] = df['Tweet_change_3'].str.replace("'s'", " ")

df['Tweet_change_4'].head()


0        gaisss  please read this and please limit ...
1      today just after a week of lockdown lot of c...
2    tuskys partners with amref to provide on groun...
3      are u doing ur own grocery shopping now like...
4    uk critical care nurse cries at empty supermar...
Name: Tweet_change_4, dtype: object

### 3.5 Stop Words

In [529]:
# Downloading the stop words list
nltk.download('stopwords')

# Loading the stop words in english
stop_words = list(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Ellina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [530]:
#2.5 Stopwords
for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    df['Tweet_change_5'] = df['Tweet_change_4'].str.replace(regex_stopword, " ")

df['Tweet_change_5'].head()

0        gaisss  please read this and please limit ...
1      today just after a week of lockdown lot of c...
2    tuskys partners with amref to provide on groun...
3      are u doing ur own grocery shopping now like...
4    uk critical care nurse cries at empty supermar...
Name: Tweet_change_5, dtype: object

### 3.6 Lemmatizer


In [531]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Ellina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [532]:
#Reference: https://www.cnblogs.com/jclian91/p/9898511.html
# 2.5 Lemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

wnl = WordNetLemmatizer()
re_list_train = []
aaa = []

for sentence in df['Tweet_change_5']:
    lemmatized_text_list = []
    tokens = word_tokenize(sentence)   
    tagged_sent = pos_tag(tokens)     
    for tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        lemmatized_text_list.append(wnl.lemmatize(tag[0], pos=wordnet_pos)) 
        
    re_string = ""
    aaa += lemmatized_text_list
    for i in lemmatized_text_list:
        re_string = re_string + i + " "
    re_list_train.append(re_string)
   



In [533]:
df['Tweet_change_6'] = re_list_train
df['Tweet_change_6'].head()
df['Tweet_change_6']

0        gaisss please read this and please limit yours...
1        today just after a week of lockdown lot of con...
2        tuskys partner with amref to provide on ground...
3        be u do ur own grocery shopping now like a reg...
4        uk critical care nurse cry at empty supermarke...
                               ...                        
37036    minnesota classifies grocery store worker a em...
37037    u senator have ask for information about a the...
37038    just comment on poll be you do more online sho...
37039    my wife get lay off yesterday because the smal...
37040                                    humanity be doom 
Name: Tweet_change_6, Length: 37041, dtype: object

In [534]:
def preprocess(text):
    result = [i for i in text.split(' ') if (len(i)>1) and i not in stopwords]
    result = [lemmatizer.lemmatize(i) for i in result]
    words_count.update(result)
    return " ".join(result)

In [535]:
df

Unnamed: 0,Tweet,Sentiment,Label,id,Tweet_change_1,Tweet_change_2,Tweet_change_3,Tweet_change_4,Tweet_change_5,Tweet_change_6
0,https://t.co/UpjxfOgQs8\r\r\n\r\r\nGaisss! Ple...,Extremely Positive,4,1,"Gaisss! Please read this,and please limit ...","gaisss! please read this,and please limit ...",gaisss please read this and please limit ...,gaisss please read this and please limit ...,gaisss please read this and please limit ...,gaisss please read this and please limit yours...
1,@mygovindia Today just after a week of lockdow...,Negative,1,1,Today just after a week of lockdown lot of c...,today just after a week of lockdown lot of c...,today just after a week of lockdown lot of c...,today just after a week of lockdown lot of c...,today just after a week of lockdown lot of c...,today just after a week of lockdown lot of con...
2,Tuskys partners with Amref to provide on groun...,Neutral,2,1,Tuskys partners with Amref to provide on groun...,tuskys partners with amref to provide on groun...,tuskys partners with amref to provide on groun...,tuskys partners with amref to provide on groun...,tuskys partners with amref to provide on groun...,tuskys partner with amref to provide on ground...
3,@chrissyteigen are u doing ur own grocery shop...,Negative,1,1,are u doing ur own grocery shopping now like...,are u doing ur own grocery shopping now like...,are u doing ur own grocery shopping now like...,are u doing ur own grocery shopping now like...,are u doing ur own grocery shopping now like...,be u do ur own grocery shopping now like a reg...
4,UK Critical Care Nurse Cries at Empty SuperMar...,Extremely Negative,0,1,UK Critical Care Nurse Cries at Empty SuperMar...,uk critical care nurse cries at empty supermar...,uk critical care nurse cries at empty supermar...,uk critical care nurse cries at empty supermar...,uk critical care nurse cries at empty supermar...,uk critical care nurse cry at empty supermarke...
...,...,...,...,...,...,...,...,...,...,...
37036,Minnesota classifies grocery store workers as ...,Negative,1,1,Minnesota classifies grocery store workers as ...,minnesota classifies grocery store workers as ...,minnesota classifies grocery store workers as ...,minnesota classifies grocery store workers as ...,minnesota classifies grocery store workers as ...,minnesota classifies grocery store worker a em...
37037,US Senator @ewarren has asked for information ...,Negative,1,1,US Senator has asked for information about ...,us senator has asked for information about ...,us senator has asked for information about ...,us senator has asked for information about ...,us senator has asked for information about ...,u senator have ask for information about a the...
37038,Just commented on @thejournal_ie: Poll: Are yo...,Extremely Negative,0,1,Just commented on Poll: Are you doing more o...,just commented on poll: are you doing more o...,just commented on poll are you doing more o...,just commented on poll are you doing more o...,just commented on poll are you doing more o...,just comment on poll be you do more online sho...
37039,My wife got laid off yesterday because the sma...,Neutral,2,1,My wife got laid off yesterday because the sma...,my wife got laid off yesterday because the sma...,my wife got laid off yesterday because the sma...,my wife got laid off yesterday because the sma...,my wife got laid off yesterday because the sma...,my wife get lay off yesterday because the smal...


In [536]:
selected_columns = ['Tweet_change_6','Label']
df = df[selected_columns]
df = df.rename(columns={'Tweet_change_6':'Tweet_Clean'})


In [537]:
df

Unnamed: 0,Tweet_Clean,Label
0,gaisss please read this and please limit yours...,4
1,today just after a week of lockdown lot of con...,1
2,tuskys partner with amref to provide on ground...,2
3,be u do ur own grocery shopping now like a reg...,1
4,uk critical care nurse cry at empty supermarke...,0
...,...,...
37036,minnesota classifies grocery store worker a em...,1
37037,u senator have ask for information about a the...,1
37038,just comment on poll be you do more online sho...,0
37039,my wife get lay off yesterday because the smal...,2


# 4. Split Train + Test

In [538]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [539]:
y = df['Label']
X = df['Tweet_Clean']

#Split Training & Testing Set
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

#Split Training Set again into Training & Validation Set
#Xtrain, Xval, ytrain, yval = train_test_split(Xtrain, ytrain, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2


print(Xtrain.shape, ytrain.shape)
#print(Xval.shape, yval.shape)
print(Xtest.shape, ytest.shape)

(29632,) (29632,)
(7409,) (7409,)


In [540]:
from string import punctuation
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

# 4. Word Embedding + CNN

### 4.1 Word Embedding with Keras

In [541]:
#Reference: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
# create the tokenizer
tokenizer = Tokenizer(num_words = 10000, split=" ")

# fit the tokenizer on the documents
tokenizer.fit_on_texts(Xtrain)

# texts_to_sequences (convert text to sequence)
seq_Xtrain = tokenizer.texts_to_sequences(Xtrain)
seq_Xtest = tokenizer.texts_to_sequences(Xtest)

In [542]:
vocab_count = len(tokenizer.word_index) + 1
print("There are %d different vocabularies in total after tokenization"%vocab_count)


There are 24859 different vocabularies in total after tokenization


In [543]:
# Maximun words
max_length = max([len(str(s).split()) for s in Xtrain])
print('The maximun length of comment has %d words'%max_length)


The maximun length of comment has 66 words


In [544]:
#pad sequences
pad_Xtrain = pad_sequences(seq_Xtrain, maxlen=max_length, padding='post')
pad_Xtest = pad_sequences(seq_Xtest, maxlen=max_length, padding='post')

print('pad sequence of Xtrain:', pad_Xtrain.shape)
print('pad sequence of Xtest:',pad_Xtest.shape)

pad sequence of Xtrain: (29632, 66)
pad sequence of Xtest: (7409, 66)


### 4.2 CNN

In [545]:
#Reference: https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/
# define model
model = Sequential([])
model.add(Embedding(vocab_count, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(5, activation='softmax'))


In [546]:
print(model.summary())

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 66, 100)           2485900   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 59, 32)            25632     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 29, 32)            0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 928)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 10)                9290      
_________________________________________________________________
dense_17 (Dense)             (None, 5)                 55        
Total params: 2,520,877
Trainable params: 2,520,877
Non-trainable params: 0
____________________________________________

### 4.3 CNN compile and fit

In [559]:
import tensorflow as tf
from tensorflow import keras
# compile network
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])


In [560]:
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss',mode='min')

In [561]:
# fit network
model.fit(pad_Xtrain, ytrain, epochs=5, validation_split=0.2, batch_size=32, callbacks=[early_stop])



Epoch 1/5
Epoch 2/5
Epoch 3/5


<tensorflow.python.keras.callbacks.History at 0x19a796220>

In [562]:
loss, acc = model.evaluate(pad_Xtest, ytest, verbose=0)
print('The accuracy of CNN is %.3f'%(acc))

The accuracy of CNN is 0.736


# 5. Bag-of-words method with MultiNB

In [486]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

### 5.1 Word Count

In [487]:
#CountVectorizer
count_v = CountVectorizer(min_df = 5, binary = True)
word_count_matrix = count_v.fit_transform(df['Tweet_Clean'])
word_count_matrix

<37041x7121 sparse matrix of type '<class 'numpy.int64'>'
	with 838992 stored elements in Compressed Sparse Row format>

In [488]:
count_list = word_count_matrix.toarray().sum(axis=0)
word_list = count_v.get_feature_names()

### 5.2 TF-IDF

In [489]:
#TF-IDF (convert text to matrix)
tf_idf = TfidfTransformer()
tf_idf.fit(word_count_matrix)
tf_idf.idf_

array([7.11308844, 5.51839281, 9.72804821, ..., 8.18760317, 9.72804821,
       8.26171114])

In [490]:
tf_idf.idf_.shape

(7121,)

In [491]:
tf_idf_vector = tf_idf.transform(word_count_matrix)
tf_idf_vector

<37041x7121 sparse matrix of type '<class 'numpy.float64'>'
	with 838992 stored elements in Compressed Sparse Row format>

In [492]:
pipe = Pipeline([
    ('vector', CountVectorizer()), 
    ('tfidf', TfidfTransformer()), 
    ('mulNB', MultinomialNB())
])

In [493]:
pipe.fit(Xtrain, ytrain)

Pipeline(steps=[('vector', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('mulNB', MultinomialNB())])

In [494]:
pipe.score(Xtest, ytest)

0.3866918612498313

### 5.3 GridSearchCV + Multi-NB

In [495]:
#Reference: https://www.kaggle.com/tonypeng1/tf-idf-with-multinomial-nb-and-cross-validation/comments
#Using GridSearchCV
from time import time
parameters = {
    'mulNB__alpha': [1, 0.7, 0.4, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.03, 0.01] 
}

grid = GridSearchCV(pipe, param_grid=parameters, cv=10, refit=True)
t0 = time()
grid.fit(Xtrain, ytrain)
print("done in %0.3fs" % (time() - t0))

done in 308.779s


In [496]:

print(grid.best_score_)
print(grid.best_params_)

0.44556585881745303
{'mulNB__alpha': 0.06}


In [497]:
score = grid.score(Xtest, ytest)
print('The accuracy of MultiNB with TF-IDF is %.03f'%(score))

The accuracy of MultiNB with TF-IDF is 0.457
