# Bernoulli Method 

In [32]:
import numpy as np
import pandas as pd 
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

print("Important libraries loaded successfully")

Important libraries loaded successfully


In [33]:
file_path = ("Resources/test.csv")
test_df = pd.read_csv(file_path)
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [34]:
file_path = ("Resources/train.csv") 
train_df = pd.read_csv(file_path)
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


# Total Missing Data 

In [35]:
total = train_df.isnull().sum().sort_values(ascending = False)

percent = (train_df.isnull().sum()/train_df.isnull().count()).sort_values(ascending = False)

print("Missing Train Data Table")
missing_data = pd.concat([total, percent], axis = 1, keys=['Total', 'Percent'])
missing_data



Missing Train Data Table


Unnamed: 0,Total,Percent
location,2533,0.33272
keyword,61,0.008013
id,0,0.0
text,0,0.0
target,0,0.0


In [36]:
total = test_df.isnull().sum().sort_values(ascending = False)

percent = (test_df.isnull().sum()/test_df.isnull().count()).sort_values(ascending = False)

print("Missing Test Data Table")
missing_data = pd.concat([total, percent], axis = 1, keys=['Total', 'Percent'])
missing_data

Missing Test Data Table


Unnamed: 0,Total,Percent
location,1105,0.338645
keyword,26,0.007968
id,0,0.0
text,0,0.0


In [37]:
train_dropped_df = train_df.drop(["location", "keyword", "id"], axis = 1)
print("location, id, and keyword columns droped successfully")
train_dropped_df

location, id, and keyword columns droped successfully


Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [38]:
test_dropped_df = test_df.drop(["location", "keyword", "id"], axis = 1)
print("location, id, and keyword columns droped successfully")
test_dropped_df

location, id, and keyword columns droped successfully


Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


In [80]:
def clean_tweets(df):
    corpus  = []
    pstem = PorterStemmer()
    for i in range(df.shape[0]):
        #Remove unwanted words
        tweet = re.sub("[^a-zA-Z]", ' ', df[i])
        #Transform words to lowercase
        tweet = tweet.lower()
        tweet = tweet.split()
        #Remove stopwords then Stemming it
        tweet = [pstem.stem(word) for word in tweet if not word in set(stopwords.words('english'))]
        tweet = ' '.join(tweet)
        #Append cleaned tweet to corpus
        corpus.append(tweet)
    return corpus
print("Corpus created successfully")

Corpus created successfully


In [81]:
# Clean Training Data 
train_tweet = clean_tweets(train_dropped_df['text'])
train_tweet = pd.DataFrame(train_tweet)
train_tweet

Unnamed: 0,0
0,deed reason earthquak may allah forgiv us
1,forest fire near la rong sask canada
2,resid ask shelter place notifi offic evacu she...
3,peopl receiv wildfir evacu order california
4,got sent photo rubi alaska smoke wildfir pour ...
...,...
7608,two giant crane hold bridg collaps nearbi home...
7609,aria ahrari thetawniest control wild fire cali...
7610,utc km volcano hawaii http co zdtoyd ebj
7611,polic investig e bike collid car littl portug ...


In [55]:
# Append cleaned tweets to the trained data 
train_dropped_df['clean_tweet'] = train_tweet

# Compare the cleaned and uncleaned tweets 
train_dropped_df.head()

Unnamed: 0,text,target,clean_tweet
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada
2,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...
3,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...


In [50]:
# Clean Testing Data 
test_tweet = clean_tweets(test_dropped_df['text'])
test_tweet = pd.DataFrame(train_tweet)
test_tweet

Unnamed: 0,0
0,deed reason earthquak may allah forgiv us
1,forest fire near la rong sask canada
2,resid ask shelter place notifi offic evacu she...
3,peopl receiv wildfir evacu order california
4,got sent photo rubi alaska smoke wildfir pour ...
...,...
7608,two giant crane hold bridg collaps nearbi home...
7609,aria ahrari thetawniest control wild fire cali...
7610,utc km volcano hawaii http co zdtoyd ebj
7611,polic investig e bike collid car littl portug ...


In [56]:
# Append cleaned tweets to the trained data 
test_dropped_df['clean_tweet'] = test_tweet

# Compare the cleaned and uncleaned tweets 
test_dropped_df.head()

Unnamed: 0,text,clean_tweet
0,Just happened a terrible car crash,deed reason earthquak may allah forgiv us
1,"Heard about #earthquake is different cities, s...",forest fire near la rong sask canada
2,"there is a forest fire at spot pond, geese are...",resid ask shelter place notifi offic evacu she...
3,Apocalypse lighting. #Spokane #wildfires,peopl receiv wildfir evacu order california
4,Typhoon Soudelor kills 28 in China and Taiwan,got sent photo rubi alaska smoke wildfir pour ...


In [58]:
#Create our dictionary 
def unique_words(df):
    uniqueWordFrequents = {}
    for tweet in df:
        for word in tweet.split():
            if(word in uniqueWordFrequents.keys()):
                uniqueWordFrequents[word] += 1
            else:
                uniqueWordFrequents[word] = 1

    #Convert dictionary to dataFrame
    uniqueWordFrequents = pd.DataFrame.from_dict(uniqueWordFrequents,orient='index',columns=['Word Frequent'])
    uniqueWordFrequents.sort_values(by=['Word Frequent'], inplace=True, ascending=False)
    uniqueWordFrequents.head(10)
    return uniqueWordFrequents

In [62]:
# Unique words for Training Data 
train_unique = unique_words(train_dropped_df['clean_tweet'])
train_unique = pd.DataFrame(train_unique)
train_unique.head(10)

Unnamed: 0,Word Frequent
co,4746
http,4721
like,411
fire,363
amp,344
get,311
bomb,239
new,228
via,220
u,216


In [63]:
# Unique words for Testing Data 
test_unique = unique_words(test_dropped_df['clean_tweet'])
test_unique = pd.DataFrame(test_unique)
test_unique.head(10)

Unnamed: 0,Word Frequent
co,1960
http,1954
like,176
amp,172
burn,161
get,146
bomb,139
emerg,137
build,130
fire,120


In [64]:
# Word frequency for training data
train_unique["Word Frequent"].unique( )


array([4746, 4721,  411,  363,  344,  311,  239,  228,  220,  216,  213,
        210,  209,  201,  183,  181,  180,  178,  175,  169,  166,  164,
        162,  156,  155,  153,  151,  145,  144,  143,  137,  133,  132,
        131,  130,  129,  128,  125,  124,  123,  122,  121,  120,  119,
        118,  117,  116,  114,  111,  110,  109,  108,  106,  105,  104,
        103,  102,  101,  100,   99,   98,   97,   96,   95,   94,   93,
         91,   90,   89,   88,   87,   86,   84,   83,   82,   79,   78,
         77,   76,   75,   74,   73,   72,   71,   70,   69,   68,   67,
         66,   65,   64,   63,   62,   61,   60,   59,   58,   57,   56,
         55,   54,   53,   52,   51,   50,   49,   48,   47,   46,   45,
         44,   43,   42,   41,   40,   39,   38,   37,   36,   35,   34,
         33,   32,   31,   30,   29,   28,   27,   26,   25,   24,   23,
         22,   21,   20,   19,   18,   17,   16,   15,   14,   13,   12,
         11,   10,    9,    8,    7,    6,    5,   

In [65]:
# Word frequency for testing data
test_unique["Word Frequent"].unique( )


array([1960, 1954,  176,  172,  161,  146,  139,  137,  130,  120,  119,
        115,  112,  110,  109,  108,   99,   97,   95,   94,   93,   91,
         89,   88,   86,   85,   84,   77,   72,   71,   70,   69,   67,
         66,   65,   64,   63,   62,   61,   60,   59,   58,   57,   56,
         55,   54,   53,   52,   51,   49,   48,   47,   46,   45,   44,
         43,   42,   41,   40,   39,   38,   37,   36,   35,   34,   33,
         32,   31,   30,   29,   28,   27,   26,   25,   24,   23,   22,
         21,   20,   19,   18,   17,   16,   15,   14,   13,   12,   11,
         10,    9,    8,    7,    6,    5,    4,    3,    2,    1])

In [None]:
# We will get only words that repeated more than or equal 20 once.

In [66]:
#Unique words for training data with words that repeated more than or equal 20 once.
train_unique = train_unique[train_unique['Word Frequent'] >= 20]
print(train_unique.shape)
train_unique


(787, 1)


Unnamed: 0,Word Frequent
co,4746
http,4721
like,411
fire,363
amp,344
...,...
cnn,20
gem,20
captur,20
arriv,20


In [67]:
#Unique words for testing data with words that repeated more than or equal 20 once.
test_unique = test_unique[test_unique['Word Frequent'] >= 20]
print(test_unique.shape)
test_unique


(287, 1)


Unnamed: 0,Word Frequent
co,1960
http,1954
like,176
amp,172
burn,161
...,...
rise,20
confirm,20
mishap,20
believ,20


In [86]:
counVec = CountVectorizer(max_features = train_unique.shape[0])
bagOfWords = counVec.fit_transform(train_dropped_df["clean_tweet"]).toarray()

In [88]:
X = bagOfWords
y = train_df['target']

print("X shape = ",X.shape)
print("y shape = ",y.shape)



X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.20, random_state=55, shuffle =True)
print('data splitting successfully')

X shape =  (7613, 787)
y shape =  (7613,)
data splitting successfully


In [89]:
bernoulliNBModel = BernoulliNB(alpha=0.1)
bernoulliNBModel.fit(X_train,y_train)

print("bernoulliNB model run successfully")

bernoulliNB model run successfully


In [91]:
gaussianNBModel = GaussianNB()
gaussianNBModel.fit(X_train,y_train)

print("gaussianNB model run successfully")

gaussianNB model run successfully


In [92]:
multinomialNBModel = MultinomialNB(alpha=0.1)
multinomialNBModel.fit(X_train,y_train)

print("multinomialNB model run successfully")

multinomialNB model run successfully


In [93]:
#evaluation Details
models = [ bernoulliNBModel, gaussianNBModel, multinomialNBModel]

for model in models:
    print(type(model).__name__,' Train Score is   : ' ,model.score(X_train, y_train))
    print(type(model).__name__,' Test Score is    : ' ,model.score(X_test, y_test))
    
    y_pred = model.predict(X_test)
    print(type(model).__name__,' F1 Score is      : ' ,f1_score(y_test,y_pred))
    print('--------------------------------------------------------------------------')


BernoulliNB  Train Score is   :  0.8091954022988506
BernoulliNB  Test Score is    :  0.7774130006565988
BernoulliNB  F1 Score is      :  0.7129551227773073
--------------------------------------------------------------------------
GaussianNB  Train Score is   :  0.7893267651888342
GaussianNB  Test Score is    :  0.7669074195666448
GaussianNB  F1 Score is      :  0.6728110599078342
--------------------------------------------------------------------------
MultinomialNB  Train Score is   :  0.8022988505747126
MultinomialNB  Test Score is    :  0.7734734077478661
MultinomialNB  F1 Score is      :  0.7165160230073953
--------------------------------------------------------------------------
