# Naive Bayes Model

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# Test Naive Bayes with raw data

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

#load the test and training data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

def test_model(train, test):
    #We will use the CountVectorizer function to vectorize and count our data
    vector = CountVectorizer()
    X = vector.fit_transform(train.text)
    Y = train.target.values
    P = vector.transform(test.text)

    # Now we split the data using 25% test and 75% train 
    # We use a state of 101 so outputs are consistent after we fine tune
    xtrain,xtest,ytrain,ytest = train_test_split(X,Y,test_size=0.25,train_size=0.75,random_state=101)

    #We plug our sets into the models and print the results
    mnb = MultinomialNB()
    model_mnb = mnb.fit(xtrain,ytrain)
    ts_pred_mnb = model_mnb.predict(xtest)
    print("Classification report for xtest data:-\n\n",classification_report(ytest,ts_pred_mnb),"\n")
    
test_model(train_df, test_df)


Classification report for xtest data:-

               precision    recall  f1-score   support

           0       0.80      0.86      0.83      1112
           1       0.78      0.70      0.74       792

    accuracy                           0.79      1904
   macro avg       0.79      0.78      0.78      1904
weighted avg       0.79      0.79      0.79      1904
 



These results are our baseline. Now lets fine tune and see if we can get better results!

# Finetuning:

First lets try taking out stop words and see if this increases the performance of our model

In [16]:
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english","vis"))

def remove_stop_words(text):
    words = text.split()  # Tokenize the text
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

# Apply the function to the "text" column and save as a new df
train_df_1 = train_df.copy()
train_df_1["text"] = train_df["text"].apply(remove_stop_words)
print(train_df["text"].head())


0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object


In [17]:
test_model(train_df_1, test_df)

Classification report for xtest data:-

               precision    recall  f1-score   support

           0       0.81      0.84      0.82      1112
           1       0.76      0.72      0.74       792

    accuracy                           0.79      1904
   macro avg       0.79      0.78      0.78      1904
weighted avg       0.79      0.79      0.79      1904
 



Our end goal is to identify tweets that relate to disasters for emergency responders. Therefore, with our model we should aim to have high percision. Recall is less important because it would require knowing whether or not the tweet related to a disaster after the fact which is useless to first responders. So clearly removing stop words did not optimize the data for our model.

In [27]:
from collections import Counter

all_text = " ".join(train_df_1[train_df_1['target']==1]['text'])

# Tokenize the text into words
words = all_text.split()

# Count the occurrences of each word
word_counts = Counter(words)

# Get the top 20 most common words
top_20_words = word_counts.most_common(20)

# Print the results
for word, count in top_20_words:
    print(f"{word}: {count}")

-: 389
via: 115
fire: 108
...: 106
&amp;: 105
California: 86
killed: 86
like: 85
people: 83
suicide: 71
2: 67
Hiroshima: 59
disaster: 59
Northern: 58
bombing: 56
bomber: 56
crash: 55
bomb: 55
families: 54
fires: 53


In [23]:
def count_substring_by_target(dataframe, substring):
    # Filter the DataFrame based on target values
    target_1 = dataframe[dataframe["target"] == 1]
    target_0 = dataframe[dataframe["target"] == 0]

    # Count occurrences of the substring in the "text" column
    count_target_1 = target_1["text"].str.count(substring).sum()
    count_target_0 = target_0["text"].str.count(substring).sum()

    print(f" with target 1: {count_target_1}")
    print(f" with target 0: {count_target_0}")
    
count_substring_by_target(train_df, "arsonist")


 with target 1: 3
 with target 0: 10
