In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
df= pd.read_csv('disaster_tweets_data(DS).csv')
df.head()

Unnamed: 0,tweets,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [32]:
df.isnull() #checking null values

Unnamed: 0,tweets,target
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
7608,False,False
7609,False,False
7610,False,False
7611,False,False


In [33]:
df.dtypes #checking types

tweets    object
target     int64
dtype: object

In [34]:
df.duplicated() # checking duplicacy

0       False
1       False
2       False
3       False
4       False
        ...  
7608    False
7609     True
7610     True
7611     True
7612    False
Length: 7613, dtype: bool

In [35]:
# Preprocessing the text data
def preprocess_text(text):
    text = text.lower() # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    words = text.split()  # Tokenize words
    # Remove stop words (you can use NLTK or a predefined list)
    stop_words = set(['the', 'and', 'is', 'in', 'to', 'of'])  # Add more as needed
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)


In [38]:
# Apply preprocessing to the 'tweets' column
df['cleaned_tweets'] = df['tweets'].apply(preprocess_text)
df.head()

Unnamed: 0,tweets,target,cleaned_tweets
0,Our Deeds are the Reason of this #earthquake M...,1,our deeds are reason this earthquake may allah...
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,all residents asked shelter place are being no...
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...
4,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [39]:
df.drop(columns=['tweets'],inplace = True)
df.head()

Unnamed: 0,target,cleaned_tweets
0,1,our deeds are reason this earthquake may allah...
1,1,forest fire near la ronge sask canada
2,1,all residents asked shelter place are being no...
3,1,13000 people receive wildfires evacuation orde...
4,1,just got sent this photo from ruby alaska as s...


In [42]:
#vectorize the text data
cv=CountVectorizer()
x= cv.fit_transform(df['cleaned_tweets'])
y= df['target']

In [43]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.30, random_state= 42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(5329, 22661)
(2284, 22661)
(5329,)
(2284,)


In [44]:
#Implementing Naive Bayes
from sklearn.naive_bayes import MultinomialNB
m1= MultinomialNB()
m1.fit(x_train, y_train)


In [45]:
#Accuracy
print('Train score', m1.score(x_train,y_train))
print('Test score', m1.score(x_test,y_test))

Train score 0.9191217864514918
Test score 0.7990367775831874


In [46]:
ypred_m1 = m1.predict(x_test)
print(ypred_m1)

[0 0 0 ... 1 1 1]


In [47]:
from sklearn.metrics import confusion_matrix, classification_report

In [48]:
print(confusion_matrix(y_test,ypred_m1)) #ypred_m1 instead of y_pred
print(classification_report(y_test,ypred_m1))
#caluclating the accuracy
accuracy_model1 = m1.score(x_test,y_test)
accuracy_model1

[[1141  177]
 [ 282  684]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1318
           1       0.79      0.71      0.75       966

    accuracy                           0.80      2284
   macro avg       0.80      0.79      0.79      2284
weighted avg       0.80      0.80      0.80      2284



0.7990367775831874

In [49]:
#Implementing Logistic Regression
from sklearn.linear_model import LogisticRegression
m2= LogisticRegression(solver="liblinear")
m2.fit(x_train,y_train)
#accuracy
print('Train_score',m2.score(x_train,y_train))
print('Test_score',m2.score(x_test,y_test))

Train_score 0.9771063989491462
Test_score 0.808231173380035


In [50]:
ypred_m2= m2.predict(x_test)
print(ypred_m2)

[0 0 0 ... 1 1 0]


In [51]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_m2 = confusion_matrix(y_test,ypred_m2)
print(confusion_m2)
print(classification_report(y_test,ypred_m2))
#caluclating the accuracy
accuracy_model2 = m2.score(x_test,y_test)
accuracy_model2

[[1160  158]
 [ 280  686]]
              precision    recall  f1-score   support

           0       0.81      0.88      0.84      1318
           1       0.81      0.71      0.76       966

    accuracy                           0.81      2284
   macro avg       0.81      0.80      0.80      2284
weighted avg       0.81      0.81      0.81      2284



0.808231173380035

In [52]:
# Implementing KNN 
from sklearn.neighbors import KNeighborsClassifier

m3 = KNeighborsClassifier(n_neighbors=85)
m3.fit(x_train,y_train)
#accuracy
print('Train_score',m3.score(x_train,y_train))
print('Test_score',m3.score(x_test,y_test))

Train_score 0.5674610621129668
Test_score 0.5774956217162872


In [53]:
ypred_m3 = m3.predict(x_test)
print(ypred_m3)

[0 0 0 ... 0 0 0]


In [54]:
confusion_m3 = confusion_matrix(y_test,ypred_m3)
print(confusion_m3)
print(classification_report(y_test,ypred_m3))
#caluclating the accuracy
accuracy_model3 = m3.score(x_test,y_test)
accuracy_model3

[[1318    0]
 [ 965    1]]
              precision    recall  f1-score   support

           0       0.58      1.00      0.73      1318
           1       1.00      0.00      0.00       966

    accuracy                           0.58      2284
   macro avg       0.79      0.50      0.37      2284
weighted avg       0.76      0.58      0.42      2284



0.5774956217162872

In [55]:
# Reporting the best acurracy
best_model = max(accuracy_model1,accuracy_model2,accuracy_model3)
if (best_model == accuracy_model1):
    print("MNB Classification has the best accuracy with the given dataset", m1.score(x_test,y_test) * 100, "%")
elif (best_model == accuracy_model2):
    print("Logistic Regression has the best accuracy with the given dataset", m2.score(x_test,y_test) * 100, "%")
elif (best_model == accuracy_model3):
    print("KNN Classifer has the best accuracy with the given dataset", m3.score(x_test,y_test) * 100, "%")

Logistic Regression has the best accuracy with the given dataset 80.8231173380035 %
