# Disaster Tweets Prediction

## Preparation

Import libraries

In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
# Read both CSV splits from the local ``datasets/`` directory.
# The training frame carries the ``target`` label column; the test frame is
# only used for inference further down.
training_set = pd.read_csv(filepath_or_buffer="datasets/train.csv")
testing_set = pd.read_csv(filepath_or_buffer="datasets/test.csv")

In [3]:
# Peek at the first five rows of the raw training data.
training_set.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Print column dtypes and non-null counts for the training frame.
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
# Missing-value count per column (`isna` is the same method as `isnull`).
training_set.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
# Class balance: how many training rows fall under each `target` label.
class_dist = training_set.groupby("target").size()
# Loop-invariant total, computed once instead of on every iteration.
total_samples = class_dist.sum()

# `Series.items()` is used because `Series.iteritems()` was removed in
# pandas 2.0 and raises AttributeError there.
for index, val in class_dist.items():
    percentage = val / total_samples * 100
    print(f"Class {index} : {val} samples ({percentage:.2f}%)")

Class 0 : 4342 samples (57.03%)
Class 1 : 3271 samples (42.97%)


In [7]:
def preprocessor(text):
    """Normalise raw text into lowercase, space-separated word tokens.

    Steps:
      1. Remove anything that looks like a markup tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` before
         punctuation is stripped, so they are not lost.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` "nose" removed),
         separated from the last word by a space.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned string. Leading/trailing spaces produced by step 3 are
        kept as-is.
    """
    # Raw strings keep `\)`, `\(` and `\W` as regex escapes instead of
    # invalid Python string escapes (which trigger warnings on modern Python).
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # Without this separator an emoticon would be glued to the final word
    # whenever the cleaned text ends in a word character ("nice work:)").
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

In [8]:
# Replace the raw `text` column with its cleaned version, element by element.
# This overwrites the column in place; re-run the loading cell to get the raw
# text back.
training_set["text"] = training_set["text"].map(preprocessor)

In [9]:
# Inspect the first five rows again, now with the cleaned `text` column.
training_set.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13 000 people receive wildfires evacuation ord...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


In [10]:
# Model input is the tweet text; the label to predict is `target`.
X_train, y_train = training_set["text"], training_set["target"]

In [11]:
# TF-IDF feature extractor. Lowercasing is switched off here because the text
# is expected to arrive already lowercased by `preprocessor`.
vectorizer = TfidfVectorizer(
    preprocessor=None,
    lowercase=False,
    strip_accents=None,
)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier

# Candidate classifiers as (display label, pipeline step name, estimator).
# The "scaled" prefix in some labels is only a name: no scaling step is part
# of these pipelines, each one is just TF-IDF followed by the classifier.
candidates = [
    ('XGBoost', 'XGB', XGBClassifier()),
    ('scaledLR', 'LR', LogisticRegression()),
    ('scaledKNN', 'KNN', KNeighborsClassifier()),
    ('scaledDT', 'DT', DecisionTreeClassifier()),
    ('scaledSVC', 'SVC', SVC()),
]
pipelines = [
    (label, Pipeline([('vectorizer', vectorizer), (step, estimator)]))
    for label, step, estimator in candidates
]

# Compare the candidates with 10-fold cross-validation, scored by F1.
# The splitter is created once and reused for every candidate.
kfold = KFold(n_splits=10)
model_name = []
results = []
for label, pipeline in pipelines:
    fold_scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
    results.append(fold_scores)
    model_name.append(label)
    # Printed as "<label>: <mean F1> (<std of F1 across folds>)".
    print("%s: %f (%f)" % (label, fold_scores.mean(), fold_scores.std()))







































XGBoost: 0.609926 (0.032419)
scaledLR: 0.696436 (0.042524)
scaledKNN: 0.645220 (0.035777)
scaledDT: 0.555770 (0.026589)
scaledSVC: 0.694301 (0.036725)


In [13]:
# Final model: TF-IDF features fed into a support-vector classifier with
# default hyper-parameters.
clf_method = SVC()
clf = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', clf_method)])

In [14]:
# Fit the vectorizer and the classifier on the full training set.
clf.fit(X=X_train, y=y_train)

In [16]:
# Predict labels for the same rows the model was fitted on.
y_train_predicted = clf.predict(X=X_train)

In [23]:
# NOTE: this is accuracy on the rows the model was fitted on, so it is an
# optimistic figure and not an estimate of performance on unseen tweets.
training_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_predicted)
print(f'Accuracy on training set: {training_accuracy:.3f}')


Accuracy on training set: 0.967


In [25]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the training-set predictions.
train_report = classification_report(y_true=y_train, y_pred=y_train_predicted)
print(train_report)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      4342
           1       0.99      0.93      0.96      3271

    accuracy                           0.97      7613
   macro avg       0.97      0.96      0.97      7613
weighted avg       0.97      0.97      0.97      7613



In [18]:
# Clean the test tweets with the same `preprocessor` that was applied to the
# training text. The vectorizer is configured with lowercase=False and was
# fitted on lowercased text, so raw mixed-case test text would yield tokens
# that are missing from the fitted vocabulary.
X_test = testing_set["text"].apply(preprocessor)

In [19]:
# Predict the label for every test tweet with the fitted pipeline.
y_test_predicted = clf.predict(X=X_test)

In [20]:
# Pieces of the submission file: the test-row ids and the predicted labels.
# NOTE(review): `id` shadows the Python builtin of the same name; it is kept
# because the following cell refers to it.
id = testing_set.loc[:, "id"].copy()
target = pd.DataFrame({"target": y_test_predicted})

In [21]:
# Join ids and predictions row-by-row on their shared index, giving a
# two-column frame (`id`, `target`).
submission = id.to_frame().merge(target, left_index=True, right_index=True)
submission

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [22]:
# Write the submission without the DataFrame index column.
submission.to_csv(path_or_buf="datasets/submission.csv", index=False)