In [48]:
# Import libraries
import pandas as pd
import numpy as np
import nltk

# Machine learning relevant libraries
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedShuffleSplit,GroupShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [49]:
# Reading in the compiled csv of all different tweets
df = pd.read_csv('./data/all_disaster_tweets.csv')

In [50]:
# Show the data
df.head()

Unnamed: 0,tweet text,disaster type
0,HOY es nuestro dia @Maria_21_music #nerea #an...,99.0
1,Yeahh yeahh esa jenga @alejandroalija @Telarer...,99.0
2,a Mãe do meu amigo #Pablo. morreu :( ta muito ...,99.0
3,Madrinas y apadrinados #SantaCecilia2012 @Carm...,99.0
4,11-26-12 | E-Games Attendant | Chada Bingo Hou...,99.0


In [51]:
# Check for nulls
df.isnull().sum()

tweet text        0
disaster type    10
dtype: int64

In [52]:
# Dropping NaNs
df.dropna(inplace=True)

In [53]:
# Turn all the tweets into lowercase before vectorization
df['tweet text'] = df['tweet text'].apply(lambda x: x.lower())

In [54]:
# Creating a column identifying floods and no floods to create a one-class classifier
df['flood'] = df['disaster type'].apply(lambda x: 1 if x == 4 else 0)

In [55]:
# Creating the X and y lists before splitting
X = df['tweet text']
y = df['flood']

In [56]:
# Train test split for single classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [57]:
# Pipeline for single classifier
pipe1 = Pipeline([
    
    ('cvec',CountVectorizer()),
    ('lr',LogisticRegression(penalty = 'l2', solver='liblinear', random_state = 42))
    
])

pipe1_params = {
    
    'cvec__max_features': [2000],
    'cvec__lowercase': [True],
    'cvec__stop_words': ['english'],
    'cvec__ngram_range': [(1, 1),(1, 2)],
    
}

In [58]:
# Grid searching for single classifier
gs1 = GridSearchCV(pipe1,
                  pipe1_params,
                  cv=3,
                  return_train_score=True) 

In [59]:
# Fit the model
gs1.fit(X_train, y_train);

In [60]:
# Find the best parameters
gs1.best_params_

{'cvec__lowercase': True,
 'cvec__max_features': 2000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [61]:
# Score the training data
print(f'CVEC/LR Training Score: {gs1.score(X_train, y_train)}')

CVEC/LR Training Score: 0.9874934364408803


In [62]:
# Score the testing data
print(f'CVEC/LR Testing Score: {gs1.score(X_test, y_test)}')

CVEC/LR Testing Score: 0.9785223367697594


### 1) Multiclass Classification with 5 Disaster Types

In [63]:
# Creating the y vairable as a vector of different disaster types
y_multi = df['disaster type']

In [64]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_multi, random_state = 42, stratify = y_multi)

In [65]:
# Establish a pipeline to vectorize and classify the text data
pipe2 = Pipeline([
    
    ('cvec',CountVectorizer()),
    ('multi',OneVsRestClassifier(LogisticRegression(C = 0.01, penalty = 'l2', solver='liblinear', random_state = 42)))
    
])

# Parameter dictionary to use for gridsearching
pipe2_params = {
    
    'cvec__max_features':[4000],
    'cvec__lowercase':[True],
    'cvec__stop_words':['english'],
    'cvec__ngram_range':[(1, 1),(1, 2)]
    
}

In [66]:
# Gridsearch over the parameter dictionary associated with the above pipeline
gs2 = GridSearchCV(pipe2,
                  pipe2_params,
                  cv=3,
                  return_train_score=True) 

In [67]:
# Fit the model
gs2.fit(X_train, y_train);

In [68]:
# Find the best model parameters
gs2.best_params_

{'cvec__lowercase': True,
 'cvec__max_features': 4000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [69]:
# Find the training score
print(f'CVEC/OVR Training Score: {gs2.score(X_train, y_train)}')

CVEC/OVR Training Score: 0.8830493102296052


In [70]:
# Find the testing score
print(f'CVEC/OVR Testing Score: {gs2.score(X_test, y_test)}')

CVEC/OVR Testing Score: 0.877290950744559


In [71]:
# Generate predictions
preds = gs2.predict(X_test)

In [72]:
# Create testing DataFrame
df_test = df.loc[list(X_test.index),:]

In [73]:
# Creating a dataframe with predicted disaster type from our multiclass classifier
df_test['preds disaster type'] = preds

In [74]:
# Select tweet text, disaster type and predictions
df_test[['tweet text','disaster type','preds disaster type']]

Unnamed: 0,tweet text,disaster type,preds disaster type
17640,rt @examinedisaster: @nasa releases stunning b...,0.0,0.0
3005,dramatic details emerge of lax rampage targeti...,3.0,3.0
16996,rt @estebangerbasi: un terrorista italiano es ...,3.0,3.0
1687,"photo: kirsty nelis, reported locally as polic...",3.0,3.0
20253,deadly building collapse in bangladesh http:/...,3.0,3.0
...,...,...,...
10676,colorado digs out after 'biblical' flooding: c...,4.0,4.0
1811,nasty katie hopkins apologises after making cr...,3.0,3.0
2006,rt @cloydrivers: bad day? 6 troops in afghanis...,99.0,3.0
19090,rt @fasnouvelles: vous avez des livres à donne...,3.0,3.0


### 2) Multiclass Classification Performed on Wildfire Tweets

In [75]:
# Reading in the dataframe of tweets from california
df_fire = pd.read_csv('./data/firetweets_clean.csv')
#df_fire = df_fire[['message']].rename({'message':'tweet'}, axis = 1)

In [76]:
# Display data
df_fire.head()

Unnamed: 0,tweet text,disaster type
0,california #artist danielle nelisse paints #ab...,0
1,many low income people lost homes baja califor...,0
2,rt epochtimes get act together governor see cl...,0
3,rt epochtimes get act together governor see cl...,0
4,yikes https co gs vp hh #wildfires #africa,0


In [77]:
# Get shape of firetweets
df_fire.shape

(6260, 2)

##### 55.40% of the tweets were duplicates and we dropped them

In [78]:
# There are many repetitive tweets, so let us drop and keep one
df_fire.drop_duplicates(subset='tweet text', keep='first', inplace=True)
df_fire.shape

(2792, 2)

In [79]:
# Changing all tweets to lowercase
df_fire['tweet text'] = df_fire['tweet text'].apply(lambda x: x.lower())

In [80]:
# Fitting the multiclass classifier model to the wildfire tweets
df_fire['predicted_disaster_type'] = gs2.predict(df_fire['tweet text'])

In [81]:
# Percentage of tweets based on category
df_fire['predicted_disaster_type'].value_counts(normalize=True)

3.0     0.394699
0.0     0.263610
4.0     0.202364
99.0    0.136819
5.0     0.002149
2.0     0.000358
Name: predicted_disaster_type, dtype: float64

##### There are no type 1 and only one type 2 disaster, which according to our dictionary is cyclones/hurricanes and earthquakes respectively

In [82]:
# Exploring tweets from different categories
for tweet in df_fire[df_fire['predicted_disaster_type'] == 5]['tweet text']:
    print()
    print(tweet)


#russia love california dreaming president donald trump seems willing help russia fight wi https co fy izy xqq

#president fight #california #wildfires rush help #campaign bankroll #russia https co bjqljwmtaj

#resisters remember #trump offered hel #russia fight #wildfires denies help american citiz https co bwtbtpofqw

rt ajstream bad year #wildfires globally past months parts lebanon turkey russia france greece

bad year #wildfires globally past months parts lebanon turkey russia france https co xyvwcgfuqh

rt benjonespiced mesmerising image kentphotos via ap meteor streaks across sky gusty winds create ember cast val


### Unsupervised clustering to increase dimensions

In [83]:
# Perform a train test split
X_train, X_test, y_train, y_test = train_test_split(X, y_multi, random_state = 42, stratify = y_multi)

In [84]:
# Instantiating a KMeans clustering algorithm
km = KMeans(n_clusters=5)

In [85]:
# Vectorizing the training and testing tweets
cvec= CountVectorizer(lowercase=True,stop_words='english',max_features=4000)

# Creating vectorized model variables
X_train_vec = cvec.fit_transform(X_train)
X_test_vec = cvec.transform(X_test)

In [86]:
# Standardizing the vectorized training data
sc = StandardScaler()

# Transformed the testing data
X_train_vec = sc.fit_transform(X_train_vec.toarray())
X_test_vec = sc.transform(X_test_vec.toarray())

In [87]:
# Create DataFrames for the vectorized training and testing data
df_Xvec_train = pd.DataFrame(X_train_vec)
df_Xvec_test = pd.DataFrame(X_test_vec)

In [88]:
# Fitting the clustering model on the training set
km.fit(X_train_vec)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [89]:
# Adding the cluster labels to the dataframes
df_Xvec_train['cluster'] = km.predict(X_train_vec)
df_Xvec_test['cluster'] = km.predict(X_test_vec)

In [90]:
df_Xvec_train['cluster'].unique()

array([0, 2, 4, 1, 3], dtype=int64)

In [91]:
df_Xvec_test['cluster'].unique()

array([0, 1, 2, 4, 3], dtype=int64)

In [92]:
# # Using One Hot Encoder to dummify such that number of categories remain same
# # No need to do dummy if using OHE
# ohe = OneHotEncoder(categories='auto')

# ohe.fit(df_Xvec_train[['cluster']])

# df_Xvec_train = df_Xvec_train.merge(pd.DataFrame(ohe.transform(df_Xvec_train[['cluster']]).toarray()), left_index=True, right_index=True).drop(columns=['cluster'])
# df_Xvec_test = df_Xvec_test.merge(pd.DataFrame(ohe.transform(df_Xvec_test[['cluster']]).toarray()), left_index=True, right_index=True).drop(columns=['cluster'])

In [93]:
# Dummy the training and testing cluster columns
df_Xvec_train = pd.get_dummies(df_Xvec_train, columns=['cluster'], drop_first=True)
df_Xvec_test = pd.get_dummies(df_Xvec_test, columns=['cluster'], drop_first=True)

In [94]:
# Fititng a multi class logisitic regression model
multi_model = OneVsRestClassifier(LogisticRegression(C=0.001 ,penalty = 'l2', solver='liblinear', random_state = 42))
multi_model.fit(df_Xvec_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=0.001, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [95]:
# Training score
multi_model.score(df_Xvec_train, y_train)

0.9539834836985059

In [97]:
# Testing score
multi_model.score(df_Xvec_test, y_test)

0.9199599083619702