In [1]:
# Import libraries
import pandas as pd
import numpy as np
import nltk

In [2]:
# Reading in the compiled csv of all different tweets
df = pd.read_csv('./datasets/compiled/all_disaster_tweets.csv')

In [3]:
# Dropping NaNs
df.dropna(inplace=True)

In [4]:
# Turn all the tweets into lowercase before vectorization
df['tweet text'] = df['tweet text'].apply(lambda x: x.lower())

In [5]:
# Creating a column identifying floods and no floods to create a one-class classifier
df['flood'] = df['disaster type'].apply(lambda x: 1 if x==4 else 0)

In [6]:
# Creating the X and y lists before splitting
X = df['tweet text']
y = df['flood']

In [7]:
# Machine learning relevant libraries
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedShuffleSplit,GroupShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import KernelPCA

In [8]:
# Train test split for single classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [9]:
# Pipeline for single classifier
pipe1 = Pipeline([
    ('cvec',CountVectorizer()),
    ('lr',LogisticRegression(penalty = 'l2', solver='liblinear', random_state = 42))
])

pipe1_params = {
    'cvec__max_features':[2000,4000],
    'cvec__lowercase':[True],
    'cvec__stop_words':['english'],
    'cvec__ngram_range':[(1, 1), (1, 2)],
}

In [10]:
# Grid searching for single classifier
gs1 = GridSearchCV(pipe1,
                  pipe1_params,
                  cv=3,
                  return_train_score=True) 

In [11]:
gs1.fit(X_train, y_train);

In [12]:
gs1.best_params_

{'cvec__lowercase': True,
 'cvec__max_features': 4000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [13]:
gs1.score(X_train, y_train)

0.9896415103346222

In [14]:
gs1.score(X_test, y_test)

0.9782359679266895

### Creating a multiclass classifier to categorize the tweets based on 5 different disaster types

In [15]:
# Creating the y vairable as a vector of different disaster types
y_multi = df['disaster type']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y_multi, random_state = 42, stratify = y_multi)

In [17]:
pipe2 = Pipeline([
    ('tvec',CountVectorizer()),
    ('multi',OneVsRestClassifier(LogisticRegression(C = 0.01, penalty = 'l2', solver='liblinear', random_state = 42)))
])

pipe2_params = {
    'tvec__max_features':[4000],
    'tvec__lowercase':[True],
    'tvec__stop_words':['english'],
    'tvec__ngram_range':[(1, 2)]
}

In [18]:
gs2 = GridSearchCV(pipe2,
                  pipe2_params,
                  cv=3,
                  return_train_score=True) 

In [19]:
gs2.fit(X_train, y_train);

In [20]:
gs2.best_params_

{'tvec__lowercase': True,
 'tvec__max_features': 4000,
 'tvec__ngram_range': (1, 2),
 'tvec__stop_words': 'english'}

In [21]:
gs2.score(X_train, y_train)

0.8830493102296052

In [22]:
gs2.score(X_test, y_test)

0.877290950744559

In [23]:
preds = gs2.predict(X_test)

In [24]:
df_test = df.loc[list(X_test.index),:]

In [25]:
# Creating a dataframe with predicted disaster type from our multiclass classifier
df_test['preds disaster type'] = preds

In [26]:
df_test[['tweet text','disaster type','preds disaster type']]

Unnamed: 0,tweet text,disaster type,preds disaster type
17640,rt @examinedisaster: @nasa releases stunning b...,0.0,0.0
3005,dramatic details emerge of lax rampage targeti...,3.0,3.0
16996,rt @estebangerbasi: un terrorista italiano es ...,3.0,3.0
1687,"photo: kirsty nelis, reported locally as polic...",3.0,3.0
20253,deadly building collapse in bangladesh http:/...,3.0,3.0
...,...,...,...
10676,colorado digs out after 'biblical' flooding: c...,4.0,4.0
1811,nasty katie hopkins apologises after making cr...,3.0,3.0
2006,rt @cloydrivers: bad day? 6 troops in afghanis...,99.0,3.0
19090,rt @fasnouvelles: vous avez des livres à donne...,3.0,3.0


### Using our multiclass model to classify latest california tweets based on relevance and disaster type

In [27]:
# Reading in the dataframe of tweets from california
df_fire = pd.read_csv('./datasets/compiled/firetweets_clean.csv')
df_fire = df_fire[['message']].rename({'message':'tweet'}, axis = 1)

In [28]:
df_fire.shape

(6260, 1)

In [29]:
# There are many repetitive tweets, so let us drop and keep one
df_fire.drop_duplicates(subset='tweet',keep='first', inplace=True)
df_fire.shape

(2792, 1)

##### 55.40% of the tweets were duplicates and we dropped them

In [30]:
# Changing all tweets to lowercase
df_fire['tweet'] = df_fire['tweet'].apply(lambda x: x.lower())

In [31]:
# Fitting the multiclass classifier model to the CA tweets
df_fire['predicted_disaster_type'] = gs2.predict(df_fire['tweet'])

In [32]:
# Percentage of tweets based on category
df_fire['predicted_disaster_type'].value_counts(normalize=True)

3.0     0.394699
0.0     0.263610
4.0     0.202364
99.0    0.136819
5.0     0.002149
2.0     0.000358
Name: predicted_disaster_type, dtype: float64

##### There are no type 1 and type 2 disaster, which according to our dictionary is cyclones/hurricanes and earthquakes respectively

In [33]:
# Printing out random tweets from differnet categories
for tweet in df_fire[df_fire['predicted_disaster_type'] == 99]['tweet']:
    print(tweet)

california #artist danielle nelisse paints #abstract landscape #paintings #daniellenelisse #wildfires thank yo https co szgft yw
yikes https co gs vp hh #wildfires #africa
business usual wildfires control california https co ydcybs xlz visitca #california https co lrf oye hr
ready fall protecting home #wildfires season tell us tips https co g wvaewfsa
gotts love john cena #wildfires #johncena #life https co vj fdmpyi
getanalysis potus #trump accuses #california gross #mismanagementofforests #forestlands https co u j ype l
trump threatens pull federal aid california #wildfires https co yyhckrghup https co ypykexcv
#california gov #gavinnewsom national ambitions https co vpn qfr #californiawildfires #wildfires https co k kn qj dw
ugh actually expect us believe #isis starting #california #wildfires actually gross https co ghayixlvng
far wildfires california including #kinkaidfire #gettyfire https co whr ucykf
experts participated firex aq summer large interagency study better understand c

### Unsupervised clustering to increase dimensions

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y_multi, random_state = 42, stratify = y_multi)

In [35]:
# Instantiating a Kmeans cluster 
km = KMeans(n_clusters=5)

In [36]:
# Vectorizing the training and testing tweets
cvec= CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,2), max_features=4000)
X_train_vec = cvec.fit_transform(X_train)
X_test_vec = cvec.transform(X_test)

In [37]:
# Standardizing the vectorized training data and trasnforming the testing data 
sc = StandardScaler()
X_train_vec = sc.fit_transform(X_train_vec.toarray())
X_test_vec = sc.transform(X_test_vec.toarray())

In [38]:
# Turning things into dataframes
df_Xvec_train = pd.DataFrame(X_train_vec)
df_Xvec_test = pd.DataFrame(X_test_vec)

In [39]:
# Fitting the clustering model on the training set
km.fit(X_train_vec)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [40]:
# Adding the cluster labels to the dataframes
df_Xvec_train['cluster'] = km.predict(X_train_vec)
df_Xvec_test['cluster'] = km.predict(X_test_vec)

In [41]:
df_Xvec_train['cluster'].unique()

array([1, 3, 4, 0, 2])

In [42]:
df_Xvec_test['cluster'].unique()

array([1, 0, 4, 2, 3])

In [43]:
# # Using One Hot Encoder to dummify such that number of categories remain same
# # No need to do dummy if using OHE
# ohe = OneHotEncoder(categories='auto')

# ohe.fit(df_Xvec_train[['cluster']])

# df_Xvec_train = df_Xvec_train.merge(pd.DataFrame(ohe.transform(df_Xvec_train[['cluster']]).toarray()), left_index=True, right_index=True).drop(columns=['cluster'])
# df_Xvec_test = df_Xvec_test.merge(pd.DataFrame(ohe.transform(df_Xvec_test[['cluster']]).toarray()), left_index=True, right_index=True).drop(columns=['cluster'])

In [44]:
# Dummy the training and testing cluster columns
df_Xvec_train = pd.get_dummies(df_Xvec_train,columns=['cluster'], drop_first=True)
df_Xvec_test = pd.get_dummies(df_Xvec_test,columns=['cluster'], drop_first=True)

In [45]:
# Fititng a multi class logisitic regression model
multi_model = OneVsRestClassifier(LogisticRegression(C=0.001 ,penalty = 'l2', solver='liblinear', random_state = 42))
multi_model.fit(df_Xvec_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=0.001, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [46]:
# Training score
multi_model.score(df_Xvec_train, y_train)

0.9501646856651869

In [47]:
# Testing score
multi_model.score(df_Xvec_test, y_test)

0.922823596792669