# Multiclass Classification Model

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier


## Read in data

In [26]:
df = pd.read_csv("../Data/all_tweets_clean.csv")

In [27]:
df.shape

(60352, 4)

In [28]:
df.isnull().sum()

text                   0
requesting_help    58867
disaster               4
languages              0
dtype: int64

In [29]:
# Remove NA's in disaster column
df.dropna(subset=["disaster"], inplace = True)

In [30]:
df.reset_index(drop=True,inplace=True)

## Look at how many values with requesting help

In [31]:
df.groupby("disaster")["requesting_help"].value_counts(normalize = True)

disaster   requesting_help
fire       0.0                0.932271
           1.0                0.067729
floods     0.0                0.680585
           1.0                0.319415
hurricane  0.0                0.783730
           1.0                0.216270
Name: requesting_help, dtype: float64

dummying disaster column

In [32]:
df = pd.get_dummies(df, columns=['disaster'],drop_first=True)

## Split df into training and new data

In [33]:
df_train = df[df["requesting_help"].notnull()]
df_train.shape

(1485, 5)

In [34]:
df_test = df[df["requesting_help"].isnull()]
df_test.shape

(58863, 5)

In [35]:
target = df_train['requesting_help']

In [36]:
floods = df_train['disaster_floods']

In [37]:
hurricane = df_train['disaster_hurricane']

In [55]:
cvec = CountVectorizer(stop_words='english', min_df=5, max_df=1.0, ngram_range=(1, 2))

In [38]:
tvec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

In [None]:
tvec = TfidfVectorizer(stop_words='english', min_df=5, max_df=1.0, ngram_range=(1, 2))

In [56]:
term_mat = cvec.fit_transform(df_train['text'])

In [39]:
term_mat = tvec.fit_transform(df_train['text'])

In [57]:
term_df = pd.DataFrame(term_mat.toarray(), columns=cvec.get_feature_names())

In [40]:
term_df = pd.DataFrame(term_mat.toarray(), columns=tvec.get_feature_names())

In [58]:
term_df.insert(0, 'requesting_help', target.values)

In [59]:
term_df.insert(1, 'disaster_floods', floods.values)

In [60]:
term_df.insert(2, 'disaster_hurricane', hurricane.values)

## Set up X and Y variables

In [61]:
X = term_df.drop('requesting_help',axis=1)
y = term_df["requesting_help"]

In [62]:
# check for class sizes
y.value_counts(normalize = True)

0.0    0.800673
1.0    0.199327
Name: requesting_help, dtype: float64

In [63]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

In [71]:
np.sqrt(len(X))

38.535697735995385

In [75]:
list(np.random.choice(X.columns, size=39, replace=False))

['provide',
 'florida',
 'states need',
 'understand',
 'team',
 'fighting',
 'owners',
 'major',
 'hit devastating',
 'black',
 'burn scar',
 'prepare',
 've',
 'cleanup',
 'talked',
 'national',
 'gas',
 'word',
 'fight',
 'tornadoes',
 'record',
 'st',
 'works',
 'donating',
 'meet',
 'friends',
 'burned',
 'support',
 'america',
 'utility',
 'yesterday',
 'know',
 'suffered',
 'evacuations',
 'heartland',
 'map',
 'nation',
 'country',
 'overseeing']

In [76]:
rf = RandomForestClassifier(n_estimators=100, max_depth=None, max_features='auto')

In [77]:
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.8167036030264729

In [78]:
cross_val_score(rf, X_test, y_test, cv=5).mean()

0.8091393311119338

In [79]:
preds = rf.predict(X_test)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [70]:
# Generate confusion matrix.
# Documentation here: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# tn, fp  positive = asking for help
# fn, tp  negative = not asking for help
confusion_matrix(y_test, # True values.
                 preds)  # Predicted values.

array([[298,   0],
       [ 74,   0]])

In [53]:
prac = pd.DataFrame({'practice':preds})

In [54]:
prac['practice'].value_counts()

0.0    372
Name: practice, dtype: int64

## Make Predictions

In [None]:
X_new = df_new["text"]

In [None]:
df_new.head()

In [None]:
df_new.loc[:, "requesting_help"] = gs.predict(X_new).astype(int)

In [None]:
df_new.groupby("disaster")["requesting_help"].value_counts(normalize = True)

In [None]:
df_new[df_new["requesting_help"] == 1]