# Multiclass Classification Model

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from nltk.corpus import stopwords

## Read in data

In [45]:
df = pd.read_csv("../Data/all_tweets_clean.csv")

In [49]:
df.shape

(60348, 4)

In [48]:
df.isnull().sum()

text                   0
requesting_help    58863
disaster               0
languages              0
dtype: int64

In [46]:
# Remove NA's in disaster column
df.dropna(subset=["disaster"], inplace = True)

## Look at how many values with requesting help

In [77]:
df.groupby("disaster")["requesting_help"].value_counts(normalize = True)

disaster   requesting_help
fire       0.0                0.932271
           1.0                0.067729
floods     0.0                0.680585
           1.0                0.319415
hurricane  0.0                0.783730
           1.0                0.216270
Name: requesting_help, dtype: float64

## Split df into training and new data

In [56]:
df_train = df[df["requesting_help"].notnull()]
df_train.shape

(1485, 4)

In [57]:
df_new = df[df["requesting_help"].isnull()]
df_new.shape

(58863, 4)

## Set up X and Y variables

In [58]:
X = df_train["text"]
y = df_train["requesting_help"]

In [59]:
# check for class sizes
y.value_counts(normalize = True)

0.0    0.800673
1.0    0.199327
Name: requesting_help, dtype: float64

In [60]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

In [61]:
# for future if we want to add more stop words
# ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)))


# set up pipeline
pipe = Pipeline([
            ('tvec', TfidfVectorizer(ngram_range = (1,2), stop_words = "english" )),
            ("svc", SVC(gamma = "scale"))

])

# param options
params = {}

# run gridsearch
gs = GridSearchCV(pipe, params, cv=5, n_jobs= 3)

In [62]:
X_train.shape

(1113,)

In [63]:
y_train.shape

(1113,)

In [64]:
# run model
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                          

In [65]:
gs.score(X_train, y_train)

0.9928122192273136

In [66]:
gs.score(X_test, y_test)

0.8064516129032258

## Make Predictions

In [69]:
X_new = df_new["text"]

In [68]:
df_new.head()

Unnamed: 0,text,requesting_help,disaster,languages
485,he can t nuke the hurricane from poland,,hurricane,en
505,y all when i just left to go grab dinner like ...,,hurricane,en
506,is about to fuck shit up like,,hurricane,en
507,me after shopping at publix today for food i n...,,hurricane,en
508,cat thats the projection of the category at la...,,hurricane,en


In [79]:
df_new.loc[:, "requesting_help"] = gs.predict(X_new).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [80]:
df_new.groupby("disaster")["requesting_help"].value_counts(normalize = True)

disaster   requesting_help
fire       0                  0.999896
           1                  0.000104
floods     0                  0.992105
           1                  0.007895
hurricane  0                  0.999797
           1                  0.000203
Name: requesting_help, dtype: float64

In [81]:
df_new[df_new["requesting_help"] == 1]

Unnamed: 0,text,requesting_help,disaster,languages
1363,due to there were blood drive cancellations do...,1,hurricane,en
2091,is hosting a second webinar on sept at pm et r...,1,hurricane,en
3573,now if you need help on,1,hurricane,en
3881,after the storm subsided and the waters began ...,1,hurricane,en
13647,what about vieques dios mio they need help,1,hurricane,en
17417,who is in and also need help,1,hurricane,en
30659,flood victims in nebraska need your help,1,floods,en
30661,flood victims in nebraska need your help,1,floods,en
30693,i came here to bring the floods of nebraska to...,1,floods,en
31798,want to help members of our industry impacted ...,1,fire,en
