# Multiclass Classification Model

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

## Read in data

In [2]:
# Load the cleaned tweet dataset (relative path, resolved against the
# kernel's working directory).
df = pd.read_csv("../Data/all_tweets_clean.csv")

In [3]:
# Discard every row that has a missing value in any column.
df = df.dropna()

## Set up X and Y variables

In [4]:
# Model input is the tweet text column; the target is the disaster label.
X, y = df["text"], df["disaster"]

In [5]:
# Check relative class sizes. Per the output below, "floods" is only ~1.4% of
# the rows, so the classes are heavily imbalanced and overall accuracy will be
# dominated by the two large classes.
y.value_counts(normalize = True)

hurricane    0.500445
fire         0.485239
floods       0.014316
Name: disaster, dtype: float64

In [6]:
# Hold out a test set. Stratifying on the label keeps the class proportions
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [9]:
# Model: TF-IDF features over unigrams and bigrams (built-in English stop
# words removed) feeding a support-vector classifier.
# To extend the stop-word list later, swap the stop_words argument for
#   text.ENGLISH_STOP_WORDS.union(additional_stop_words)
# (this would need `from sklearn.feature_extraction import text`).
pipe = Pipeline(steps=[
    ("tvec", TfidfVectorizer(ngram_range=(1, 2), stop_words="english")),
    ("svc", SVC(gamma="scale")),
])

# Empty grid: no hyperparameters are tuned, so the search evaluates only the
# pipeline exactly as configured above.
params = {}

# Wrap the pipeline in a grid search (5 folds, 3 parallel workers).
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=5, n_jobs=3)

In [10]:
# Sanity check: number of training documents.
X_train.shape

(44687,)

In [11]:
# Sanity check: the label vector should have the same length as X_train.
y_train.shape

(44687,)

In [12]:
# Fit the grid search on the training split: the pipeline is cross-validated
# (cv=5, empty parameter grid) and the fitted search object is echoed below.
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                          

In [13]:
# Score on the training split (no `scoring` was passed to GridSearchCV, so
# this is the pipeline's default score — mean accuracy for a classifier).
gs.score(X_train, y_train)

0.9950544901201692

In [16]:
# Same score on the held-out test split. The gap to the training score
# (~0.995 vs ~0.892) suggests the model is overfitting the training tweets.
gs.score(X_test, y_test)

0.8923872180451128

## Pickle model for future use

In [17]:
# Persist the fitted GridSearchCV object (TF-IDF vectorizer + SVC) for reuse.
# The context manager guarantees the file handle is flushed and closed, even
# if pickling raises part-way through.
# NOTE: only unpickle this file from a trusted source; pickle.load can
# execute arbitrary code.
with open("../assets/tfidf_svc.sav", "wb") as model_file:
    pickle.dump(gs, model_file)