## SGD Categorization Implementation
### Load Data

In [27]:
import numpy as np
import pandas as pd

train_data_raw = pd.read_csv("./data/train.csv", encoding="ISO-8859-1") #utf-8 doesn't play nice
test_data_raw = pd.read_csv("./data/test.csv", encoding="ISO-8859-1")
disaster_type_none_label = 'none'
train_data = train_data_raw.replace(np.nan, disaster_type_none_label, regex=True)
train_data.drop_duplicates(subset=["text", "disaster", "disaster_type"], keep="first")
train_data.head()



Unnamed: 0,tweetid,text,disaster_type,disaster,Unnamed: 4
0,10001,@TheEllenShow Please check into Salt River hor...,none,0,none
1,10002,"As for the hurricane, it's already category 1 ...",hurricane,1,none
2,10003,So it looks like my @SoundCloud profile shall ...,none,0,none
3,10004,@SushmaSwaraj Am sure background check of the ...,none,0,none
4,10005,Open forex detonation indicator is irretrievab...,none,0,none


### Train Test split

In [28]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train_data, test_size=0.25, random_state=0)
train.head()

Unnamed: 0,tweetid,text,disaster_type,disaster,Unnamed: 4
1308,11309,ladies and gentlemen meet your new prime minis...,none,0,none
12427,22428,What if every 5000 wins in ranked play gave yo...,none,0,none
11035,21036,Fair enough we have two of the best attacking ...,none,0,none
1580,11581,Should I go outside after an #earthquake?,earthquake,1,none
4121,14122,I liked a @YouTube video https://t.co/vJSOOgoN...,none,0,none


### Disaster Type SGDC Classifier with Grid Search

In [33]:
import contractions
import re
import string
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

is_printable = set(string.printable)
def preprocessor(string):
    urlless = re.sub(r"http\S+", " ", string)
    u_to_you = re.sub(r" u ", " you ", urlless)
    numericOrdinalless = re.sub(r"\d+(st|nd|rd|th)", " ", u_to_you)
    numberless = re.sub(r"\d+", " ", numericOrdinalless)
    contractionless = contractions.fix(numberless)
    only_alpha_whitespace = re.sub(r"[^a-zA-Z\s]", " ", contractionless)
    return only_alpha_whitespace

tweet_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tokenizer(text):
    words = tweet_tokenizer.tokenize(text)
    stopwordless = [w for w in words if w not in stop_words]
    lemmatized = [lemmatizer.lemmatize(w) for w in stopwordless]
    return lemmatized

parameters = {
    #'clf__alpha': (1e-2, 1e-3),
    #'clf__fit_intercept': (True, False),
    #'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 
    #         'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'),
    #'clf__penalty': ('l2', 'l1', 'elasticnet'),
    #'tfidf__norm': ('l1', 'l2' or None),
    #'tfidf__smooth_idf': (True, False),
    #'tfidf__sublinear_tf': (True, False),
    #'tfidf__use_idf': (True, False),
    #'vect__lowercase': (False, True),
    #'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    #'vect__stop_words': ('english', stopwords.words('english'), None),
    #'vect__strip_accents': ('ascii', 'unicode', None),
    #'vect__tokenizer': (tweet_tokenizer.tokenize, None),
    #'vect__token_pattern': (u"(?ui)\\b\\w*[a-z]+\\w*\\b", r"(?u)\b\w\w+\b", None)
}
disaster_type_sgdc_classifier = Pipeline([
    ('vect', CountVectorizer(analyzer='word', tokenizer=tweet_tokenizer.tokenize)),
    ('tfidf', TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=False)),
    ('clf', SGDClassifier(alpha=.001, loss='modified_huber', penalty='l2', max_iter=20, random_state=42)),
    ])

grid_search_disaster_type_sgdc_classifier = GridSearchCV(disaster_type_sgdc_classifier, parameters, cv=5, n_jobs=4)\
    .fit(train["text"], train["disaster_type"])

print("Best parameters")
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, grid_search_disaster_type_sgdc_classifier.best_params_[param_name]))

grid_search_sgdc_disaster_type_predictions = grid_search_disaster_type_sgdc_classifier.predict(valid["text"])
print(metrics.classification_report(valid["disaster_type"], grid_search_sgdc_disaster_type_predictions))

Best parameters
              precision    recall  f1-score   support

  earthquake       0.99      0.97      0.98       295
        fire       0.89      0.23      0.36        35
       flood       0.92      0.85      0.88       137
   hurricane       1.00      0.76      0.86       255
        none       0.96      1.00      0.98      2783

    accuracy                           0.97      3505
   macro avg       0.95      0.76      0.81      3505
weighted avg       0.97      0.97      0.96      3505

