# ML Pipeline Preparation
### 1. Import libraries and load data from database. Template as provided by Udacity.
- Import Python libraries

In [183]:
# import libraries
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('brown')
import pandas as pd
import numpy as np
import pickle as pkl
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV

import gensim
from gensim.models import Word2Vec
from nltk.corpus import brown
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

from pandarallel import pandarallel
pandarallel.initialize(progress_bar = False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vinaymaddali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vinaymaddali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /Users/vinaymaddali/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [160]:
# Read the cleaned messages table from the local SQLite database
engine = create_engine('sqlite:///response.db')
df = pd.read_sql_table('messages', engine)

# The raw message text is the model input
X = df['message']

# Every column that is not one of these four bookkeeping/text columns is used as a label
category_cols = [
    col for col in df.columns
    if col not in ['id', 'message', 'original', 'genre']
]

# Label matrix: one row per message, one column per category
Y = df[category_cols].values
print(Y.shape)

(26180, 36)


### 2. Tokenization function to process text data

In [161]:
def tokenize(text):
    """
    Function to tokenize input text data. Called in pipeline sentence by sentence.

    Each token is lower-cased and stripped of surrounding whitespace BEFORE it is
    lemmatized. The WordNet lemmatizer looks words up case-sensitively, so a
    capitalized token such as "Dogs" would otherwise pass through unchanged and
    only be lower-cased afterwards ("dogs"), ending up as a different feature
    than the lemma "dog".

    Input: text: sentence as string.
    return: clean_tokens: list of clean tokens in sentence.
    """
    lemmatizer = WordNetLemmatizer()

    # Normalize case first so the lemmatizer sees the form WordNet indexes.
    clean_tokens = [
        lemmatizer.lemmatize(tok.lower().strip())
        for tok in word_tokenize(text)
    ]

    return clean_tokens

### 3. ML pipeline
The pipeline chains a CountVectorizer (using the custom `tokenize` function) and a TfidfTransformer, followed by a RandomForestClassifier wrapped in a MultiOutputClassifier.

In [162]:
# Bag-of-words counts -> TF-IDF weights -> one random forest per output label
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=RandomForestClassifier())),
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [163]:
# Hold out 33% of the messages for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.33,
)

# Fit the whole pipeline on the training split (trailing ';' hides the estimator repr)
pipeline.fit(X_train, y_train);

In [164]:
# Display the dimensions of the test label matrix produced by train_test_split
y_test.shape

(8640, 36)

### 5. Test your model
Prints the macro-averaged F1 score, precision, and recall for each output category of the dataset, followed by the average of each metric across all categories.

In [None]:
def get_results(category_cols, y_test, preds):
    """
    Print and collect per-category classification metrics.

    For every category a scikit-learn classification report is computed from the
    matching column of y_test and preds. The report's 'macro avg' entry (the
    unweighted mean over that category's classes) supplies the f1-score,
    precision and recall that are printed. After the per-category lines, the
    mean of each metric over all categories is printed.

    Input: category_cols: category columns in data as list
           y_test: Labels as numpy array, one column per category
           preds: Output predictions from classifier, one column per category
    Output: dict: maps each category name to its classification report dictionary.
    """
    results = {}
    f1_values, precision_values, recall_values = [], [], []

    print("Category : f1-score, precision, recall\n")
    for position, category in enumerate(category_cols):
        report = classification_report(
            y_test[:, position], preds[:, position], output_dict=True
        )
        results[category] = report

        macro = report['macro avg']
        f1_values.append(macro['f1-score'])
        precision_values.append(macro['precision'])
        recall_values.append(macro['recall'])
        print("{} : {}, {}, {}".format(
            category, macro['f1-score'], macro['precision'], macro['recall']
        ))

    # Summary block: plain mean of each metric across the categories
    print("\n\n\n")
    print("Avg. across categories:")
    print("f1-score: {}".format(np.mean(f1_values)))
    print("precision: {}".format(np.mean(precision_values)))
    print("recall: {}".format(np.mean(recall_values)))
    return results

In [170]:
# Predict every label column for the test messages with the fitted pipeline
preds = pipeline.predict(X_test)

# Per-category report plus cross-category averages
results = get_results(category_cols=category_cols, y_test=y_test, preds=preds)

Category : f1-score, precision, recall

related : 0.471014583506018, 0.7993475368313055, 0.43857022141339486
request : 0.7530020515096362, 0.898842806108806, 0.7019529367646014
offer : 0.4988108358953536, 0.4976273148148148, 0.5
aid_related : 0.7545212596325008, 0.7785532216062954, 0.7478049955725492
medical_help : 0.5208541985680587, 0.8188781444189679, 0.5210477984184175
medical_products : 0.5338464016085032, 0.8476721235341924, 0.5239055751734342
search_and_rescue : 0.5088013674353031, 0.8193768820940468, 0.5080441467831921
security : 0.49503214494447695, 0.49016203703703703, 0.5
military : 0.5086469769822447, 0.7962233549582947, 0.5085617040369255
child_alone : 1.0, 1.0, 1.0
water : 0.6339220130440816, 0.9224057014493483, 0.5877185261303381
food : 0.7544277291656918, 0.8970943543287032, 0.6968498759341242
shelter : 0.6446019706082937, 0.9088085377508761, 0.5991878109676504
clothing : 0.5330899508660207, 0.6854686984511953, 0.5198555543146133
money : 0.5239643330945809, 0.8640523632

  _warn_prf(average, modifier, msg_start, len(result))


death : 0.5790585562072036, 0.9129144851657941, 0.5496363636363637
other_aid : 0.4807419825672813, 0.7763900174114916, 0.50710559685044
infrastructure_related : 0.48391021976666004, 0.7157328085204908, 0.5007810353632779
transport : 0.5751045765369109, 0.8834825971659267, 0.5475554804983458
buildings : 0.5405719938631587, 0.9242160278745645, 0.5284787993759225
electricity : 0.5239419925253087, 0.9072272411396803, 0.5148219344394982
tools : 0.4986363372599083, 0.4972800925925926, 0.5
hospitals : 0.4971774428213932, 0.49438657407407405, 0.5
shops : 0.4983743613562471, 0.49675925925925923, 0.5
aid_centers : 0.4966501602097291, 0.4933449074074074, 0.5
other_infrastructure : 0.488484992007578, 0.47748842592592594, 0.5
weather_related : 0.8195838509185903, 0.8727951671177906, 0.7914176504414301
floods : 0.7535971697109529, 0.9399954933001691, 0.6863053319919517
storm : 0.7383125005260178, 0.8808574879227054, 0.681616289953832
fire : 0.5075449182271377, 0.74455892567724, 0.5052046441440055
ea

### 6. Improve the model with a grid search over hyperparameters

In [171]:
# Grid over the wrapped estimator's settings; the 'clf__estimator__' prefix routes each
# value through the pipeline step 'clf' to the estimator inside MultiOutputClassifier
parameters = dict(
    clf__estimator__n_estimators=[100, 250],
    clf__estimator__max_depth=[None, 2, 4],
)

# n_jobs=-1 is passed so the search may use all available cores
rf_cv = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1)
rf_cv.fit(X_train, y_train);

### 7. Test model

In [172]:
# Evaluate the grid-searched random forest on the test split
preds = rf_cv.predict(X_test)
results_rf_cv = get_results(category_cols=category_cols, y_test=y_test, preds=preds)

Category : f1-score, precision, recall

related : 0.4639718473548342, 0.7966512506678711, 0.4346759718234905
request : 0.7539916695591808, 0.896097169519821, 0.7033097767403969
offer : 0.4988108358953536, 0.4976273148148148, 0.5
aid_related : 0.7620068372909544, 0.7836365236202745, 0.7553016917959576
medical_help : 0.520591681836294, 0.7950552646887725, 0.5208591666276731
medical_products : 0.546593457294148, 0.8490578567278161, 0.5310833237978911
search_and_rescue : 0.5009015288110426, 0.8192659488248234, 0.5040220733915961
security : 0.4950026301946344, 0.4901608982521125, 0.4999409681227863
military : 0.5284805653710247, 0.8769169620085455, 0.519051214526436
child_alone : 1.0, 1.0, 1.0
water : 0.6239478264516535, 0.9096874870819727, 0.5811493514070077
food : 0.7513985298850314, 0.8950175187012945, 0.6941225424854579
shelter : 0.6779423298089223, 0.8954502714607555, 0.6252656155985071
clothing : 0.5401498174863141, 0.7075038918883111, 0.5239205949650197
money : 0.5142502837996911, 0.

  _warn_prf(average, modifier, msg_start, len(result))


transport : 0.5582138444478777, 0.8735208458253283, 0.5374895410336913
buildings : 0.5267382145822442, 0.8738247243180499, 0.5209253874949027
electricity : 0.5182988000382496, 0.8905037637521714, 0.5118457439633077
tools : 0.4986363372599083, 0.4972800925925926, 0.5
hospitals : 0.4971774428213932, 0.49438657407407405, 0.5
shops : 0.4983743613562471, 0.49675925925925923, 0.5
aid_centers : 0.4966501602097291, 0.4933449074074074, 0.5
other_infrastructure : 0.488484992007578, 0.47748842592592594, 0.5
weather_related : 0.8226000849471748, 0.8733420554078859, 0.7950627473531103
floods : 0.7437124253708057, 0.9360148041467413, 0.677521524495812
storm : 0.7714206492944775, 0.8912245613942994, 0.7146109718149678
fire : 0.4972067039106145, 0.49450167843500403, 0.4999414862492686
earthquake : 0.8953223583583294, 0.9302476905029571, 0.8667111692907158
cold : 0.5470505617977528, 0.9900347624565469, 0.5274725274725275
other_weather : 0.5115599580135088, 0.8982710809533575, 0.5124065374548745
direct_

### Not a big improvement. Try other classifiers.

## Different classifiers

### 8.1 Adaboost - shows some improvements

In [173]:
# Adaboost
# Count + TF-IDF features feeding one AdaBoostClassifier per output label
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier())),
])

# Search over the number of boosting rounds and the learning rate
parameters = dict(
    clf__estimator__n_estimators=[50, 100, 200],
    clf__estimator__learning_rate=[0.1, 0.5, 1.0],
)

adb_cv = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1)
adb_cv.fit(X_train, y_train);

# Score the fitted search on the test split
preds = adb_cv.predict(X_test)
results_adb_cv = get_results(category_cols=category_cols, y_test=y_test, preds=preds)

Category : f1-score, precision, recall

related : 0.5265875309990461, 0.6586718431888509, 0.48417719526693653
request : 0.7907862460445884, 0.8557361402699644, 0.7534624616481147
offer : 0.5194986072423398, 0.5691118796644107, 0.5118462441747339
aid_related : 0.7513618879001609, 0.7713954171748223, 0.7452081333591691
medical_help : 0.6251327524069452, 0.7629357306454853, 0.590208401572224
medical_products : 0.7016789267406992, 0.8209652002295817, 0.6507948123646743
search_and_rescue : 0.602004716981132, 0.7210741389375365, 0.5690462009699652
security : 0.5115789473684211, 0.640324449594438, 0.5084103062712688
military : 0.6719393743144606, 0.7738609052471608, 0.6277671933046604
child_alone : 1.0, 1.0, 1.0
water : 0.8263652915496049, 0.8480452200478428, 0.8076250190200722
food : 0.8634927122821303, 0.8864838154559547, 0.8438416984973114
shelter : 0.7930081402921406, 0.8579437595956534, 0.7513163339350288
clothing : 0.7249067633455244, 0.8096766127687948, 0.6773354295712736
money : 0.682

### 8.2 XGBoost - does even better

In [174]:
# XGBoost
# Count + TF-IDF features feeding one XGBClassifier per output label
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=XGBClassifier())),
])

# Search over the number of trees and the maximum tree depth
parameters = dict(
    clf__estimator__n_estimators=[100, 250],
    clf__estimator__max_depth=[None, 2, 4],
)

xgb_cv = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1)
xgb_cv.fit(X_train, y_train);

# Score the fitted search on the test split
preds = xgb_cv.predict(X_test)
results_xgb_cv = get_results(category_cols=category_cols, y_test=y_test, preds=preds)

Category : f1-score, precision, recall

related : 0.6274757975918573, 0.7848652172641625, 0.5667525792298598
request : 0.8016831873472638, 0.8520292908553305, 0.7693782801392477
offer : 0.4987817612252001, 0.49762704016668596, 0.4999418537039191
aid_related : 0.7561275413624406, 0.76808130666566, 0.7511659728694315
medical_help : 0.6715746538621982, 0.7742938626755876, 0.6310742361143605
medical_products : 0.7035157911769848, 0.8001779359430605, 0.6571219108493817
search_and_rescue : 0.6147405660377359, 0.747974314068885, 0.5774477033219482
security : 0.5167400665633026, 0.6442251963870139, 0.5112334189874297
military : 0.7000115782383898, 0.7995964844852688, 0.6523624209164071
child_alone : 1.0, 1.0, 1.0
water : 0.8422657262653125, 0.8685843683869512, 0.8199715193608983
food : 0.8769596278083266, 0.8919572964140037, 0.8634136556681365
shelter : 0.8092521430682825, 0.848771375030255, 0.7794056207558975
clothing : 0.7445476897531692, 0.8223467600700525, 0.697660632823306
money : 0.67916

  _warn_prf(average, modifier, msg_start, len(result))


### Try another feature extractor

### 8.3 Doc2Vec (word2vec-style) document features - potential for improvement, but might need more external data to train the embedding model.

In [175]:
import string
def test_tokenize(text):
    """
    Tokenizer used for the doc2vec features; yields stemmed tokens lazily.

    The text is lower-cased, split with nltk.word_tokenize, and every token
    that is not found in string.punctuation is passed through the English
    Snowball stemmer. This is a generator, so wrap the call in list() to get
    all tokens at once.
    Reference: https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html
    """
    stemmer = nltk.stem.SnowballStemmer('english')

    for word in nltk.word_tokenize(text.lower()):
        # Keep only tokens that are not found in the punctuation string
        if word not in string.punctuation:
            yield stemmer.stem(word)

# One TaggedDocument per training message: its token list plus a tag 'd0', 'd1', ...
corpus = [
    TaggedDocument(list(test_tokenize(doc)), ['d{}'.format(idx)])
    for idx, doc in enumerate(X_train)
]

# Train 100-dimensional document vectors; min_count=0 is passed so no word is dropped for rarity
model = Doc2Vec(corpus, size=100, min_count=0)



In [177]:
def conv(x, mdl):
    """
    Function to infer the vector for one tokenized input.

    Input: x: tokens of a single document, passed unchanged to the model
           mdl: trained model object exposing an infer_vector method
    return: the value returned by mdl.infer_vector(x)
    """
    # The leftover debugging print of every input was removed: it wrote one
    # line per document to stdout whenever the function was applied to a dataset.
    return mdl.infer_vector(x)

class gensim_word2vec():
    """
    Document-embedding step to be used inside a scikit-learn pipeline.

    Despite the name, the embeddings come from gensim's Doc2Vec. The class
    provides the fit and transform methods a pipeline step requires.
    Reference: https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html

    Input: vector_size: dimensionality of the learned document vectors (default 100)
           min_count: min_count value forwarded to Doc2Vec (default 0)
    """

    def __init__(self, vector_size=100, min_count=0):
        # Kept on the instance so fit() and the zero-vector fallback in
        # transform() always agree on the dimensionality.
        self.vector_size = vector_size
        self.min_count = min_count
        self.model = None

    def fit(self, X, y=None):
        """
        Tokenizes data and trains a Doc2Vec model on it.

        Input: X: iterable of raw text documents
               y: ignored; accepted only so the pipeline can pass labels through
        return: self
        """
        # One TaggedDocument per input text, tagged 'd0', 'd1', ... by position
        corpus = [
            TaggedDocument(list(test_tokenize(doc)), ['d{}'.format(idx)])
            for idx, doc in enumerate(X)
        ]
        # NOTE(review): `size` is the keyword accepted by the gensim version this
        # notebook was run with; gensim 4 is understood to name it `vector_size` -
        # confirm against the installed version before upgrading.
        self.model = Doc2Vec(corpus, size=self.vector_size, min_count=self.min_count)
        return self

    def transform(self, X):
        """
        Uses the model trained in fit to infer one vector per input document.

        A document whose inference raises an exception is replaced by an
        all-zero vector of length vector_size. The number of such documents is
        printed once at the end.

        Input: X: iterable of raw text documents
        return: list with one vector per document, in input order
        raises: RuntimeError if called before fit
        """
        if self.model is None:
            raise RuntimeError("gensim_word2vec must be fitted before calling transform")

        count_miss = 0
        out = []
        for doc in X:
            tokens = list(test_tokenize(doc))
            try:
                out.append(self.model.infer_vector(tokens))
            except Exception:
                # Best-effort fallback. The handler must not reference undefined
                # names, otherwise the fallback itself raises NameError.
                count_miss += 1
                out.append(np.zeros(self.vector_size))

        print(count_miss)

        return out

In [178]:
# Document vectors from the custom gensim step feeding one XGBClassifier per output label
pipeline = Pipeline(steps=[
    ('word2vec', gensim_word2vec()),
    ('clf', MultiOutputClassifier(estimator=XGBClassifier())),
])

# Search over the number of trees and the maximum tree depth
parameters = dict(
    clf__estimator__n_estimators=[100, 250],
    clf__estimator__max_depth=[None, 2, 4],
)

wv_cv = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1)
wv_cv.fit(X_train, y_train);

0


In [179]:
# Evaluate the document-vector model on the test split
preds = wv_cv.predict(X_test)
results_wv_cv = get_results(category_cols=category_cols, y_test=y_test, preds=preds)

0
Category : f1-score, precision, recall

related : 0.4608824725321503, 0.7858028604368527, 0.4268743616317694
request : 0.6978706560475011, 0.780802118006706, 0.6646345127603811
offer : 0.4988108358953536, 0.4976273148148148, 0.5
aid_related : 0.6615039226560255, 0.6771376510948488, 0.65898408890567
medical_help : 0.5099974269912964, 0.7848354773003787, 0.5152338449300454
medical_products : 0.5354598562787571, 0.7018498238545796, 0.5251234507124084
search_and_rescue : 0.4967187316500294, 0.585871453387377, 0.50180257928067
security : 0.49503214494447695, 0.49016203703703703, 0.5
military : 0.4949727019950884, 0.6084993052339046, 0.5015686970439185
child_alone : 1.0, 1.0, 1.0
water : 0.5318638475338431, 0.7158128225741143, 0.5248509611021611
food : 0.5706472353763159, 0.7148419713332709, 0.5545479135733171
shelter : 0.5459651701349313, 0.6955083032873142, 0.5363031306159084


  _warn_prf(average, modifier, msg_start, len(result))


clothing : 0.5188531192681067, 0.6430475086906141, 0.5117841791309776
money : 0.5313786548014091, 0.6891531322505801, 0.5198023409391083
missing_people : 0.49746990054091783, 0.49496527777777777, 0.5
refugees : 0.508130419202584, 0.7541272031943025, 0.5088735781361569
death : 0.6143890500967757, 0.7951774588467558, 0.5760233100233101
other_aid : 0.4915656016615112, 0.6296064238254877, 0.5109877576394343
infrastructure_related : 0.49852356949598364, 0.7161832946635731, 0.5078103536327784
transport : 0.5029704627227847, 0.7082237340728126, 0.5071704370120749
buildings : 0.5685035360024076, 0.7395227324955551, 0.5463227846768856
electricity : 0.5219085713382292, 0.6376073565225696, 0.514172737083502
tools : 0.4986072423398329, 0.4972797777520546, 0.49994181310368907
hospitals : 0.49714817832615527, 0.4943859242967936, 0.49994147255062626
shops : 0.4983161073046104, 0.4967585089141005, 0.499883504193849
aid_centers : 0.4966501602097291, 0.4933449074074074, 0.5
other_infrastructure : 0.5007

### 9. Export your model as a pickle file

In [186]:
# Persist each fitted grid search. Every file is opened in a `with` block so the
# handle is flushed and closed as soon as the dump finishes, instead of being
# left open for the rest of the kernel session.
# NOTE(review): these objects reference `tokenize` / `gensim_word2vec` defined in
# this notebook; presumably those definitions must be importable wherever the
# pickles are loaded - confirm in the consuming code.
for filename, fitted_search in (
    ('xgb_classifier.pkl', xgb_cv),
    ('wv_classifier.pkl', wv_cv),
    ('adb_classifier.pkl', adb_cv),
    ('rf_classifier.pkl', rf_cv),
):
    with open(filename, 'wb') as model_file:
        pkl.dump(fitted_search, model_file)