In [None]:
# import zipfile
# # download GloVe word vector representations
# # bunch of small embeddings - trained on 6B tokens - 822 MB download, 2GB unzipped
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# zipped = zipfile.ZipFile('glove.6B.zip')
# zipped.extractall()

# # and a single behemoth - trained on 840B tokens - 2GB compressed, 5GB unzipped
# !wget http://nlp.stanford.edu/data/glove.840B.300d.zip
# zipped = zipfile.ZipFile('glove.840B.300d.zip')
# zipped.extractall()

In [41]:
from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import os
import struct
from gensim.models.word2vec import Word2Vec
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedShuffleSplit

TRAIN_SET_PATH = "../../cleaned_data/cleaned_text.pkl"

GLOVE_6B_50D_PATH = "glove.6B.50d.txt"
GLOVE_840B_300D_PATH = "glove.840B.300d.txt"
encoding="utf-8"

Loading the dataset into np arrays

In [2]:
X, y = [], []

train_path = os.path.join(TRAIN_SET_PATH)
df = pd.read_pickle(train_path)

all_x = df['text']
for x in all_x:
    X.append(x.split())
    
y = df['class'].values

X, y = np.array(X), np.array(y)
print ("total examples %s" % len(y))
print(X.shape)

total examples 5831
(5831,)


In [3]:
with open(GLOVE_6B_50D_PATH, "rb") as lines:
    wvec = {line.split()[0].decode(encoding): np.array(line.split()[1:],dtype=np.float32)
               for line in lines}

In [4]:
glove_small = {}
all_words = set(w for words in X for w in words)
with open(GLOVE_6B_50D_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if (word in all_words):
            nums=np.array(parts[1:], dtype=np.float32)
            glove_small[word] = nums

            
glove_big = {}
with open(GLOVE_840B_300D_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if word in all_words:
            nums=np.array(parts[1:], dtype=np.float32)
            glove_big[word] = nums

In [5]:
# train word2vec on all the texts - both training and test set
# we're not using test labels, just texts so this is fine
model = Word2Vec(X, size=100, window=5, min_count=5, workers=2)
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.vectors)}

<class 'numpy.ndarray'>


In [6]:
print(len(all_words))

48512


In [7]:
# start with the classics - naive bayes of the multinomial and bernoulli varieties
# with either pure counts or tfidf features
mult_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])
bern_nb = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("bernoulli nb", BernoulliNB())])
mult_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("multinomial nb", MultinomialNB())])
bern_nb_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("bernoulli nb", BernoulliNB())])
# SVM - which is supposed to be more or less state of the art 
# http://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf
svc = Pipeline([("count_vectorizer", CountVectorizer(analyzer=lambda x: x)), ("linear svc", SVC(kernel="linear"))])
svc_tfidf = Pipeline([("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)), ("linear svc", SVC(kernel="linear"))])

In [8]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_small))])
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_small))])
        else:
            self.dim=0
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [9]:
# Extra Trees classifier is almost universally great, let's stack it with our embeddings
etree_glove_small = Pipeline([("glove vectorizer", MeanEmbeddingVectorizer(glove_small)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_glove_small_tfidf = Pipeline([("glove vectorizer", TfidfEmbeddingVectorizer(glove_small)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_glove_big = Pipeline([("glove vectorizer", MeanEmbeddingVectorizer(glove_big)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_glove_big_tfidf = Pipeline([("glove vectorizer", TfidfEmbeddingVectorizer(glove_big)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])

etree_w2v = Pipeline([("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_w2v_tfidf = Pipeline([("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])

In [None]:
all_models = [
    ("mult_nb", mult_nb),
    ("mult_nb_tfidf", mult_nb_tfidf),
    ("bern_nb", bern_nb),
    ("bern_nb_tfidf", bern_nb_tfidf),
    ("svm", svc),
    ("svm_tfidf", svc_tfidf),
    ("w2v", etree_w2v),
    ("w2v_tfidf", etree_w2v_tfidf),
    ("glove_small", etree_glove_small),
    ("glove_small_tfidf", etree_glove_small_tfidf),
    ("glove_big", etree_glove_big),
    ("glove_big_tfidf", etree_glove_big_tfidf),

]
 
scoring = ['precision_macro', 'recall_macro', 'accuracy']
unsorted_scores = [(name, cross_validate(model, X, y, cv=5, scoring=scoring, return_train_score=True, return_estimator=True)) \
                   for name, model in all_models]
# scores = sorted(unsorted_scores, key=lambda x: -x[1])

In [71]:

# sort table into various predictors
table = []
for classifier in unsorted_scores:
    name = classifier[0]
    classi = [name]
    scores = classifier[1]
    for score in scores:
        if score != 'estimator': 
            classi.append(scores[score].mean())
    table.append(classi)

# add f1 score using the formula 2*((test_prec*test_rec)/(test_prec+test_rec))
for classifier in table:
    name = classifier[0]
    test_prec = classifier[3]
    test_rec = classifier[5]
    test_f1 = 2*((test_prec*test_rec)/(test_prec+test_rec))
    
    classifier.append(test_f1)
    
    train_prec = classifier[4]
    train_rec = classifier[6]
    test_f1 = 2*((train_prec*train_rec)/(train_prec+train_rec))
    classifier.append(test_f1)
    
table = sorted(table, key=lambda x: -x[7])
print (tabulate(table, floatfmt=".4f", headers=("model",'fit_time', 'score_time', 'test_prec', 
                                                 'train_prec', 'test_rec', 'train_rec',
                                                 'test_acc', 'train_acc', 'test_f1', 'train_f1')))

AttributeError: 'tuple' object has no attribute 'mean'

## ROC Curve

In [134]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from scipy import interp
from itertools import cycle

classes = ['human_damage',
            'fires',
             'damaged_nature',
             'flood',
             'non_damage',
             'damaged_infrastructure']
indices = np.random.randint(0, 5831, 100)
X_test = np.array(X[indices])
y_test = np.array(y[indices])

preds = []
for classifier in unsorted_scores:
    name = classifier[0]
    estimator = classifier[1]['estimator'][0]
    y_pred = estimator.predict(X_test)
    y_pred = label_binarize(y_pred, classes=classes)
    y_test = label_binarize(y_test, classes=classes)
    preds.append((name, y_pred, y_test))
                 
num_classes = y_test.shape[1]

for j in range(len(preds)):

    name, y_score, y_test = preds[j]
    
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])    

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(num_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= num_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'magenta', 'yellow'])
    for i, color in zip(range(num_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(name)
    plt.legend(loc=9, bbox_to_anchor=(1.4, 0.8), ncol=1)
    #plt.legend(loc="lower right")

    train_path = os.path.join('./roc_out', name + '.jpg')
    plt.savefig(train_path, bbox_inches='tight')
    plt.close()