In [None]:
%reload_ext autoreload
%autoreload 2

import os
from collections import defaultdict

import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec

from rnn_classifier import model

In [None]:
# Path to the pickled DataFrame read by the dataset-loading cell. It is a
# relative path, so it resolves against the kernel's current working directory.
TRAIN_SET_PATH = "../../cleaned_data/cleaned_text.pkl"

In [None]:
# Hyperparameter settings, keyed by name (presumably consumed by the
# rnn_classifier model -- TODO confirm against its constructor).
# The keys must be string literals: bare identifiers would be evaluated as
# undefined variables and raise NameError when the cell runs.
hyperparams = {
    'rnn_type': 'LSTM',  # 'LSTM' or 'GRU'
    'embedding_size': 300,
    'num_hidden_units': 500,
    'num_layers': 2,
    'init_lr': 1e-3,
    'grad_clipping': 5,
    'num_epochs': 10,
    'batch_size': 32,
    'dropout_rate': 0,
    'is_bidirectional': True,
}

Loading the dataset into NumPy arrays: `X` holds the whitespace-tokenized texts and `y` holds the class labels.

In [None]:
# Read the pickled DataFrame that holds the texts and their labels.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
train_path = TRAIN_SET_PATH
df = pd.read_pickle(train_path)

# Whitespace-tokenize every document in the 'text' column.
all_x = df['text']
tokenized_texts = [text.split() for text in all_x]

# Store the token lists in a 1-D object array (one list per element).
# Documents may differ in length, and np.array(list_of_lists) raises on ragged
# input with recent NumPy versions (or builds a 2-D string array when every
# document happens to have the same length). Filling element by element
# guarantees shape (n_documents,) in every case.
X = np.empty(len(tokenized_texts), dtype=object)
for i, tokens in enumerate(tokenized_texts):
    X[i] = tokens

# Labels come from the 'class' column, copied into their own array.
y = np.array(df['class'].values)

print("total examples %s" % len(y))
print(X.shape)

In [None]:
# Load every vector of the GloVe file into memory as {word: float32 vector}.
# Each line is read as bytes: the first whitespace-separated field is the word
# (decoded with `encoding`), the remaining fields are the vector components.
with open(GLOVE_6B_50D_PATH, "rb") as glove_file:
    wvec = {
        fields[0].decode(encoding): np.array(fields[1:], dtype=np.float32)
        for fields in map(bytes.split, glove_file)
    }

In [None]:
def _load_glove_subset(path, vocab, encoding):
    """Read a GloVe text file and keep only the vectors of words in `vocab`.

    Args:
        path: location of the GloVe file; it is opened in binary mode.
        vocab: container of words to keep (membership is tested with `in`).
        encoding: codec used to decode the word at the start of each line.

    Returns:
        dict mapping each kept word to a float32 NumPy vector.
    """
    vectors = {}
    with open(path, "rb") as infile:
        for line in infile:
            parts = line.split()
            if not parts:
                # Blank line: nothing to parse (parts[0] would raise IndexError).
                continue
            word = parts[0].decode(encoding)
            if word in vocab:
                # NOTE(review): assumes every field after the first is numeric;
                # a token containing whitespace would break this -- confirm
                # against the GloVe files actually used.
                vectors[word] = np.array(parts[1:], dtype=np.float32)
    return vectors


# Every distinct token that occurs in the dataset.
all_words = set(w for words in X for w in words)

# Restrict both GloVe files to the dataset vocabulary to keep memory usage low.
glove_small = _load_glove_subset(GLOVE_6B_50D_PATH, all_words, encoding)
glove_big = _load_glove_subset(GLOVE_840B_300D_PATH, all_words, encoding)

In [None]:
# Fit word2vec on all the texts (training and test set alike). Only the raw
# texts are used, never the test labels, so this does not leak label information.
# NOTE(review): this rebinds `model`, hiding the object imported from
# rnn_classifier at the top of the notebook.
# NOTE(review): `size` and `index2word` look like the gensim < 4.0 names --
# confirm the installed gensim version.
model = Word2Vec(X, size=100, window=5, min_count=5, workers=2)

# Map every vocabulary word to its learned vector.
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

In [None]:
# Report the vocabulary size: the number of distinct tokens collected in `all_words`.
print(len(all_words))

In [None]:
def _count_step():
    """Pipeline step producing raw token counts.

    The analyzer is the identity function because each document is passed in
    as an already-tokenized sequence.
    """
    return ("count_vectorizer", CountVectorizer(analyzer=lambda doc: doc))


def _tfidf_step():
    """Pipeline step producing tf-idf features from already-tokenized documents."""
    return ("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda doc: doc))


# Classic baselines: multinomial and Bernoulli naive Bayes, each on either
# pure counts or tf-idf features.
mult_nb = Pipeline([_count_step(), ("multinomial nb", MultinomialNB())])
bern_nb = Pipeline([_count_step(), ("bernoulli nb", BernoulliNB())])
mult_nb_tfidf = Pipeline([_tfidf_step(), ("multinomial nb", MultinomialNB())])
bern_nb_tfidf = Pipeline([_tfidf_step(), ("bernoulli nb", BernoulliNB())])

# Linear SVM, which is supposed to be more or less state of the art for text:
# http://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf
svc = Pipeline([_count_step(), ("linear svc", SVC(kernel="linear"))])
svc_tfidf = Pipeline([_tfidf_step(), ("linear svc", SVC(kernel="linear"))])

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenized document by the mean of its word vectors.

    Args:
        word2vec: mapping from word to a fixed-length numeric vector.

    Attributes:
        word2vec: the mapping passed to the constructor.
        dim: length of the vectors in `word2vec`, or 0 when the mapping is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec) > 0:
            # Infer the dimensionality from the mapping that was passed in.
            # Looking the key up in an unrelated global dictionary breaks as
            # soon as a different embedding (e.g. word2vec) is used.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """No-op: there is nothing to learn. Returns self for pipeline chaining."""
        return self

    def transform(self, X):
        """Return an array with one mean vector per document in `X`.

        Words missing from `word2vec` are ignored. A document with no known
        word maps to a zero vector of length `dim`.
        """
        return np.array([
            # `or` swaps in a single zero vector when no word of the document
            # is known, so np.mean never sees an empty list.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    """Represent each tokenized document by the mean of its IDF-weighted word vectors.

    Args:
        word2vec: mapping from word to a fixed-length numeric vector.

    Attributes:
        word2vec: the mapping passed to the constructor.
        word2weight: word -> IDF weight, populated by `fit` (None before that).
        dim: length of the vectors in `word2vec`, or 0 when the mapping is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec) > 0:
            # Infer the dimensionality from the mapping that was passed in.
            # Looking the key up in an unrelated global dictionary breaks as
            # soon as a different embedding (e.g. word2vec) is used.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Learn one IDF weight per word from the tokenized documents in `X`.

        Returns self so the object can be chained in a pipeline.
        """
        # Documents are already token sequences, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one weighted mean vector per document in `X`.

        Each known word contributes its vector scaled by its weight. Words
        missing from `word2vec` are ignored, and a document with no known word
        maps to a zero vector of length `dim`. `fit` must be called first so
        that `word2weight` is populated.
        """
        return np.array([
            # `or` swaps in a single zero vector when no word of the document
            # is known, so np.mean never sees an empty list.
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [None]:
def _extra_trees_pipeline(step_name, vectorizer):
    """Chain an embedding vectorizer with a 200-estimator ExtraTreesClassifier."""
    return Pipeline([
        (step_name, vectorizer),
        ("extra trees", ExtraTreesClassifier(n_estimators=200)),
    ])


# Extra Trees classifier is almost universally great, so stack it on top of
# every embedding, with both plain-mean and idf-weighted document vectors.
etree_glove_small = _extra_trees_pipeline(
    "glove vectorizer", MeanEmbeddingVectorizer(glove_small))
etree_glove_small_tfidf = _extra_trees_pipeline(
    "glove vectorizer", TfidfEmbeddingVectorizer(glove_small))
etree_glove_big = _extra_trees_pipeline(
    "glove vectorizer", MeanEmbeddingVectorizer(glove_big))
etree_glove_big_tfidf = _extra_trees_pipeline(
    "glove vectorizer", TfidfEmbeddingVectorizer(glove_big))

etree_w2v = _extra_trees_pipeline(
    "word2vec vectorizer", MeanEmbeddingVectorizer(w2v))
etree_w2v_tfidf = _extra_trees_pipeline(
    "word2vec vectorizer", TfidfEmbeddingVectorizer(w2v))

In [None]:
# Every candidate pipeline, paired with the label used in the results table.
all_models = [
    ("mult_nb", mult_nb),
    ("mult_nb_tfidf", mult_nb_tfidf),
    ("bern_nb", bern_nb),
    ("bern_nb_tfidf", bern_nb_tfidf),
    ("svm", svc),
    ("svm_tfidf", svc_tfidf),
    ("w2v", etree_w2v),
    ("w2v_tfidf", etree_w2v_tfidf),
    ("glove_small", etree_glove_small),
    ("glove_small_tfidf", etree_glove_small_tfidf),
    ("glove_big", etree_glove_big),
    ("glove_big_tfidf", etree_glove_big_tfidf),
]

# Metrics collected for every cross-validation fold.
scoring = ['precision_macro', 'recall_macro', 'accuracy']

# 5-fold cross-validation of every pipeline; train scores and the fitted
# estimators are kept alongside the test scores.
cv_options = dict(cv=5, scoring=scoring, return_train_score=True, return_estimator=True)
unsorted_scores = [
    (label, cross_validate(estimator, X, y, **cv_options))
    for label, estimator in all_models
]
# The results are left unsorted here; the reporting cell orders the table.

In [None]:
# Cross-validation metrics to report, in the same order as the printed headers.
# Looking the metrics up by key (instead of relying on the iteration order of
# the dictionary returned by cross_validate) keeps every value under the right
# column header.
metric_keys = [
    'fit_time', 'score_time',
    'test_precision_macro', 'train_precision_macro',
    'test_recall_macro', 'train_recall_macro',
    'test_accuracy', 'train_accuracy',
]


def _f1(precision, recall):
    """Return 2*((precision*recall)/(precision+recall)), or 0.0 when both are zero."""
    total = precision + recall
    if not total:
        # Avoid a division by zero for a degenerate classifier.
        return 0.0
    return 2 * ((precision * recall) / total)


# One row per classifier: name, the fold-averaged metrics, then test/train F1.
# NOTE: F1 is derived from the averaged macro precision and recall, so it is an
# approximation of (not identical to) a per-fold macro F1.
table = []
for name, scores in unsorted_scores:
    means = {key: scores[key].mean() for key in metric_keys}
    row = [name] + [means[key] for key in metric_keys]
    row.append(_f1(means['test_precision_macro'], means['test_recall_macro']))
    row.append(_f1(means['train_precision_macro'], means['train_recall_macro']))
    table.append(row)

# Rank by mean test accuracy (column index 7), best first.
table = sorted(table, key=lambda row: -row[7])
print(tabulate(table, floatfmt=".4f", headers=("model", 'fit_time', 'score_time', 'test_prec',
                                                'train_prec', 'test_rec', 'train_rec',
                                                'test_acc', 'train_acc', 'test_f1', 'train_f1')))

In [None]:
import os
# Current working directory of the kernel. NOTE(review): `cwd` is not read by
# any of the cells visible here -- confirm whether it is still needed.
cwd = os.getcwd()

In [None]:
import pandas as pd

In [9]:
# Location of the pickled DataFrame loaded in the next cell.
# NOTE(review): absolute, machine-specific path -- it will not resolve on another
# machine; consider a path relative to a configurable data directory.
newfile = '/Users/swetharevanur/Documents/3_Junior/1_Fall/cs229/cs229-final-project/intermediates/cleaned_text_total.pkl'

In [10]:
# Load the pickled DataFrame stored at `newfile`. This rebinds `df`, replacing
# the frame that was loaded from TRAIN_SET_PATH earlier in the notebook.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(newfile)

# Alternative loader using the raw pickle module (disabled):
# import _pickle as cPickle
# with open(newfile, 'rb') as fo:
#     dict_test = cPickle.load(fo, encoding='latin1')

In [11]:
# Print the loaded frame as plain text for a quick visual check.
# NOTE(review): df.head() would keep the stored output short.
print(df)

                                      file_name  \
0            isiscrimes_2015-08-23_21-07-05.txt   
1        syriawarcrimes_2017-06-19_06-24-25.txt   
2            isiscrimes_2015-08-27_00-07-30.txt   
3            isiscrimes_2015-08-11_01-05-40.txt   
4           yemencrisis_2015-11-27_01-40-28.txt   
5            isiscrimes_2015-08-03_11-18-40.txt   
6            isiscrimes_2015-04-26_00-37-04.txt   
7          terrorattack_2017-10-02_13-13-23.txt   
8            isiscrimes_2015-08-01_10-54-18.txt   
9          victimsofwar_2016-12-17_02-12-27.txt   
10           isiscrimes_2015-08-01_10-57-48.txt   
11           isiscrimes_2015-08-14_23-22-03.txt   
12           isiscrimes_2015-08-16_22-00-33.txt   
13           isiscrimes_2015-08-13_05-42-23.txt   
14         victimsofwar_2017-03-07_19-29-35.txt   
15          yemencrisis_2015-07-20_17-18-00.txt   
16           isiscrimes_2015-12-09_21-02-34.txt   
17           isiscrimes_2015-08-05_19-58-42.txt   
18           isiscrimes_2015-12