In [22]:
# import
from __future__ import print_function, division
from builtins import range

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors

In [2]:
# load data
train = pd.read_csv("data/r8-train-all-terms.txt", header=None, sep="\t")
test = pd.read_csv("data/r8-test-all-terms.txt", header=None, sep="\t")
train.columns = ["label", "content"]
test.columns = ["label", "content"]
print(train.shape)
print(test.shape)

(5485, 2)
(2189, 2)


In [3]:
train.head()

Unnamed: 0,label,content
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [17]:
train.label.unique()

array(['earn', 'acq', 'trade', 'ship', 'grain', 'crude', 'interest',
       'money-fx'], dtype=object)

In [4]:
# glove vectorizer
class GloveVectorizer:
    
    def __init__(self):
        print("Loading word vectors")
        
        word2vec = {}
        embedding = []
        idx2word = []
        
        with open("embeddings/glove.6B.50d.txt") as f:
            for line in f:
                values = line.split()
                word = values[0]
                vec = np.asarray(values[1:], dtype=float)
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)
            print("Loaded %s words" % len(word2vec))
        
        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {v:k for k,v in enumerate(idx2word)}
        self.V, self.D = self.embedding.shape
        
        
    def fit(self, data):
        pass
    
    
    def transform(self, data):
        # convert sentence to average of word2vec vectors
        
        X = np.zeros((len(data), self.D))
        n = 0
        emptycount = 0
        
        for sentence in data:
            tokens = sentence.lower().split()
            vecs = []
            for word in tokens:
                if word in self.word2vec:
                    vec = self.word2vec[word]
                    vecs.append(vec)
            if len(vecs) > 0:
                vecs = np.array(vecs)
                X[n] = vecs.mean(axis=0)
            else:
                emptycount += 1
            n += 1
        print("Number of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X
    
    
    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

In [8]:
# w2v vectorizer
class Word2VecVectorizer:
    
    def __init__(self):
        print("Loading word vectors")
        
        self.word_vectors = KeyedVectors.load_word2vec_format("embeddings/GoogleNews-vectors-negative300.bin", 
                                                              binary=True)
        print("Finished loading word vectors")
        
        
    def fit(self, data):
        pass
    
    
    def transform(self, data):
        # convert sentence to average of word2vec vectors
        
        v = self.word_vectors.get_vector("king")
        self.D = v.shape[0]
        
        X = np.zeros((len(data), self.D))
        n = 0
        emptycount = 0
        
        for sentence in data:
            tokens = sentence.split() # no lower since w2v contains capital chars
            vecs = []
            m = 0
            for word in tokens:
                try:
                    vec = self.word_vectors.get_vector(word) # throws KeyError if not found
                    vecs.append(vec)
                    m += 1
                except KeyError:
                    pass
            if len(vecs) > 0:
                vecs = np.array(vecs)
                X[m] = vecs.mean(axis=0)
            else:
                emptycount += 1
            n += 1
        print("Number of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X
    
    
    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

In [10]:
# initialize vectorizer
vectorizer = Word2VecVectorizer()

Loading word vectors
Finished loading word vectors


In [11]:
# fit transform train data
Xtrain = vectorizer.fit_transform(train.content)
Ytrain = train.label

Number of samples with no words found: 0 / 5485


In [12]:
# transform test data
Xtest = vectorizer.transform(test.content)
Ytest = test.label

Number of samples with no words found: 0 / 2189


In [20]:
# model fitting
model = ExtraTreesClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(Xtrain, Ytrain)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [21]:
# evaluation
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

train score: 0.5544211485870556
test score: 0.4915486523526725
