In [12]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [13]:
# Read the train and test splits from CSV files under ./preprocessing.
train = pd.read_csv('./preprocessing/train.csv')
test = pd.read_csv('./preprocessing/test.csv')

In [14]:
from gensim.models import Word2Vec, KeyedVectors
import nltk

In [15]:
# Concatenate train and test into one frame; its `text` column is tokenized
# below and used as the training corpus for the Word2Vec model.
df = pd.concat([train,test])

In [16]:
# Tokenize every entry of the combined `text` column with NLTK,
# giving one list of tokens per document.
corpus = df['text'].values
Corpus_list = list(map(nltk.word_tokenize, corpus))

In [17]:
# Train a 100-dimensional Word2Vec model on the tokenized train+test text.
# min_count=1 is passed so that no token is dropped for being rare.
# NOTE(review): no `seed` is passed, so the learned vectors may differ between
# runs -- confirm whether reproducibility matters for this experiment.
model = Word2Vec(Corpus_list,min_count=1,vector_size = 100)

In [18]:
# Sanity check: nearest neighbours of 'death' in the freshly trained embedding.
# NOTE(review): every neighbour in the recorded output scores ~0.9997, which
# suggests the vectors are barely differentiated -- the model may need more
# data or more training epochs before it is useful.
model.wv.most_similar('death')

[('us', 0.9997537732124329),
 ('life', 0.9997451305389404),
 ('love', 0.9997369050979614),
 ('move', 0.9997173547744751),
 ('know', 0.9997155070304871),
 ('good', 0.9997109770774841),
 ('leave', 0.9997085928916931),
 ('collide', 0.9997062087059021),
 ('w', 0.9997045993804932),
 ('say', 0.9997004270553589)]

In [19]:
# Relative path to the pretrained word-vector binary (GoogleNews, by its file name).
path = "../Word2Vec/GoogleNews-vectors-negative300.bin"

In [20]:
import gensim
# Load the pretrained vectors stored in binary word2vec format.
# This rebinds `model`: later cells use these KeyedVectors, not the Word2Vec
# model trained on the local corpus in the earlier cell.
model = gensim.models.KeyedVectors.load_word2vec_format(path,binary=True)

In [21]:
class MeanEmbeddingVectorizer(object):
    """Embed tokenized documents by averaging the vectors of their known words.

    :param word_model: word-vector store exposing ``vector_size``,
        ``get_vector(word)`` and a vocabulary (``key_to_index`` mapping,
        or ``index_to_key`` as a fallback).
    """

    def __init__(self, word_model):
        self.word_model = word_model
        self.vector_size = word_model.vector_size

    def fit(self, X=None, y=None):  # comply with scikit-learn transformer requirement
        """No-op fit; accepts (and ignores) ``X`` and ``y`` so that it can be
        called the way scikit-learn calls ``fit``.

        :return: self
        """
        return self

    def transform(self, docs):  # comply with scikit-learn transformer requirement
        """Embed tokenized documents.

        :param docs: iterable of documents, each a list of tokens
        :return: 2-D array with one averaged word vector per document
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """
        Compute the average word vector for a single doc/sentence.

        Tokens that are missing from the word model's vocabulary are skipped.

        :param sent: list of sentence tokens
        :return:
            1-D array of length ``vector_size``; all zeros when no token of
            ``sent`` is in the vocabulary (including an empty ``sent``)
        """
        # Membership is tested against the `key_to_index` mapping (hash lookup).
        # Testing against the `index_to_key` list scans the whole vocabulary
        # for every token; it is kept only as a fallback for models that do
        # not expose the mapping.
        known = getattr(self.word_model, "key_to_index", None)
        if known is None:
            known = self.word_model.index_to_key

        vectors = [self.word_model.get_vector(word) for word in sent if word in known]

        if not vectors:
            # Nothing to average: represent the document by a zero vector.
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute average word vectors for multiple docs, where docs had been tokenized.

        :param docs: list of sentences, each a list of separated tokens
        :return:
            array of average word vectors in shape (len(docs), vector_size)
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            # np.vstack raises on an empty sequence; return an empty matrix instead.
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

In [22]:
from sklearn import feature_extraction
from collections import defaultdict

class TfidfEmbeddingVectorizer(object):
    """Embed tokenized documents as the mean of their IDF-weighted word vectors.

    :param word2vec: word -> vector lookup supporting ``word in word2vec`` and
        ``word2vec[word]``; either a plain mapping or an object exposing
        ``vector_size``.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = self._infer_dim(word2vec)

    @staticmethod
    def _infer_dim(word2vec):
        """Return the embedding dimension of ``word2vec``."""
        # Objects that advertise `vector_size` are trusted directly.
        size = getattr(word2vec, "vector_size", None)
        if size is not None:
            return size
        # Plain mapping: measure the first stored vector. The previous
        # `itervalues().next()` spelling exists only on Python 2 dicts.
        first = next(iter(word2vec.values()), None)
        if first is None:
            raise ValueError("word2vec is empty; cannot infer the embedding dimension")
        return len(first)

    def fit(self, X, y=None):
        """Learn one IDF weight per token from the tokenized documents ``X``.

        :param X: iterable of documents, each an iterable of tokens
        :param y: ignored; optional so the method can be called with or without labels
        :return: self
        """
        # The identity analyzer makes TfidfVectorizer take each document's
        # tokens as they are instead of tokenizing raw strings.
        tfidf = feature_extraction.text.TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one weighted-mean vector per document in ``X``.

        Tokens without a vector in ``word2vec`` are skipped; a document with
        no known token is represented by a zero vector of length ``dim``.

        :param X: iterable of documents, each an iterable of tokens
        :return: array with one row per document
        """
        rows = []
        for words in X:
            weighted = [self.word2vec[w] * self.word2weight[w]
                        for w in words if w in self.word2vec]
            rows.append(np.mean(weighted or [np.zeros(self.dim)], axis=0))
        return np.array(rows)

In [23]:
# Build the averaging vectorizer on the word vectors currently bound to `model`
# and embed the combined train+test corpus (one row per document).
mean_vec_tr = MeanEmbeddingVectorizer(model)
doc_vec = mean_vec_tr.transform(Corpus_list)

In [24]:
# Show the shape of the combined document-embedding matrix.
print('Shape of word-mean doc2vec...')
display(doc_vec.shape)

Shape of word-mean doc2vec...


(10695, 300)

In [25]:
# Raw text of the training split only, as an array.
Corpus_train = train['text'].values

In [26]:
# Tokenize the training texts, then average their word vectors per document.
train_corpus = list(map(nltk.word_tokenize, Corpus_train))
doc_vec_1 = mean_vec_tr.transform(train_corpus)

In [27]:
# Same steps for the test split: tokenize, embed, then report the matrix shape.
Corpus_test = test['text'].values
test_corpus = list(map(nltk.word_tokenize, Corpus_test))
doc_vec_2 = mean_vec_tr.transform(test_corpus)
print('Shape of word-mean doc2vec...')
display(doc_vec_2.shape)

Shape of word-mean doc2vec...


(3263, 300)

In [28]:
# Features are the averaged embeddings of the training texts; labels come from
# the `target` column of the training frame.
X = doc_vec_1
y = train['target']

In [29]:
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline

In [30]:
from sklearn.model_selection import StratifiedKFold

In [34]:
# Gradient-boosted classifier scored with 5-fold cross-validated F1.
# `n_jobs` and `random_state` are the scikit-learn-style parameter names of the
# XGBoost wrapper; they replace the legacy `nthread` and `seed` aliases.
clf = XGBClassifier(
    colsample_bytree=0.7,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=11,
    missing=-999,
    n_estimators=1000,
    n_jobs=4,
    objective='binary:logistic',
    random_state=1337,
    subsample=0.8,
)
scores = model_selection.cross_val_score(clf, X, y, cv=5, scoring="f1")
scores

array([0.69362084, 0.68074324, 0.7296875 , 0.70785525, 0.76837061])

In [35]:
# Refit on the full training set, then predict labels for the test embeddings.
X_vec_test = doc_vec_2
y_pred = clf.fit(X, y).predict(X_vec_test)

In [36]:
from sklearn.metrics import accuracy_score

# Score the test predictions against the labels stored in ./dataset/ans.csv.
# NOTE(review): the cross-validation cell is scored with F1 while this cell
# reports accuracy -- confirm the two numbers are not compared directly.
ans = pd.read_csv('./dataset/ans.csv')['target'].values
accuracy_score(y_pred= y_pred, y_true= ans)

0.7977321483297579