# Visualizing Word Vectors with t-SNE

t-SNE is very useful for visualizing similarity between objects. It takes a group of high-dimensional vocabulary word feature vectors (100 dimensions via Word2Vec) and compresses them down to 2-dimensional (x, y) coordinate pairs. The idea is to keep similar words close together on the plane while maximizing the distance between dissimilar words.

### Steps

1. Load cleaned data
2. Build a corpus
3. Train a Word2Vec Model
4. Train XGBoost and logistic regression models

Credit: Some of the code was inspired by this awesome [NLP repo][1]. 




  [1]: https://github.com/rouseguy/DeepLearningNLP_Py

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
#import nltk
from nltk import ngrams

from sklearn import metrics
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

from gensim.models import word2vec

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (16,6)

import multiprocessing as mp

%load_ext autotime

In [2]:
%%time
# Load the pre-cleaned train/test question pairs.
# NOTE(review): the `_wo_sw` suffix presumably means "without stop words" — confirm.
#data = pd.read_csv('data/train.csv').sample(50000, random_state=23)
train = pd.read_csv('../data/train_wo_sw.csv')
# The first CSV column is dropped right after loading (presumably a saved index — verify).
train = train.drop(train.columns[0], axis=1)

test = pd.read_csv('../data/test_wo_sw.csv')
test = test.drop(test.columns[0], axis=1)

CPU times: user 5.75 s, sys: 628 ms, total: 6.38 s
Wall time: 9.99 s
time: 9.99 s


In [3]:
# Replace missing questions with empty strings so that later `.split()` calls
# work on every row. Assigning the whole column back (instead of the chained
# `data[col][mask] = ''`) updates the frame itself rather than a possible copy.
for data in [train, test]:
    for col in ['question1', 'question2']:
        data[col] = data[col].fillna('')
del data

time: 569 ms


In [4]:
assert 2345796 == test.shape[0]
assert 404290 == train.shape[0]

time: 1.91 ms


In [5]:
print(train.shape, test.shape)
# NOTE: only the last expression of a cell is rendered, so this head(3) result is not displayed.
train.head(3)
train.tail(3)

(404290, 6) (2345796, 3)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
404287,404287,537928,537929,one coin,whats coin,0
404288,404288,537930,537931,approx annual cost living studying uic chicago...,little hairfall problem use hair styling produ...,0
404289,404289,537932,537933,like sex cousin,like sex cousin,0


time: 19.8 ms


In [6]:
test.head(3)

Unnamed: 0,test_id,question1,question2
0,0,surface pro 4 compare ipad pro,microsoft choose core m3 core i3 home surface ...
1,1,hair transplant age 24 much would cost,much cost hair transplant require
2,2,best way send money china us,send money china


time: 9 ms


In [None]:
#%%time
def build_corpus(data):
    """Create a list of token lists, one per question.

    Args:
        data: DataFrame with string columns 'question1' and 'question2'.

    Returns:
        list of lists of str: the whitespace-split tokens of every
        'question1' entry followed by those of every 'question2' entry.

    Raises:
        Whatever `.split()` raises for a non-string entry; the offending
        column name and (index, value) pair are printed before re-raising.
    """
    corpus = []
    for col in ['question1', 'question2']:
        # Series.items() replaces iteritems(), which newer pandas no longer provides.
        for sentence in data[col].items():
            try:
                word_list = sentence[1].split()
            except Exception:
                # Show which entry broke tokenisation, then propagate the error.
                print(col, sentence)
                raise
            corpus.append(word_list)

    return corpus

corpus = build_corpus(pd.concat([train, test]))       
corpus[0:2]

In [None]:
assert len(corpus) == (train.shape[0] + test.shape[0])*2

# Word 2 Vec

The Word2Vec model produces a vocabulary in which each word is represented by an n-dimensional NumPy array (100 values in this example).

In [None]:
# Train 100-dimensional word vectors on the corpus; min_count=1 keeps every word.
# NOTE(review): `size` looks like the older gensim keyword (newer releases use
# `vector_size`) — confirm against the installed gensim version.
model_w2v = word2vec.Word2Vec(corpus, size=100, window=20, min_count=1, workers=4)
# The corpus is not used after training; drop the name so the memory can be reclaimed.
del corpus

In [7]:
#model_w2v.save("../data/word2vec.model")
model_w2v = word2vec.Word2Vec.load("../data/word2vec.model")

time: 3.69 s


In [8]:
model_w2v.corpus_count

5500172

time: 2.64 ms


In [9]:
def auc_plot(y_true, y_pred):
    """Plot the ROC curve of binary predictions and return log loss and AUC.

    Args:
        y_true: array-like of true binary labels.
        y_pred: array-like of predicted scores/probabilities for the positive class.

    Returns:
        tuple: (log loss, area under the ROC curve).
    """
    loss = metrics.log_loss(y_true, y_pred)
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    # Label the figure so it can be read on its own.
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.show()
    return loss, roc_auc

time: 5.21 ms


In [10]:
%%time
def calc_cosine_similir(q1, q2):
    """Cosine similarity between the summed word vectors of two questions.

    Args:
        q1: list of tokens of the first question.
        q2: list of tokens of the second question.

    Returns:
        0 if either token list is empty, otherwise the cosine similarity of
        the two summed `model_w2v` word vectors.

    Raises:
        RuntimeError: if the computed value does not have float32 type.
    """
    if len(q1) == 0 or len(q2) == 0:
        return 0
    v1 = np.sum([model_w2v.wv[w] for w in q1], axis=0)
    v2 = np.sum([model_w2v.wv[w] for w in q2], axis=0)
    res = np.dot(v1, v2) / (np.sqrt(np.dot(v1, v1)) * np.sqrt(np.dot(v2, v2)))
    if type(res) != np.dtype('float32'):
        # Dump the inputs for debugging, then fail with an explicit error:
        # a bare `raise` has no active exception to re-raise at this point.
        print(type(res))
        print(q1, q2, v1, v2)
        raise RuntimeError('unexpected cosine similarity type: %r' % type(res))
    return res


def calc_counters_from_list(l):
    """Summarise a list of numbers as ten statistics.

    Returns sum, min, max, median and average of `l`, followed by the same
    five values each divided by the number of elements. An empty input
    yields ten zeros.
    """
    count = len(l)
    if not count:
        return [0.0 for _ in range(10)]
    reducers = (np.sum, np.min, np.max, np.median, np.average)
    stats = [reduce_fn(l) for reduce_fn in reducers]
    return stats + [value / count for value in stats]
    
    
def calc_w2v_similarity(row):
    """Word2Vec similarity features for one question pair.

    Args:
        row: mapping/Series with string entries 'question1' and 'question2'.

    Returns:
        list: the cosine similarity of the two questions (from
        calc_cosine_similir) followed by the statistics that
        calc_counters_from_list computes over the pairwise similarities
        between words that occur in only one of the two questions.
    """
    q1 = row['question1'].split()
    q2 = row['question2'].split()
    cosine_similir = calc_cosine_similir(q1, q2)
    # Words shared by both questions are excluded from the pairwise comparison.
    q1_uniq = set(q1) - set(q2)
    q2_uniq = set(q2) - set(q1)
    # Go through `.wv`, like the vector lookups in calc_cosine_similir,
    # instead of the model-level `similarity` shortcut.
    words_simil = [model_w2v.wv.similarity(w1, w2)
                   for w1 in q1_uniq
                   for w2 in q2_uniq]
    # The statistics are computed once (the original computed them twice and
    # discarded the first result).
    return [cosine_similir] + calc_counters_from_list(words_simil)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10 µs
time: 47.4 ms


In [11]:
def split_df_to_pools(f, data, processes=4, n_chunks=8):
    """Apply `f` to chunks of `data` in parallel worker processes.

    Args:
        f: callable applied to each chunk.
        data: object accepted by np.array_split (e.g. a DataFrame).
        processes: number of worker processes (default 4).
        n_chunks: number of chunks `data` is split into (default 8).

    Returns:
        list: the result of `f` for each chunk, in chunk order.

    Raises:
        TypeError: if `f` is not callable.
    """
    if not callable(f):
        raise TypeError('f must be callable, got %r' % type(f))
    split = np.array_split(data, n_chunks)
    # Create the pool outside the try block so that `finally` never touches
    # an unbound name when pool creation itself fails.
    pool = mp.Pool(processes=processes)
    try:
        return pool.map(f, split)
    finally:
        # Release the workers whether or not map() succeeded.
        pool.close()
        pool.join()

time: 3.65 ms


In [12]:
def f_woker(x):
    """Compute the Word2Vec similarity features for every row of one chunk.

    Applies calc_w2v_similarity row-wise and stacks the per-row results
    with np.vstack.
    """
    per_row_features = x.apply(calc_w2v_similarity, axis=1)
    return np.vstack(per_row_features)
# Sequential alternative: X_w2v = np.vstack(f_woker(train))
X_w2v = np.vstack(split_df_to_pools(f_woker, train))


time: 2min 31s


In [13]:
X_w2v_test = np.vstack(split_df_to_pools(f_woker, test))

time: 17min 10s


In [14]:
def calc_common_len_ratio(list1, list2):
    """Count the distinct elements shared by two lists.

    Returns a two-item list: the number of distinct common elements, and
    that number divided by the size of the union of both lists (the
    divisor is at least 1, so two empty lists give a ratio of 0.0).
    """
    first, second = set(list1), set(list2)
    shared_count = len(first & second)
    union_size = max(len(first | second), 1)
    return [shared_count, float(shared_count) / union_size]


def feature_extraction(row):
    """Common n-gram features for one question pair.

    Splits 'question1' and 'question2' on whitespace and returns, for
    unigrams, bigrams and trigrams in that order, the output of
    calc_common_len_ratio concatenated into one flat list.
    """
    tokens_q1 = row['question1'].split()
    tokens_q2 = row['question2'].split()
    features = calc_common_len_ratio(tokens_q1, tokens_q2)

    # Append the same two values for bigrams and then trigrams.
    for order in (2, 3):
        grams_q1 = list(ngrams(tokens_q1, order))
        grams_q2 = list(ngrams(tokens_q2, order))
        features.extend(calc_common_len_ratio(grams_q1, grams_q2))
    return features

time: 9.34 ms


In [20]:
def f_woker(x):
    """Compute the common n-gram features for every row of one chunk.

    NOTE: this redefines the `f_woker` defined earlier for the Word2Vec
    features; once this cell has run, the name refers to this version.

    Returns:
        2-D array with one row of feature_extraction output per row of `x`.
    """
    # `raw=True` is intentionally not passed: it hands each row over as a
    # plain ndarray, which cannot be indexed by column name the way
    # feature_extraction does.
    per_row_features = x.apply(feature_extraction, axis=1)
    # `.values.tolist()` gives a list of rows whether apply returned a Series
    # of lists or expanded the lists into a DataFrame.
    return np.array(per_row_features.values.tolist())
X_common = np.vstack(split_df_to_pools(f_woker, train))
# Every chunk is now a regular 2-D array, so the test set can use the same
# parallel path as the training set.
X_common_test = np.vstack(split_df_to_pools(f_woker, test))



time: 1min 28s


In [21]:
y = train.is_duplicate
X = np.c_[X_common, X_w2v]; X_test = np.c_[X_common_test, X_w2v_test]
X.shape, X_test.shape

((404290, 17), (2345796, 17))

time: 393 ms


# Rebalancing the Data

Before training the final model, I would like to rebalance the data that XGBoost receives, since the positive class makes up 37% of our training data but only 17% of the test data. By rebalancing the data so that our training set has 17% positives, we can ensure that XGBoost outputs probabilities that better match the data on the leaderboard, which should give a better score (since LogLoss looks at the probabilities themselves and not just the order of the predictions, as AUC does).

In [24]:
# Oversample the negative class: draw 450000 negatives with replacement
# (fixed seed) and append them to the original training matrix and labels.
np.random.seed(42)
neg_count = (y == 0).sum()
neg_ind_add = np.random.choice(neg_count, 450000, True)
X_bal = np.vstack([X, X[y == 0][neg_ind_add]])
y_bal = np.append(y, np.zeros(neg_ind_add.shape[0]))

time: 205 ms


# Cross validation

In [32]:
# XGBoost settings; `plst` is the same configuration as a list of (key, value) pairs.
params = {
    "objective": "binary:logistic",
    'eval_metric': 'logloss',
    "eta": 0.01,
    "subsample": 0.7,
    "min_child_weight": 25,
    "colsample_bytree": 0.7,
    "max_depth": 6,
    "silent": 1,
    "seed": 42,
    'alpha': 0.1,
}
num_rounds = 800
plst = [(key, value) for key, value in params.items()]

time: 5.28 ms


In [None]:
from sklearn.model_selection import KFold
# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, random_state=42, shuffle=True)
# NOTE: cross-validation runs on the original X / y, not on the rebalanced X_bal / y_bal.
xgtrain = xgb.DMatrix(X, label=y)
xgbcv = xgb.cv(plst, dtrain=xgtrain, num_boost_round=num_rounds, folds=kf, seed=42)

In [None]:
# Mean train/test log loss per boosting round, skipping the first 100 rounds.
# Columns are selected by name rather than by position so the 'train' and
# 'test' labels stay correct whatever column order xgb.cv returns.
plt.figure(figsize=(10,10))
plt.plot(xgbcv['train-logloss-mean'].iloc[100:], label='train')
plt.plot(xgbcv['test-logloss-mean'].iloc[100:], label='test')
plt.xlabel('boosting round')
plt.ylabel('log loss')
plt.legend(loc=0);

In [None]:
# Mean log loss with its standard deviation across folds, from round 400 on.
# Columns are selected by name rather than by position so each curve is
# paired with its own std column whatever column order xgb.cv returns.
t = xgbcv.iloc[400:,:]
plt.figure(figsize=(10,10))
plt.errorbar(x=400+np.arange(len(t)), y=t['test-logloss-mean'],
             yerr=t['test-logloss-std'], label='test')
plt.errorbar(x=400+np.arange(len(t)), y=t['train-logloss-mean'],
             yerr=t['train-logloss-std'], label='train')
plt.xlabel('boosting round')
plt.ylabel('log loss')
plt.legend(loc=0);

In [None]:
len(xgbcv)

In [None]:
# Mean train/test log loss per boosting round, skipping the first 100 rounds.
# Columns are selected by name rather than by position so the 'train' and
# 'test' labels stay correct whatever column order xgb.cv returns.
plt.figure(figsize=(10,10))
plt.plot(xgbcv['train-logloss-mean'].iloc[100:], label='train')
plt.plot(xgbcv['test-logloss-mean'].iloc[100:], label='test')
plt.xlabel('boosting round')
plt.ylabel('log loss')
plt.legend(loc=0);

# Submit XGBoost with cosine-similarity features

In [None]:
xgtrain = xgb.DMatrix(X_bal, label=y_bal)
xgtest = xgb.DMatrix(X_test)
model = xgb.train(plst, xgtrain, num_rounds, verbose_eval=50)

time: 14min 7s


In [None]:
xgtest = xgb.DMatrix(X_test)
y_pred = model.predict(xgtest)

time: 1min 24s


In [None]:
#fname = '../submit/submit_w2v_common_balance.csv'
pd.DataFrame({'test_id':test.test_id, 'is_duplicate':y_pred}
            ).to_csv('../submit/submit_w2v_common_balance.csv', index=False)
!zip submit_w2v_common_balance.csv.zip submit_w2v_common_balance.csv 


zip error: Nothing to do! (submit_w2v_common_balance.csv.zip)
time: 8.21 s


In [None]:
#!zip 1submit_w2v_common_balance.csv.zip submit_w2v_common_balance.csv 

In [None]:
from sklearn.model_selection import train_test_split
y = train.is_duplicate
# WARNING: this rebinds X_test. Earlier in the notebook the name held the
# competition test-set features; from here on it is the 33% hold-out split of
# the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
clf = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=0.5,
                   fit_intercept=True, intercept_scaling=1, class_weight=None,
                   random_state=42, solver='liblinear', max_iter=100,
                   multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
clf.fit(X_train, y_train)
y_pred_log = clf.predict_proba(X_test)[:,1]

In [None]:
# Evaluate the logistic-regression probabilities computed for the hold-out
# split (`y_pred` at this point would still hold the XGBoost predictions from
# an earlier cell, not this model's output).
auc_plot(y_test, y_pred_log)

In [None]:
# Shallower, faster-learning XGBoost configuration evaluated on the hold-out split.
params = {
    "objective": "binary:logistic",
    'eval_metric': 'logloss',
    "eta": 0.02,
    "subsample": 0.7,
    "min_child_weight": 25,
    "colsample_bytree": 0.7,
    "max_depth": 4,
    "silent": 1,
    "seed": 42,
}
num_rounds = 1000
plst = [(key, value) for key, value in params.items()]
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgtest = xgb.DMatrix(X_test, label=y_test)
# Both sets are monitored; early stopping is passed 25 rounds of patience.
watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
model = xgb.train(plst, xgtrain, num_boost_round=num_rounds, evals=watchlist,
                  early_stopping_rounds=25, verbose_eval=50)
y_pred = model.predict(xgtest)
auc_plot(y_test, y_pred)

In [None]:
y_pred = model.predict(xgtest)
auc_plot(y_test, y_pred)

In [None]:
# `coef_names` is not defined anywhere in this notebook and `title` expects a
# plain string, so use an explicit title instead.
xgb.plot_importance(model, title='Feature importance');

In [None]:
auc_plot(y_test, (y_pred + y_pred_log)/2)

In [None]:
y_pred.shape

In [None]:
X_test.shape

In [None]:
test.shape

In [None]:
t1 = pd.read_csv('../data/test.csv')

In [None]:
t1.shape[0] - test.shape[0]