In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from datetime import datetime
import cPickle

from gensim.models.word2vec import Word2Vec
# Any results you write to the current directory are saved as output.

In [26]:
data_dir = '../data/'
pretrained_dir = '/home/avsolatorio/WORK/kaggle/pre-trained-models/'

EMBEDDING_FILE_GLOVE = os.path.join(pretrained_dir, 'glove.840B.300d.txt')  # 'crawl-300d-2M.vec')
EMBEDDING_FILE_GLOVE_W2V = os.path.join(pretrained_dir, 'glove.840B.300d.w2vformat.txt')
EMBEDDING_FILE_FASTTEXT = os.path.join(pretrained_dir, 'crawl-300d-2M.vec')

train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

In [27]:
def tokenize(text, return_splits=True):
    text = text.fillna('fillna')
    text = text.str.lower()
    text = text.str.strip()
    
    # Datetime formats
    text = text.str.replace('[0-9]{2}:[0-9]{2}, [a-z]+ [0-9]{1,}, [0-9]{4} \(utc\)', ' ')
    text = text.str.replace('[0-9]{2}:[0-9]{2}, [0-9]{1,} [a-z]+ [0-9]{1,} \(utc\)', ' ')
    
    # Web links
    text = text.str.replace('http[s]{0,1}://[\S]+ ', ' ')

    text = text.str.replace("\'", '')
    text = text.str.replace("!{1,}", ' xexclamx ')
    text = text.str.replace('[\(\)]', '')
    text = text.str.replace(',', ' ')
    text = text.str.replace('[\!\@\#\$\%\^\&\*\.\?]{2,}', ' xrepsx ')
    text = text.str.replace('[\n]{1,}', ' ')
    text = text.str.replace('[^a-zA-Z ]', ' ')
    
    if return_splits:
        text = text.str.split()

    return text

In [20]:
def toline(word, w2v_model):
    return '{} {}'.format(word, ' '.join(w2v_model[word].astype('str')))


def train_word2vec(
    tokenized_questions,
    pre_trained_model='GoogleNews-vectors-negative300.bin.gz',
    size=300,
    iter=10,
    min_count=2,
    negative=10,
    workers=7,
    min_alpha=0.0001,
    window=5,
    binary=True,
    save_file=None,
    seed=1029,
    lockf=1
):
    # https://github.com/RaRe-Technologies/gensim/issues/1245
    # List of tokenized questions.
    # e.g. ['What', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india']
    # pre_trained_model can be any pre trained model that gensim accepts, e.g., Glove or GoogleNews word2vec

    # Initialize model
    word_vectors = Word2Vec(
        size=size, iter=iter, min_count=min_count, negative=negative, workers=workers,
        min_alpha=min_alpha, window=window,
        seed=seed
    )

    # Initialize vocab
    print('building vocab...')
    word_vectors.build_vocab(tokenized_questions)

    # Initialize vectors in local model with with vectors from pre-trained model with overlapping vocabulary.
    # Set `lockf` to 1 for re-training
    print('injecting pre-trained vectors...')
    word_vectors.intersect_word2vec_format(pre_trained_model, lockf=lockf, binary=binary)

    # Adjust pre-trained vectors to adapt its distribution with that of the local data via retraining.
    print('start training...')
    word_vectors.train(
        tokenized_questions,
        total_examples=word_vectors.corpus_count,
        epochs=word_vectors.iter
    )
    
    try:
        save_file = '{}-{}'.format(save_file, datetime.now()) if save_file is not None else 'custom-w2v-{}'.format(datetime.now())
        
        print('saving model to file: {}...'.format(save_file))
        word_vectors.save('{}.w2vmodel'.format(save_file))
        
        print('saving vectors to file: {}...'.format())
        with open('{}.w2v.vectors.txt'.format(save_file), 'w') as f:
            for word in word_vectors.wv.vocab:
                entry = toline(word, word_vectors)
                f.write('{}\n'.format(entry))
    except:
        print('Failed saving...')
        pass

    return word_vectors, save_file

In [5]:
%%time
X_train = tokenize(train["comment_text"], return_splits=True).values
X_test = tokenize(test["comment_text"], return_splits=True).values

CPU times: user 11.7 s, sys: 266 ms, total: 12 s
Wall time: 11.9 s


In [6]:
tokenized_comments = [i for i in X_train if len(i) > 0] + [i for i in X_test if len(i) > 0]

In [21]:
%%time
start = datetime.now()
word_vectors_vec, save_file = train_word2vec(
    tokenized_comments,
    pre_trained_model=EMBEDDING_FILE_FASTTEXT,
    binary=False,
    save_file='custom-w2v-with-fasttext-vecs-locked',
    iter=10,
    lockf=0,
    min_count=10
)

with open('word_vectors_fasttext.lock', 'w') as fl:
    fl.write('{}'.format((datetime.now() - start).total_seconds()))

building vocab...
injecting pre-trained vectors...
start training...
saving model to file: custom-w2v-with-fasttext-vecs-locked-2018-03-18 19:57:58.436226...
Failed saving...
CPU times: user 21min, sys: 2.87 s, total: 21min 2s
Wall time: 7min 53s


In [23]:
print word_vectors_vec.most_similar('fuck')

[('fucking', 0.8198376893997192), ('fucks', 0.7554321885108948), ('fucked', 0.751573920249939), ('fucker', 0.7173537611961365), ('fuckin', 0.698074996471405), ('fuckers', 0.690087616443634), ('shit', 0.6870328783988953), ('fuk', 0.6845501661300659), ('ashol', 0.6474372148513794), ('fucken', 0.6414185166358948)]


In [24]:
with open('{}.w2v.vectors.txt'.format(save_file), 'w') as f:
    for word in word_vectors_vec.wv.vocab:
        entry = toline(word, word_vectors_vec)
        f.write('{}\n'.format(entry))

word_vectors_vec.save('{}.w2vmodel'.format(save_file))

In [16]:
print word_vectors_glove_vec.most_similar('xexclamx')

[('thank', 0.4745348393917084), ('lol', 0.4688342809677124), ('congrats', 0.45453837513923645), ('haha', 0.45355671644210815), ('yay', 0.45155268907546997), ('hello', 0.45108360052108765), ('freakin', 0.4448145031929016), ('hahaha', 0.44444596767425537), ('awesome', 0.44306883215904236), ('happy', 0.4410898685455322)]


In [13]:
print word_vectors_glove_vec.most_similar('xexclamx')

[('thank', 0.492216557264328), ('happy', 0.4738181233406067), ('lol', 0.47110986709594727), ('awesome', 0.47050565481185913), ('haha', 0.45867347717285156), ('hello', 0.45843008160591125), ('yay', 0.45154261589050293), ('congrats', 0.45094528794288635), ('oh', 0.4467509090900421), ('freakin', 0.44231390953063965)]


In [12]:
print word_vectors_glove_vec.most_similar('fuck')

[('fucking', 0.9117226004600525), ('ass', 0.8491613268852234), ('fucked', 0.8409990072250366), ('bitch', 0.8190275430679321), ('dick', 0.8054120540618896), ('pussy', 0.800691545009613), ('suck', 0.796139657497406), ('fucks', 0.7924858331680298), ('fucker', 0.779781699180603), ('fuckin', 0.7665546536445618)]


In [17]:
save_file = 'custom-w2v-with-glove-vecs-locked-2018-03-18 18:57:06.486222'
with open('{}.w2v.vectors.txt'.format(save_file), 'w') as f:
    for word in word_vectors_glove_vec.wv.vocab:
        entry = toline(word, word_vectors_glove_vec)
        f.write('{}\n'.format(entry))

word_vectors_glove_vec.save('{}.w2vmodel'.format(save_file))

In [26]:
word_vectors_glove.most_similar('putah')

[('yawa', 0.8797650933265686),
 ('nalang', 0.8796565532684326),
 ('hayop', 0.8790163993835449),
 ('baboy', 0.8676713705062866),
 ('offff', 0.8663419485092163),
 ('lanwi', 0.864619255065918),
 ('bacha', 0.8645554780960083),
 ('blahhhhh', 0.8591206073760986),
 ('poljske', 0.8590113520622253),
 ('kirchen', 0.8589603304862976)]

In [28]:
save_file = 'custom-w2v-2018-03-18 01:51:37.402106'
with open('{}.w2v.vectors.txt'.format(save_file), 'w') as f:
    for word in word_vectors_glove.wv.vocab:
        entry = toline(word, word_vectors_glove)
        f.write('{}\n'.format(entry))

In [29]:
word_vectors_glove.save('{}.w2vmodel'.format(save_file))

[('kamo', 0.33765679597854614),
 ('hayop', 0.33685269951820374),
 ('nawa', 0.3262745440006256),
 ('mumu', 0.32459402084350586),
 ('tubig', 0.31144294142723083),
 ('ihh', 0.29867127537727356),
 ('kuwa', 0.29780203104019165),
 ('oooooooh', 0.29376742243766785),
 ('saare', 0.2930316627025604),
 ('celaka', 0.2916872799396515)]

'fuck 2.017783 1.7786797 0.89188445 0.33717886 -1.0747464 -1.3682405 0.24001393 2.106605 -0.6743231 0.38741764 -2.5176349 1.5692208 1.2137522 -1.0351607 0.16773722 -0.65743643 -1.342997 1.3867356 -0.23306178 -0.5098565 0.94252956 1.0163538 1.5931997 -0.44192076 0.13647778 -0.4049346 -0.5950908 0.65403336 -0.37482566 0.057701822 0.18711543 -0.9583984 -0.959869 -0.81852007 0.6665856 0.18213916 -1.2549973 0.7778149 0.673082 -1.4108119 -0.8555216 0.52855974 -1.8154352 1.0716504 -1.1520269 1.550046 -0.10887948 -1.9119035 -1.3034973 0.7408836 2.1605446 -1.104308 -0.42523685 -0.87536865 -0.38826787 0.37005678 0.029779159 -0.28320637 0.5889931 0.88086516 -0.15481481 -0.52914613 0.79724354 -0.23508072 0.21692257 0.96935815 -0.98359966 -1.5806046 -0.44355845 -0.08708171 0.019584794 0.6108139 -0.036803618 1.0005672 -0.67962605 1.6407994 0.43595326 -0.8629202 0.5020631 2.167295 0.764724 -0.96828157 0.22215915 -0.49856928 -0.09789388 -1.9419558 0.65986735 0.76987594 0.09665694 -1.4868082 0.13182876

In [None]:
def toline(word, w2v_model):
    return '{} {}'.format(word, ' '.join(w2v_model[word].astype('str')))


def train_word2vec(
    tokenized_docs,
    pre_trained_model='GoogleNews-vectors-negative300.bin.gz',
    size=300,
    iter=10,
    min_count=2,
    negative=10,
    workers=7,
    min_alpha=0.0001,
    window=5,
    binary=True,
    save_file=None,
    seed=1029,
    lockf=1
):
    # https://github.com/RaRe-Technologies/gensim/issues/1245
    # List of tokenized questions.
    # e.g. ['What', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india']
    # pre_trained_model can be any pre trained model that gensim accepts, e.g., Glove or GoogleNews word2vec

    # Initialize model
    word_vectors = Word2Vec(
        size=size, iter=iter, min_count=min_count, negative=negative, workers=workers,
        min_alpha=min_alpha, window=window,
        seed=seed
    )

    # Initialize vocab
    print('building vocab...')
    word_vectors.build_vocab(tokenized_docs)

    # Initialize vectors in local model with with vectors from pre-trained model with overlapping vocabulary.
    # Set `lockf` to 1 for re-training
    print('injecting pre-trained vectors...')
    word_vectors.intersect_word2vec_format(pre_trained_model, lockf=lockf, binary=binary)

    # Adjust pre-trained vectors to adapt its distribution with that of the local data via retraining.
    print('start training...')
    word_vectors.train(
        tokenized_docs,
        total_examples=word_vectors.corpus_count,
        epochs=word_vectors.iter
    )
    
    try:
        save_file = '{}-{}'.format(save_file, datetime.now()) if save_file is not None else 'custom-w2v-{}'.format(datetime.now())
        
        print('saving model to file: {}...'.format(save_file))
        word_vectors.save('{}.w2vmodel'.format(save_file))
        
        print('saving vectors to file: {}...'.format())
        with open('{}.w2v.vectors.txt'.format(save_file), 'w') as f:
            for word in word_vectors.wv.vocab:
                entry = toline(word, word_vectors)
                f.write('{}\n'.format(entry))
    except:
        print('Failed saving...')
        pass

    return word_vectors, save_file