# Word Embeddings Feature Generation
A word embedding is an approach that provides a dense vector representation of words, capturing something about their meaning.

Word embeddings are an improvement over simpler bag-of-words encoding schemes, such as word counts and frequencies, which produce large, sparse vectors (mostly 0 values) that describe documents but not the meaning of the words.


Word embeddings work by using an algorithm to train a set of fixed-length dense and continuous-valued vectors based on a large corpus of text. Each word is represented by a point in the embedding space and these points are learned and moved around based on the words that surround the target word.

Gensim is an open-source Python library for natural language processing, with a focus on topic modeling. Its `Word2Vec` model is configured through the following main parameters:

* size: (default 100) The number of dimensions of the embedding, i.e. the length of the dense vector that represents each token (word).
* window: (default 5) The maximum distance between a target word and words around the target word.
* min_count: (default 5) The minimum count of words to consider when training the model; words with an occurrence less than this count will be ignored.
* workers: (default 3) The number of threads to use while training.
* sg: (default 0 or CBOW) The training algorithm, either CBOW (0) or skip-gram (1).

In [66]:
import gensim

In [67]:
from gensim.models import Word2Vec

In [68]:
from gensim.models import KeyedVectors

In [2]:
import pandas as pd
import os

In [70]:
# Load the previously trained Word2Vec model from disk.
# Forward slashes are used on purpose: "\o" and "\g" are not valid string
# escape sequences, and a backslash-separated path only resolves on Windows.
model = Word2Vec.load("./output/gensim-model.cpkt")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [71]:
# summarize vocabulary: list the keys of the model's vocabulary mapping
# NOTE(review): `.wv.vocab` is the gensim 3.x accessor; gensim 4 exposes
# `.wv.key_to_index` instead -- confirm the installed gensim version.
words = list(model.wv.vocab)

In [72]:
# Display the vocabulary tokens (prints the whole list, so the output is long).
words

['opt257',
 'opt465',
 'opt302',
 'opt527',
 'opt495',
 'opt062',
 'opt082',
 'opt137',
 'opt430',
 'opt101',
 'opt256',
 'opt494',
 'opt460',
 'opt421',
 'opt087',
 'opt456',
 'opt084',
 'opt104',
 'opt064',
 'opt252',
 'opt454',
 'opt126',
 'opt551',
 'opt319',
 'opt017',
 'opt294',
 'opt066',
 'opt509',
 'opt412',
 'opt253',
 'opt417',
 'opt526',
 'opt283',
 'opt496',
 'opt506',
 'opt345',
 'opt221',
 'opt284',
 'opt290',
 'opt498',
 'opt089',
 'opt230',
 'opt285',
 'opt293',
 'opt179',
 'opt119',
 'opt226',
 'opt258',
 'opt090',
 'opt482',
 'opt565',
 'opt174',
 'opt418',
 'opt292',
 'opt314',
 'opt235',
 'opt411',
 'opt317',
 'opt014',
 'opt071',
 'opt358',
 'opt006',
 'opt228',
 'opt110',
 'opt357',
 'opt171',
 'opt271',
 'opt301',
 'opt269',
 'opt135',
 'opt270',
 'opt451',
 'opt182',
 'opt134',
 'opt480',
 'opt497',
 'opt057',
 'opt096',
 'opt473',
 'opt508',
 'opt373',
 'opt555',
 'opt296',
 'opt458',
 'opt514',
 'opt424',
 'opt183',
 'opt167',
 'opt354',
 'opt117',
 'opt547',

In [73]:
# Show the embedding vector of a single token.
# The lookup goes through `model.wv`; indexing the Word2Vec object directly
# is deprecated in gensim 3.x and no longer supported in gensim 4.
model.wv['opt126']

  """Entry point for launching an IPython kernel.


array([-0.9274136 ,  1.317468  , -1.6563045 , -0.5483727 , -1.4600893 ,
       -1.0040404 ,  1.2822993 , -1.481913  ,  2.7567215 , -3.1172345 ,
        1.257534  ,  0.33153495, -1.2602504 ,  0.47991106,  0.22910698,
        0.14204864,  1.0295235 , -0.2719335 , -2.1930254 , -0.44780138,
       -0.90578085,  2.2894435 , -1.301885  , -0.46979505,  1.9578679 ,
       -0.93601567,  0.6260457 , -0.2543113 , -0.10907855, -0.36471966,
        0.5238939 ,  0.10757346,  0.41783625,  0.05685801, -0.09551506,
        0.6155406 , -0.9664105 ,  0.35547873, -0.48600665, -0.7735392 ,
       -0.1511096 , -0.31706473, -0.9832288 , -0.47510886, -0.6211912 ,
        0.25528842,  2.128944  , -0.69440484, -0.71342766, -0.03911396],
      dtype=float32)

In [3]:
# Load the test data from the parent directory.
# Forward slashes are used on purpose: "\j" is not a valid string escape
# sequence, and a backslash-separated path only resolves on Windows.
df=pd.read_csv('../jrn_universe_201909wk1_flatsamp.csv', encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Show full cell contents (no truncation) when displaying long sequences.
# `None` is the supported "unlimited" value; the older `-1` sentinel is
# deprecated since pandas 1.0 and rejected by current releases.
pd.set_option('display.max_colwidth', None)

In [13]:
# Eyeball 50 randomly drawn rows whose `attrited` flag equals 0.
df.loc[df['attrited'].eq(0)].sample(50)

Unnamed: 0,OCIF_id,event_sequence,attrited,count_sequence
879000,819183488083800,mob024 mob043 mob109 hub001 mob002 mob085 mob080 mob070 mob081 mob025 mob027 mob025 mob044 mob109 hub002 trn144 hub001 mob085 mob025 mob109 hub002 mob024 mob043 mob024 mob043 mob109 mob002 mob080 mob070 mob081 mob025 mob027 mob044 mob024 mob043 mob109 hub001 mob002 mob085 mob080 mob070 mob027 mob081 mob044 mob109 hub002 mob024 mob043 trn144 mob024 mob043 mob109 hub001 mob002 mob085 mob080 mob070 mob027 mob081 mob025 mob044 mob109 hub002 mob024 mob043 mob024 mob043 mob109 hub001 mob002 mob085 mob080 mob070 mob027 mob081 mob025 hub002 trn144 trn059 trn286 mob024 mob043 mob109 mob002 mob085 mob080 mob070 mob027 mob081 mob044 mob109 mob024 mob043,0,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
921288,904072524805589,trn286 trn228 trn090,0,1 1 2
419172,403329549071400,trn059,0,1
20822,10284629301700,trn059,0,1
1810,10005614700400,tsy189 hub001 tsy189 hub002 trn008 trn089 tsy180 trn224 tsy182 tsy180 trn224,0,1 1 1 1 2 1 1 1 1 3 1
318814,304072525455563,trn286 hub001 hub002 trn008,0,2 1 1 1
186488,204043508083200,hub001 act069 act073 hub002,0,1 1 1 1
200713,207343852652200,tsy189,0,1
25559,104072523753571,tsy189 hub001 hub002 ivr065 ivr008 ivr013 hub001 ivr002 ivr047 ivr076 act033 tsy189 hub002,0,1 1 1 1 1 1 1 1 1 1 1 1 1
815228,801124707446355,hub001 hub002 trn280 hub001 hub002 trn144 trn008 hub001 hub002 hub001 hub002 hub001 hub002 trn144 trn281 hub001 hub002 hub001 trn144 hub002 trn144 trn008 hub001 hub002 hub001 hub002 hub001 hub002 hub001 hub002 trn144 tsy182 tsy180,0,1 1 2 1 1 1 4 1 1 1 1 2 2 1 1 1 1 1 3 1 6 1 1 1 1 1 1 1 1 1 4 1 3


In [76]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens

In [77]:
# Split each event sequence into a list of event tokens.
# A whitespace split (no separator argument) is used so that repeated or
# trailing spaces cannot produce empty-string tokens, which would fail the
# vocabulary lookup later; this also matches `tokenizer` above.
df['tokenzied_event']=df["event_sequence"].str.split()

In [78]:
# Split each count sequence into a list of count strings.
# A whitespace split (no separator argument) is used so that repeated or
# trailing spaces cannot produce empty-string tokens, which would fail the
# later conversion to float.
df['tokenzied_time']=df["count_sequence"].str.split()

In Python, Word2Vec expects to be given a list of sentences, each of which is a list of words. To put the data in this form, we define a function that splits a sentence into a list of words and then apply it within another function that splits our texts into lists of sentences.

In [79]:
# Print the model's one-line summary (vocabulary size, vector size, learning rate).
print(model)

Word2Vec(vocab=1909, size=50, alpha=0.025)


# Sentence Vector 

In [80]:
import numpy as np

In [81]:
# Row and column counts of df at this point.
df.shape

(1006950, 6)

In [82]:
def vectorizer(text):
    """Look up the embedding of every token and stack the vectors into one array.

    Parameters
    ----------
    text : iterable of str
        Tokens of one sequence. Each token is looked up in the notebook-level
        ``model``; a token missing from its vocabulary raises ``KeyError``.

    Returns
    -------
    numpy.ndarray
        The looked-up vectors in token order, one row per token.
    """
    # Index through `model.wv`: indexing the Word2Vec object directly is
    # deprecated in gensim 3.x and no longer supported in gensim 4.
    word_vectors = model.wv
    return np.array([word_vectors[x] for x in text])
    

In [83]:
# Force a garbage-collection pass before the memory-heavy vectorization below.
import gc
gc.collect()

108

In [84]:
# Keep the first 900000 rows under the name `test`.
# NOTE(review): `test` is not referenced by any later cell visible here -- confirm it is still needed.
test=df.head(900000)

In [85]:
# look up the embedding vectors for every tokenized event sequence
# (each cell of `vec_text` holds the array returned by `vectorizer` for that row)
df['vec_text'] = df['tokenzied_event'].apply(vectorizer)

  


In [86]:
# Preview the first rows, now including the token lists and `vec_text`.
df.head()

Unnamed: 0,OCIF_id,event_sequence,attrited,count_sequence,tokenzied_event,tokenzied_time,vec_text
0,1069464216288,opt257,0,2,[opt257],[2],"[[-0.7165557, 2.604979, -2.8657541, 0.43669122..."
1,1069464307179,opt465,0,2,[opt465],[2],"[[-1.8696721, 3.6658206, -2.4916062, -0.635528..."
2,1069464309050,opt302,0,1,[opt302],[1],"[[-2.7070203, -3.2676828, -0.9101156, -0.94065..."
3,1069466113872,opt082 opt137,0,1 1,"[opt082, opt137]","[1, 1]","[[-0.5282699, 2.4475846, -0.76312566, -0.13202..."
4,1069467716456,opt460 opt421,0,2 2,"[opt460, opt421]","[2, 2]","[[-1.9350575, 4.208231, -1.9452233, -0.9303369..."


In [87]:
# Look up the row(s) for one specific OCIF_id (the id is compared as a string).
df[df['OCIF_id']=='920181468892500']

Unnamed: 0,OCIF_id,event_sequence,attrited,count_sequence,tokenzied_event,tokenzied_time,vec_text
980502,920181468892500,trn088 trn083 trn088 trn144 trn060 trn064 trn0...,0,2 10 1 6 1 2 2 168 1 3 1 1 1 3 1 2 9 1 1 2 3 3...,"[trn088, trn083, trn088, trn144, trn060, trn06...","[2, 10, 1, 6, 1, 2, 2, 168, 1, 3, 1, 1, 1, 3, ...","[[-2.9300902, -1.882993, 0.96580446, -0.524666..."


In [88]:
# Raw count sequence of the row labelled 980502, fetched with one label-based lookup.
df.loc[980502, "count_sequence"]

'2 10 1 6 1 2 2 168 1 3 1 1 1 3 1 2 9 1 1 2 3 3 1 1 4 2 1 2 4 4 1 5 4 3 1 1 1 1 1 2 1 6 1 1 1 4 2 18 1 1 1 2 1 3 1 1 6 1 1 2 55 9 4 2 1 4 56 4 15 2 12 2 1 4 23 2 1 5 1 1 1 3 7 2 6 3 3 1 29 6 6 1 3 4 1 2 1 1 4 1 1 1 2 3 1 2 2 1 2 5 3 2 2 3 1 4 1 2 1 1 2 1 2 10 1 22 1 2 1 2 3 1 1 1 1 1 4 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 1 13 2 1 1 1 1 1 39 2 6 1 1 1 1 4 2 2 1 2 2 2 1 1 1 2 3 5 2 2 1 2 1 1 2 2 1 6 3 11 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 1 1 14 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 53 2 32 2 1 1 1 1 1 3 2 5 20 5 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 1 1 1 2 1 1 1 1 1 2 1 1 1 3 1 9 7 17 1 1 1 1 1 1 2 10 20 1 1 1 1 1 1 2 2 2 5 14 4 1 2 2 1 9 1 26 2 9 1 15 4 12 2 2 1 2 6 20 2 4 1 2 1 1 1 2 1 2 11 9 10 1 1 2 2 1 11 1 1 1 1 9 1 10 1 1 1 9 1 14 70 2 2 2 1 2 6 1 1 1 1 1 1 2 3 1 1 1 1 1 1 1 1 1 

In [89]:
# Embedding array of the row labelled 980502, fetched with one label-based lookup.
df.loc[980502, 'vec_text']

array([[-2.9300902 , -1.882993  ,  0.96580446, ...,  0.80030066,
        -2.1387422 , -1.9184493 ],
       [-0.5885723 , -0.47445378,  0.29939994, ...,  0.38192293,
        -0.91942483, -0.48943508],
       [-2.9300902 , -1.882993  ,  0.96580446, ...,  0.80030066,
        -2.1387422 , -1.9184493 ],
       ...,
       [-5.721148  , -6.4423566 ,  0.1454252 , ..., -3.4164155 ,
        -4.3542423 ,  0.622101  ],
       [-0.5885723 , -0.47445378,  0.29939994, ...,  0.38192293,
        -0.91942483, -0.48943508],
       [-2.9300902 , -1.882993  ,  0.96580446, ...,  0.80030066,
        -2.1387422 , -1.9184493 ]], dtype=float32)

In [90]:
def numpriz(text):
    """Convert a sequence of numeric strings into a float64 NumPy array.

    Parameters
    ----------
    text : iterable
        Values that can be parsed as floats, e.g. ``['2', '10', '1']``.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``float64`` with one entry per input element.

    Raises
    ------
    ValueError
        If an element cannot be parsed as a float.
    """
    # `np.float` was only an alias of the builtin float; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so the explicit float64 dtype is used.
    return np.array(list(text)).astype(np.float64)

In [91]:
# Convert each row's list of count strings into a float array (see `numpriz`).
df['vec_cnt'] = df['tokenzied_time'].apply(numpriz)

In [92]:
# Inspect the row at integer position 980502, now including `vec_cnt`.
df.iloc[980502]

OCIF_id                                              920181468892500
event_sequence     trn088 trn083 trn088 trn144 trn060 trn064 trn0...
attrited                                                           0
count_sequence     2 10 1 6 1 2 2 168 1 3 1 1 1 3 1 2 9 1 1 2 3 3...
tokenzied_event    [trn088, trn083, trn088, trn144, trn060, trn06...
tokenzied_time     [2, 10, 1, 6, 1, 2, 2, 168, 1, 3, 1, 1, 1, 3, ...
vec_text           [[-2.9300902, -1.882993, 0.96580446, -0.524666...
vec_cnt            [2.0, 10.0, 1.0, 6.0, 1.0, 2.0, 2.0, 168.0, 1....
Name: 980502, dtype: object

In [93]:
# Preview the first rows, now including `vec_cnt`.
df.head()

Unnamed: 0,OCIF_id,event_sequence,attrited,count_sequence,tokenzied_event,tokenzied_time,vec_text,vec_cnt
0,1069464216288,opt257,0,2,[opt257],[2],"[[-0.7165557, 2.604979, -2.8657541, 0.43669122...",[2.0]
1,1069464307179,opt465,0,2,[opt465],[2],"[[-1.8696721, 3.6658206, -2.4916062, -0.635528...",[2.0]
2,1069464309050,opt302,0,1,[opt302],[1],"[[-2.7070203, -3.2676828, -0.9101156, -0.94065...",[1.0]
3,1069466113872,opt082 opt137,0,1 1,"[opt082, opt137]","[1, 1]","[[-0.5282699, 2.4475846, -0.76312566, -0.13202...","[1.0, 1.0]"
4,1069467716456,opt460 opt421,0,2 2,"[opt460, opt421]","[2, 2]","[[-1.9350575, 4.208231, -1.9452233, -0.9303369...","[2.0, 2.0]"


In [94]:
# Force another garbage-collection pass before building the weighted features.
import gc
gc.collect()

106

In [95]:
def mutlp(x,y):
    """Average the weighted token vectors of one sequence, per embedding dimension.

    ``x`` is transposed and multiplied element-wise by ``y`` (broadcast along
    the last axis). Each row of that product is then summed and divided by the
    row's own length, so the divisor is the number of entries, not the sum of
    the weights.

    Parameters
    ----------
    x : numpy.ndarray
        Token vectors -- assumed to have shape (n_tokens, n_dims); TODO confirm.
    y : numpy.ndarray
        One weight per token -- assumed to have shape (n_tokens,); TODO confirm.

    Returns
    -------
    list
        One averaged value per row of the transposed product.
    """
    weighted = np.multiply(np.transpose(x), y)
    return [component.sum(axis=0) / component.shape[0] for component in weighted]

In [96]:
# For every row, combine the token embeddings with their counts via `mutlp`.
df['weight_col']=df[['vec_text','vec_cnt']].apply(lambda row: mutlp(row['vec_text'], row['vec_cnt']), axis=1)

In [97]:
# Expand each row's list of averaged values into one numeric column per entry.
# Reusing df's index keeps X aligned with df for the index-based merge further
# down, even if df's index is ever not the default 0..n-1 range.
X = pd.DataFrame(df['weight_col'].tolist(), index=df.index)

In [98]:
# Row and column counts of the feature matrix.
X.shape

(1006950, 50)

In [99]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-1.433111,5.209958,-5.731508,0.873382,-1.761791,-0.173951,4.37362,-4.582479,11.075265,-10.677121,...,5.822658,-1.575693,-0.73408,-0.788835,-0.537356,-4.471012,-5.095016,-6.633667,2.481604,-6.480044
1,-3.739344,7.331641,-4.983212,-1.271056,-2.616382,-0.294269,3.283833,-5.858821,9.568296,-9.817701,...,0.312733,-2.865328,-2.334048,-0.861015,-4.896544,-1.938946,4.658707,-6.747419,1.286169,-4.215518
2,-2.70702,-3.267683,-0.910116,-0.940657,-7.300041,0.589323,-0.006612,-3.485259,0.836814,-6.055137,...,-6.223355,-10.784649,-1.112462,-1.100864,-5.309468,-1.130732,-2.145075,0.037013,1.47231,-4.965487
3,-1.426828,1.85241,-1.614281,-0.114438,-2.819932,-1.379477,0.344765,-2.125137,3.36701,-4.609267,...,0.076467,0.022537,-0.445504,-1.891142,-1.251115,-0.819161,0.934334,-1.805593,-0.036279,-1.130304
4,-3.074671,5.708027,-3.455483,-1.740084,-1.422159,-2.152689,4.925292,-3.803427,8.281549,-8.779402,...,-4.247244,-5.694802,-1.734305,-0.610952,-2.392882,-1.109465,4.7857,-5.106378,-0.321304,-3.15239


In [100]:
# Attach the OCIF_id and attrited columns to the feature columns, matching rows on the index.
final_df = pd.merge(
    df[['OCIF_id', 'attrited']],
    X,
    left_index=True,
    right_index=True,
    how='inner',
)

In [101]:
# Preview the first three rows of the merged frame.
final_df.head(3)

Unnamed: 0,OCIF_id,attrited,0,1,2,3,4,5,6,7,...,40,41,42,43,44,45,46,47,48,49
0,1069464216288,0,-1.433111,5.209958,-5.731508,0.873382,-1.761791,-0.173951,4.37362,-4.582479,...,5.822658,-1.575693,-0.73408,-0.788835,-0.537356,-4.471012,-5.095016,-6.633667,2.481604,-6.480044
1,1069464307179,0,-3.739344,7.331641,-4.983212,-1.271056,-2.616382,-0.294269,3.283833,-5.858821,...,0.312733,-2.865328,-2.334048,-0.861015,-4.896544,-1.938946,4.658707,-6.747419,1.286169,-4.215518
2,1069464309050,0,-2.70702,-3.267683,-0.910116,-0.940657,-7.300041,0.589323,-0.006612,-3.485259,...,-6.223355,-10.784649,-1.112462,-1.100864,-5.309468,-1.130732,-2.145075,0.037013,1.47231,-4.965487


In [102]:
#final_df=final_df.iloc[:,final_df.columns !='vec_time']

In [103]:
# Row and column counts of the merged frame.
final_df.shape

(1006950, 52)

In [104]:
# Preview the first five rows of the merged frame before writing it out.
final_df.head(5)

Unnamed: 0,OCIF_id,attrited,0,1,2,3,4,5,6,7,...,40,41,42,43,44,45,46,47,48,49
0,1069464216288,0,-1.433111,5.209958,-5.731508,0.873382,-1.761791,-0.173951,4.37362,-4.582479,...,5.822658,-1.575693,-0.73408,-0.788835,-0.537356,-4.471012,-5.095016,-6.633667,2.481604,-6.480044
1,1069464307179,0,-3.739344,7.331641,-4.983212,-1.271056,-2.616382,-0.294269,3.283833,-5.858821,...,0.312733,-2.865328,-2.334048,-0.861015,-4.896544,-1.938946,4.658707,-6.747419,1.286169,-4.215518
2,1069464309050,0,-2.70702,-3.267683,-0.910116,-0.940657,-7.300041,0.589323,-0.006612,-3.485259,...,-6.223355,-10.784649,-1.112462,-1.100864,-5.309468,-1.130732,-2.145075,0.037013,1.47231,-4.965487
3,1069466113872,0,-1.426828,1.85241,-1.614281,-0.114438,-2.819932,-1.379477,0.344765,-2.125137,...,0.076467,0.022537,-0.445504,-1.891142,-1.251115,-0.819161,0.934334,-1.805593,-0.036279,-1.130304
4,1069467716456,0,-3.074671,5.708027,-3.455483,-1.740084,-1.422159,-2.152689,4.925292,-3.803427,...,-4.247244,-5.694802,-1.734305,-0.610952,-2.392882,-1.109465,4.7857,-5.106378,-0.321304,-3.15239


In [105]:
# Write the merged frame to a CSV file in the parent directory.
# Forward slashes are used on purpose: "\j" is not a valid string escape
# sequence, and a backslash-separated path only resolves on Windows.
final_df.to_csv('../jrn_universe_201909wk1_embed.csv')