# Word Embeddings
A word embedding is a dense vector representation of a word that captures something about its meaning.

Word embeddings are an improvement over simpler bag-of-words encoding schemes, such as word counts and frequencies, which produce large, sparse vectors (mostly 0 values) that describe documents but not the meaning of the words.


Word embeddings work by using an algorithm to train a set of fixed-length dense and continuous-valued vectors based on a large corpus of text. Each word is represented by a point in the embedding space and these points are learned and moved around based on the words that surround the target word.

Gensim is an open source Python library for natural language processing, with a focus on topic modeling. Its `Word2Vec` model accepts the following parameters:

* size: (default 100) The number of dimensions of the embedding, e.g. the length of the dense vector to represent each token (word).
* window: (default 5) The maximum distance between a target word and words around the target word.
* min_count: (default 5) The minimum count of words to consider when training the model; words with an occurrence less than this count will be ignored.
* workers: (default 3) The number of threads to use while training.
* sg: (default 0 or CBOW) The training algorithm, either CBOW (0) or skip-gram (1).

In [1]:
import gensim



In [2]:
from gensim.models import Word2Vec

In [3]:
from gensim.models import KeyedVectors

In [2]:
import pandas as pd
import os

In [3]:
import numpy as np

In [17]:
# load model
# A raw string keeps the Windows backslashes literal; "\o" and "\g" are not
# valid escape sequences and only worked because Python passes them through.
model = Word2Vec.load(r".\output\gensim-model.cpkt")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
#word_vectors = model.wv

In [8]:
#from gensim.test.utils import get_tmpfile

In [9]:
#fname = get_tmpfile("C:\\Users\\tsun04\\event_sequence_embedding\\src\\vectors.kv")

In [10]:
#word_vectors.save(fname)

In [4]:
# load test data
# Raw string: "\J" is not a valid escape sequence, so keep the backslash literal.
df=pd.read_csv(r'..\JRN_UNIVERSE_7DAY_flatten_samp.csv', encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# locate missing values: (row positions, column positions) of every null cell
np.where(df.isnull())

(array([   7818,   19426,   42446,   49962,   57565,   63748,   67465,
          73972,   74558,   78040,   83725,   87329,   97971,  102736,
         116428,  119768,  120540,  129948,  141299,  154061,  158249,
         162649,  177474,  182177,  205981,  224128,  236370,  238330,
         253239,  263498,  263798,  267067,  283979,  285213,  313696,
         341504,  345960,  355225,  362031,  380800,  403331,  408961,
         413805,  442564,  443455,  445262,  449176,  464588,  471461,
         471715,  481821,  484027,  496570,  500626,  504424,  534417,
         543590,  556133,  567720,  574902,  593059,  594953,  600363,
         604929,  605020,  609900,  619736,  628423,  629349,  631697,
         649781,  650583,  658154,  665909,  676763,  683868,  698642,
         701679,  706627,  708770,  711376,  729694,  736447,  744101,
         745342,  745562,  753291,  767821,  770520,  782187,  783634,
         788079,  790624,  803862,  803928,  810379,  825992,  828868,
      

In [6]:
# find cells whose value is exactly a single space character
np.where(df.applymap(lambda x: x == ' '))

(array([], dtype=int64), array([], dtype=int64))

In [6]:
#drop null value rows
df=df.dropna()

In [7]:
df.shape

(1005656, 3)

In [10]:
df.dtypes

OCIF_id           object
event_sequence    object
attrited           int64
dtype: object

In [11]:
from sklearn.feature_extraction import DictVectorizer as DV

vectorizer = DV( sparse = False )

In [12]:
# NOTE(review): this call fails with the AttributeError shown in the cell output:
# the vectorizer calls .items() on each sample, but event_sequence holds plain strings.
vec_x_cat_train = vectorizer.fit_transform( df.event_sequence )

AttributeError: 'str' object has no attribute 'items'

In [9]:
#double check
np.where(pd.isnull(df))

(array([], dtype=int64), array([], dtype=int64))

In [12]:
df.sample(20)

Unnamed: 0,OCIF_id,event_sequence,attrited
62275,1124731177169,tsy180,0
477960,418273581743600,mob024 mob043 mob109 mob085 mob002 mob080 mob0...,0
653674,613136572468700,mob024 mob043 mob109 hub001 mob085 mob002 mob0...,0
973369,919085282597700,tsy182 tsy180,0
204210,209205760720500,mob024 mob043 mob109 hub001 mob002 mob085 mob0...,0
898250,823205186081800,hub001 hub002 hub001 hub002 hub001 opt454 hub0...,0
336788,310273775033900,tsy180 tsy138 tsy182 tsy180 tsy138 tsy180 tsy056,0
871486,818052595384700,hub001 hub002 hub001 hub002 mob024 mob043 mob1...,0
818457,801179169693800,tsy182 tsy180 tsy056 tsy043 hub001 act069 hub002,0
930119,908245474492400,tsy182 tsy180 tsy056,0


In [12]:
def tokenizer(text):
    """Break ``text`` into tokens on runs of whitespace and return them as a list."""
    tokens = text.split()
    return tokens

In [26]:
df['tokenzied_event']=df["event_sequence"].str.split(" ")

In [35]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [28]:
df.head()

Unnamed: 0,OCIF_id,event_sequence,attrited,tokenzied_event
0,1069464309050,opt302,0,[opt302]
1,1069465688496,opt527,0,[opt527]
2,1069466113872,opt082 opt137,0,"[opt082, opt137]"
3,1069466366258,opt430 opt101,0,"[opt430, opt101]"
4,1069467716456,opt460 opt421,0,"[opt460, opt421]"


In [36]:
labEncode=LabelEncoder()

In [None]:
# NOTE(review): `label_encoder` and `values` are not defined in any cell visible here
# (the encoder created in the preceding cell is named `labEncode`) - confirm before running.
integer_encoded = label_encoder.fit_transform(values)

In [30]:
enc = OneHotEncoder(handle_unknown='ignore')

In [32]:
X = [['Male', 1], ['Female', 3], ['Female', 2]]

In [34]:
enc.fit(X).transform([['Female', 1], ['Male', 2]]).toarray()

array([[1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [44]:
#enc.fit(df.event_sequence)

In [18]:
print(model)

Word2Vec(vocab=1583, size=50, alpha=0.025)


In [None]:
# summarize vocabulary
words = list(model.wv.vocab)

# Sentence Vector 

In [25]:
df.head()

Unnamed: 0,OCIF_id,event_sequence,attrited
0,1069464309050,opt302,0
1,1069465688496,opt527,0
2,1069466113872,opt082 opt137,0
3,1069466366258,opt430 opt101,0
4,1069467716456,opt460 opt421,0


In [15]:
def vectorizer(text):
    """Look up the embedding vector of every token in ``text``.

    NOTE: this definition rebinds the name ``vectorizer``, which an earlier
    cell assigns to a DictVectorizer instance.

    Parameters
    ----------
    text : iterable of str
        Tokens to look up in the module-level Word2Vec ``model``.

    Returns
    -------
    numpy.ndarray
        The looked-up vectors stacked in input order, one row per token.
    """
    # Index the KeyedVectors (model.wv) rather than the model itself:
    # Word2Vec.__getitem__ is deprecated in gensim 3.x and removed in 4.0.
    word_vectors = model.wv
    return np.array([word_vectors[token] for token in text])
    

In [16]:
df['tokenzied_event'].shape

(1005656,)

In [19]:
# apply the preprocess function to all sequence embeddings
df['event_emb'] = df['tokenzied_event'].apply(vectorizer)

  


In [20]:
def mutlp(x,y):
    """Multiply the transpose of ``x`` by ``y`` element-wise and average each row.

    Parameters
    ----------
    x, y : array-like
        ``np.transpose(x)`` and ``y`` must broadcast against each other, and
        their product must have at least two dimensions.

    Returns
    -------
    list
        One entry per row of the product: that row summed along its first
        axis and divided by its length.
    """
    # Restore the product the return statement depends on; with this line
    # commented out the function raised NameError because ``z`` was never bound.
    z = np.multiply(np.transpose(x), y)
    return [row.sum(axis=0) / row.shape[0] for row in z]

In [21]:
# collapse each sequence's per-token embeddings into one vector by summing over tokens
df['seq_vec'] = [emb.sum(axis=0) for emb in df.event_emb]

In [24]:
df[['OCIF_id','seq_vec']].head()

KeyError: "['seq_vec'] not in index"

In [None]:
#save as pickle file
#df1[['OCIF_id','seq_vec']].to_csv("JRN_UNIVERSE_7DAY_embedding.csv")

In [None]:
#X=df1['seq_vec'].head(100).apply(' '.join(map(str,df1['seq_vec'])))

In [23]:
df.head()

Unnamed: 0,OCIF_id,event_sequence,attrited,tokenzied_event,event_emb,seq_vec
0,1069464309050,opt302,0,[opt302],"[[6.523177, -0.15946983, -1.4525222, 2.91183, ...","[6.523177, -0.15946983, -1.4525222, 2.91183, 2..."
1,1069465688496,opt527,0,[opt527],"[[1.6753732, -1.172248, 1.5994278, 0.6180993, ...","[1.6753732, -1.172248, 1.5994278, 0.6180993, -..."
2,1069466113872,opt082 opt137,0,"[opt082, opt137]","[[2.5026407, 0.65573853, 1.6297283, 0.05148467...","[3.4871821, 0.18348768, 3.2911046, 0.3472822, ..."
3,1069466366258,opt430 opt101,0,"[opt430, opt101]","[[0.08659516, -0.091615215, 0.40832502, 0.0339...","[0.04472958, -0.29403582, 0.73543525, 0.157512..."
4,1069467716456,opt460 opt421,0,"[opt460, opt421]","[[1.4177979, -0.87378657, 2.5378127, -0.596555...","[1.5568426, -1.5212889, 3.9356565, -0.835732, ..."


In [33]:
# show the frame without the bulky intermediate columns
# NOTE: the result of drop() is not assigned, so `df` itself still holds these columns
df.drop(['tokenzied_event', 'event_emb','event_sequence'], axis=1)

Unnamed: 0,OCIF_id,attrited,seq_vec,seq_char
0,001069464309050,0,"[6.523177, -0.15946983, -1.4525222, 2.91183, 2...","6.523177, -0.15946983, -1.4525222, 2.91183, 2...."
1,001069465688496,0,"[1.6753732, -1.172248, 1.5994278, 0.6180993, -...","1.6753732, -1.172248, 1.5994278, 0.6180993, -1..."
2,001069466113872,0,"[3.4871821, 0.18348768, 3.2911046, 0.3472822, ...","3.4871821, 0.18348768, 3.2911046, 0.3472822, -..."
3,001069466366258,0,"[0.04472958, -0.29403582, 0.73543525, 0.157512...","0.04472958, -0.29403582, 0.73543525, 0.1575122..."
4,001069467716456,0,"[1.5568426, -1.5212889, 3.9356565, -0.835732, ...","1.5568426, -1.5212889, 3.9356565, -0.835732, -..."
5,001069468089456,0,"[0.9845413, -0.47225085, 1.6613762, 0.29579753...","0.9845413, -0.47225085, 1.6613762, 0.29579753,..."
6,001069469988298,0,"[3.1271832, 0.7906799, 2.7256174, -0.43513376,...","3.1271832, 0.7906799, 2.7256174, -0.43513376, ..."
7,001069470014951,0,"[2.3696413, 0.29870117, 2.1577275, 0.7650717, ...","2.3696413, 0.29870117, 2.1577275, 0.7650717, -..."
8,001069470757159,0,"[4.6814413, -0.34848756, 4.1378794, -0.5670654...","4.6814413, -0.34848756, 4.1378794, -0.5670654,..."
9,001069470953496,0,"[0.5121266, -0.40575135, 1.5867714, -0.2651256...","0.5121266, -0.40575135, 1.5867714, -0.2651256,..."


In [25]:

df['seq_char']=df['seq_vec'].apply(lambda x: ', '.join(map(str, x)))

In [32]:
df.head()

Unnamed: 0,OCIF_id,event_sequence,attrited,tokenzied_event,event_emb,seq_vec,seq_char
0,1069464309050,opt302,0,[opt302],"[[6.523177, -0.15946983, -1.4525222, 2.91183, ...","[6.523177, -0.15946983, -1.4525222, 2.91183, 2...","6.523177, -0.15946983, -1.4525222, 2.91183, 2...."
1,1069465688496,opt527,0,[opt527],"[[1.6753732, -1.172248, 1.5994278, 0.6180993, ...","[1.6753732, -1.172248, 1.5994278, 0.6180993, -...","1.6753732, -1.172248, 1.5994278, 0.6180993, -1..."
2,1069466113872,opt082 opt137,0,"[opt082, opt137]","[[2.5026407, 0.65573853, 1.6297283, 0.05148467...","[3.4871821, 0.18348768, 3.2911046, 0.3472822, ...","3.4871821, 0.18348768, 3.2911046, 0.3472822, -..."
3,1069466366258,opt430 opt101,0,"[opt430, opt101]","[[0.08659516, -0.091615215, 0.40832502, 0.0339...","[0.04472958, -0.29403582, 0.73543525, 0.157512...","0.04472958, -0.29403582, 0.73543525, 0.1575122..."
4,1069467716456,opt460 opt421,0,"[opt460, opt421]","[[1.4177979, -0.87378657, 2.5378127, -0.596555...","[1.5568426, -1.5212889, 3.9356565, -0.835732, ...","1.5568426, -1.5212889, 3.9356565, -0.835732, -..."


In [40]:
X=pd.DataFrame(df['seq_vec'].astype(str).values.tolist())

KeyboardInterrupt: 

In [27]:
import gc
gc.collect()

211

In [31]:
df.head()

Unnamed: 0,OCIF_id,event_sequence,attrited,tokenzied_event,event_emb,seq_vec,seq_char
0,1069464309050,opt302,0,[opt302],"[[6.523177, -0.15946983, -1.4525222, 2.91183, ...","[6.523177, -0.15946983, -1.4525222, 2.91183, 2...","6.523177, -0.15946983, -1.4525222, 2.91183, 2...."
1,1069465688496,opt527,0,[opt527],"[[1.6753732, -1.172248, 1.5994278, 0.6180993, ...","[1.6753732, -1.172248, 1.5994278, 0.6180993, -...","1.6753732, -1.172248, 1.5994278, 0.6180993, -1..."
2,1069466113872,opt082 opt137,0,"[opt082, opt137]","[[2.5026407, 0.65573853, 1.6297283, 0.05148467...","[3.4871821, 0.18348768, 3.2911046, 0.3472822, ...","3.4871821, 0.18348768, 3.2911046, 0.3472822, -..."
3,1069466366258,opt430 opt101,0,"[opt430, opt101]","[[0.08659516, -0.091615215, 0.40832502, 0.0339...","[0.04472958, -0.29403582, 0.73543525, 0.157512...","0.04472958, -0.29403582, 0.73543525, 0.1575122..."
4,1069467716456,opt460 opt421,0,"[opt460, opt421]","[[1.4177979, -0.87378657, 2.5378127, -0.596555...","[1.5568426, -1.5212889, 3.9356565, -0.835732, ...","1.5568426, -1.5212889, 3.9356565, -0.835732, -..."


In [28]:
#df1[['OCIF_id','seq_vec']].to_parquet('.\jrn_universe_1wk.parq.gzip',engine='fastparquet' ,compression='gzip')

In [34]:
df[['OCIF_id','attrited','seq_char']].to_csv('JRN_UNIVERSE_7DAY_embedding_samp.csv')

In [36]:
import csv

In [23]:
#df[['OCIF_id','attrited','seq_char']].head(80).to_csv('JRN_UNIVERSE_7DAY_embedding_samptest.csv',header=False)

In [33]:
#from pandas import HDFStore

In [36]:
#store = HDFStore('store.h5')

#store['df'] = df[['OCIF_id','seq_vec']]

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['OCIF_id', 'seq_vec']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
#df.to_pickle('test.pkl')

In [None]:
#df2=df[['OCIF_id','seq_vec']]

In [None]:
#df3=df2.reset_index()

In [None]:
#df3.head()

In [46]:
#do not use due to memory usage high
#X = pd.DataFrame(df['seq_vec'].tolist())

In [None]:
#df3 = df2.teams.apply(pd.Series)

In [None]:
#X.head()

In [None]:
#final_df=df2.merge(X, how='inner', left_index=True, right_index=True)

In [None]:
#final_df=pd.concat([df3.OCIF_id, X],axis=1,ignore_index=True)

In [None]:
#final_df.head()

In [None]:
#final_df=final_df.iloc[:,final_df.columns !='vec_time']

In [None]:
#final_df.shape

In [None]:
#

In [None]:
# try one hot encoding

In [13]:
# Raw string: "\J" is not a valid escape sequence, so keep the backslash literal.
dftall=pd.read_csv(r'.\JRN_UNIVERSE_7DAY_wt_targ.csv', encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
dftall.head()

Unnamed: 0,OCIF_id,TIMESTAMP,nbr_text,attrited
0,1069464216288,05SEP2019:18:36:26.000000,opt257,0
1,1069464307179,06SEP2019:10:28:32.000000,opt465,0
2,1069464309050,04SEP2019:15:51:23.000000,opt302,0
3,1069464634763,06SEP2019:13:50:24.000000,opt465,0
4,1069465282159,04SEP2019:11:03:10.000000,opt257,0


In [None]:
dftall['nbr_text']

In [15]:
dfDummies = pd.get_dummies(dftall['nbr_text'])

MemoryError: 

In [22]:
dfDummies.shape

(10000, 371)

In [88]:
pdall=pd.concat([dftall.OCIF_id, dfDummies],axis=1) 

In [71]:
def norm_by_data2(x):
    """Binarize a value: return 1 when ``x`` is greater than zero, otherwise 0.

    Parameters
    ----------
    x : scalar
        A single comparable value (for example one cell of a summed
        indicator table). A whole DataFrame or Series cannot be used here,
        because its truth value in the comparison is ambiguous.

    Returns
    -------
    int
        1 if ``x > 0`` else 0.
    """
    # Compare the argument itself; the original tested the unrelated global ``X``.
    return 1 if x > 0 else 0

In [66]:
pdabc=pdall.groupby('OCIF_id').sum()

In [74]:
pdabc.index

Index(['0-1', '001069464216288', '001069464307179', '001069464309050',
       '001069464634763', '001069465282159', '001069465688496',
       '001069465727357', '001069466027684', '001069466050753',
       ...
       '10003519109600', '10003760708200', '10003767608200', '10003768802300',
       '10003770109800', '10003784000900', '10003799507700', '10003811306400',
       '10003816901600', '10003819808800'],
      dtype='object', name='OCIF_id', length=7712)

In [83]:
pdabc1=pdabc.applymap(lambda x: 1 if x > 0 else 0)

In [84]:
pdabc1

Unnamed: 0_level_0,act002,act003,act010,act012,act014,act015,act016,act017,act019,act020,...,tsy091,tsy093,tsy104,tsy106,tsy134,tsy138,tsy178,tsy180,tsy182,tsy189
OCIF_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069464216288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069464307179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069464309050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069464634763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069465282159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069465688496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069465727357,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069466027684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001069466050753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
pdabc_corr=pdabc.corr()

In [53]:
pdabc_corr['act020'].sort_values()

opt302   -0.003626
opt454   -0.002815
act087   -0.002737
opt257   -0.002723
opt137   -0.002558
opt256   -0.002310
opt527   -0.002304
opt252   -0.002214
opt465   -0.002087
opt460   -0.002076
opt082   -0.001997
opt290   -0.001868
act016   -0.001795
opt084   -0.001785
opt087   -0.001689
opt253   -0.001688
opt456   -0.001606
opt509   -0.001348
act021   -0.001204
opt089   -0.001195
opt498   -0.001183
opt119   -0.001159
opt294   -0.001113
act114   -0.001090
act012   -0.001067
act093   -0.001066
act091   -0.001058
act019   -0.001009
act015   -0.001001
act092   -0.000983
            ...   
opt303   -0.000130
opt095   -0.000130
opt015   -0.000130
opt511   -0.000130
opt438   -0.000130
opt428   -0.000130
opt124   -0.000130
opt129   -0.000130
opt573   -0.000130
opt199   -0.000130
opt407   -0.000130
opt353   -0.000130
opt434   -0.000130
opt026   -0.000130
opt368   -0.000130
opt021   -0.000130
opt243   -0.000130
sr319    -0.000130
act079   -0.000130
opt065   -0.000130
opt537   -0.000130
opt233   -0.

In [86]:
pdabc1.to_csv('test.csv')