# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [2]:
# Load the SMS spam data, tokenize every message, hold out a test split, and
# train a doc2vec model on the training messages only.
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Fixed seed so that Restart & Run All reproduces the same train/test partition.
SEED = 42

messages = pd.read_csv('../data/spam.csv', encoding='latin-1')
# Drop the three unnamed extra columns; the two that remain are renamed below.
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]
# simple_preprocess converts each raw message into a list of tokens.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# 80/20 split; random_state pins which rows land in each side.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2,
                                                    random_state=SEED)

# Tag each training document with its position in X_train (one tag per document).
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(tokens, [idx])
                  for idx, tokens in enumerate(X_train)]

# NOTE(review): the gensim docs state that exactly repeatable training also needs
# workers=1 and a fixed PYTHONHASHSEED; that is left unchanged here to keep
# training speed as it was.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr, vector_size=50, window=2, min_count=2)

In [3]:
# Recap: inferring on a short tokenized snippet shows what a single document vector looks like.
d2v_model.infer_vector("convert words to vectors".split())

array([ 0.00376448,  0.00881088, -0.00633844, -0.01280787, -0.00656802,
       -0.01548709, -0.00685113,  0.03082145, -0.04048647, -0.02898078,
        0.00307652, -0.0160125 ,  0.0033181 , -0.00146115, -0.01781437,
        0.02310436,  0.02520367, -0.00263706, -0.031695  , -0.0105246 ,
        0.01659888,  0.019787  ,  0.04994716, -0.00468729,  0.01941932,
        0.01921475, -0.0271068 ,  0.00631096, -0.02479362,  0.00275671,
        0.00405054, -0.00311382, -0.00487849, -0.00797768, -0.0087992 ,
        0.01820464,  0.01291765, -0.0089206 , -0.0063187 , -0.00841609,
        0.01831621,  0.00079815,  0.00884872, -0.00528561,  0.03149528,
        0.00864177,  0.01486472, -0.02331213,  0.00382285,  0.01524783],
      dtype=float32)

In [5]:
# How do we prepare these vectors to be used in a machine learning model?
# Infer one vector per test document and collect them in a flat list, so the
# result converts directly to a 2-D (n_documents, vector_size) matrix. Wrapping
# each vector in its own list would add a singleton axis, and estimators that
# expect 2-D input would reject the result.
vector = [d2v_model.infer_vector(words) for words in X_test]
vector[0]

array([ 0.00850308,  0.00171008, -0.00767133, -0.01990022,  0.00996763,
        0.00611724,  0.02718367,  0.01013524, -0.02815638, -0.00202135,
        0.00198823, -0.02130712, -0.0037585 ,  0.01285108, -0.02046327,
        0.01489287,  0.03012642,  0.00912198, -0.00421582, -0.02701136,
        0.00059624,  0.00020792,  0.0262967 , -0.01221213,  0.00601564,
        0.00692054, -0.01141202,  0.0060779 , -0.02588733, -0.0037358 ,
       -0.01296418,  0.01563779,  0.01029546,  0.02657419, -0.01008443,
       -0.00138884,  0.03124498,  0.0009907 ,  0.02810177, -0.04392348,
        0.00470678, -0.00349119,  0.04400412,  0.00431659,  0.0288401 ,
        0.00242608, -0.01398826, -0.00159135,  0.00901097,  0.0227876 ],
      dtype=float32)