# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [4]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

message = pd.read_csv("C:/Users/visha/Downloads/Advance_NLP_Python_ML/data/spam.csv", encoding="latin-1")
messages = message.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]


messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

### Prep Word Vectors

In [None]:
# Generate a list of words the word2vec model learned word vectors for

In [7]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key])
                     for ls in X_test])

  w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key])


In [10]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))

12 12
14 13
50 44
12 10
6 4
3 3
25 23
13 12
17 15
4 2
28 23
7 7
22 21
8 7
2 2
18 14
4 4
21 20
22 20
9 9
6 6
7 7
8 8
7 7
7 5
4 4
13 11
13 12
14 13
8 8
2 2
10 9
10 8
26 24
21 17
19 17
18 13
6 6
19 17
7 7
19 17
9 9
5 5
65 59
12 9
13 13
16 11
9 9
11 9
4 4
5 5
4 4
7 6
14 14
25 25
7 6
74 67
28 28
26 24
13 10
28 27
10 10
22 22
12 11
12 11
24 24
7 7
9 9
4 3
13 12
5 3
14 11
27 23
8 7
5 5
21 21
12 12
6 6
10 8
14 13
29 29
19 14
10 7
7 4
8 7
12 9
1 1
9 7
9 8
27 27
56 54
1 1
8 8
40 37
8 7
23 22
11 11
15 15
9 8
24 18
7 7
22 19
16 14
19 17
20 19
9 7
10 10
29 28
18 17
23 23
4 2
7 7
9 9
8 8
5 5
23 21
5 5
20 20
17 16
23 10
4 4
11 10
5 5
4 4
19 15
8 7
7 5
10 9
11 7
4 4
10 7
15 14
4 4
3 3
7 6
6 6
6 6
46 43
14 12
8 8
6 6
23 23
30 22
29 29
5 3
8 6
21 20
17 17
8 7
30 29
7 6
22 17
24 23
4 3
11 8
9 8
24 19
9 9
13 12
8 7
5 5
9 8
6 6
13 13
5 5
6 5
16 15
9 8
25 25
6 5
4 4
10 10
24 19
12 10
6 3
24 23
4 4
5 4
5 5
6 6
21 9
14 14
22 20
26 24
25 25
8 6
13 13
9 9
16 14
8 6
13 12
10 9
30 29
27 25
4 4
79 75
5 5
12 12
4 2

In [11]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
w2v_vect_avg = []

for vect in w2v_vect:
    if len(vect)!=0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [12]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(w2v_vect_avg):
    print(len(X_test.iloc[i]), len(v))

12 100
14 100
50 100
12 100
6 100
3 100
25 100
13 100
17 100
4 100
28 100
7 100
22 100
8 100
2 100
18 100
4 100
21 100
22 100
9 100
6 100
7 100
8 100
7 100
7 100
4 100
13 100
13 100
14 100
8 100
2 100
10 100
10 100
26 100
21 100
19 100
18 100
6 100
19 100
7 100
19 100
9 100
5 100
65 100
12 100
13 100
16 100
9 100
11 100
4 100
5 100
4 100
7 100
14 100
25 100
7 100
74 100
28 100
26 100
13 100
28 100
10 100
22 100
12 100
12 100
24 100
7 100
9 100
4 100
13 100
5 100
14 100
27 100
8 100
5 100
21 100
12 100
6 100
10 100
14 100
29 100
19 100
10 100
7 100
8 100
12 100
1 100
9 100
9 100
27 100
56 100
1 100
8 100
40 100
8 100
23 100
11 100
15 100
9 100
24 100
7 100
22 100
16 100
19 100
20 100
9 100
10 100
29 100
18 100
23 100
4 100
7 100
9 100
8 100
5 100
23 100
5 100
20 100
17 100
23 100
4 100
11 100
5 100
4 100
19 100
8 100
7 100
10 100
11 100
4 100
10 100
15 100
4 100
3 100
7 100
6 100
6 100
46 100
14 100
8 100
6 100
23 100
30 100
29 100
5 100
8 100
21 100
17 100
8 100
30 100
7 100
22 100
24 