In [6]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)
messages = pd.read_excel('file_Preprocessed.xlsx')
messages = messages[['Feature','Review']].copy()
messages.head()

Unnamed: 0,Feature,Review
0,No,repost from january 13 2004 with a better fit title
1,No,i bought the nomad when i went home on leave and it worked great for about a month
2,No,i am a soldier serving in baghdad right now
3,No,you will be sorry if you do
4,No,whatever you do do nt buy this product


In [7]:
# Clean data using the built in cleaner in gensim
messages['text_clean'] = messages['Review'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,Feature,Review,text_clean
0,No,repost from january 13 2004 with a better fit title,"[repost, from, january, with, better, fit, title]"
1,No,i bought the nomad when i went home on leave and it worked great for about a month,"[bought, the, nomad, when, went, home, on, leave, and, it, worked, great, for, about, month]"
2,No,i am a soldier serving in baghdad right now,"[am, soldier, serving, in, baghdad, right, now]"
3,No,you will be sorry if you do,"[you, will, be, sorry, if, you, do]"
4,No,whatever you do do nt buy this product,"[whatever, you, do, do, nt, buy, this, product]"


In [8]:
# Encoding the label column
messages['Feature']=messages['Feature'].map({'Yes':1,'No':0})
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (messages['text_clean'], messages['Feature'] , test_size=0.2)

In [9]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [10]:
w2v_model.wv.index_to_key

['the',
 'to',
 'and',
 'it',
 'is',
 'of',
 'you',
 'this',
 'for',
 'that',
 'with',
 'in',
 'have',
 'my',
 'on',
 'but',
 'not',
 'as',
 'was',
 'are',
 'if',
 'can',
 'so',
 'player',
 'be',
 'one',
 'all',
 'use',
 'camera',
 'an',
 'at',
 'or',
 'ipod',
 'get',
 'had',
 'very',
 'phone',
 'from',
 'its',
 'has',
 'about',
 'just',
 'your',
 'do',
 'they',
 'great',
 'will',
 'would',
 'me',
 'up',
 'out',
 'when',
 'good',
 'no',
 'like',
 'nt',
 'only',
 'which',
 'battery',
 'than',
 'more',
 'other',
 'there',
 'quality',
 'also',
 'software',
 'after',
 'what',
 'any',
 'some',
 'easy',
 'time',
 'zen',
 'router',
 'by',
 'mp',
 'even',
 'well',
 'buy',
 'did',
 'first',
 'does',
 'music',
 'now',
 'much',
 'really',
 'features',
 'because',
 'better',
 'am',
 'been',
 'used',
 'little',
 'creative',
 'then',
 'work',
 'product',
 'problem',
 'them',
 'sound',
 'most',
 'want',
 'computer',
 'using',
 'their',
 'thing',
 'dont',
 'got',
 'need',
 'still',
 'pictures',
 'pric

In [13]:
# Find the most similar words to "king" based on word vectors from our trained model
w2v_model.wv.most_similar('size')

[('design', 0.9994927644729614),
 ('little', 0.9994797110557556),
 ('in', 0.9994754195213318),
 ('nokia', 0.9994633793830872),
 ('looks', 0.9994404911994934),
 ('light', 0.99944007396698),
 ('because', 0.9994229674339294),
 ('which', 0.9994106292724609),
 ('and', 0.9994075894355774),
 ('storage', 0.9993947148323059)]

In [14]:
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [15]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]), len(v))

20 18
4 4
13 12
17 14
17 16
18 18
4 4
9 9
28 28
13 13
15 15
16 15
20 18
14 14
26 24
10 9
17 17
3 3
11 11
10 9
16 14
12 12
26 26
11 11
34 34
4 3
32 32
15 15
5 5
10 10
8 8
15 15
18 18
23 23
14 14
11 11
4 4
30 30
34 33
15 15
29 28
12 12
16 16
39 39
16 16
12 12
10 9
8 8
30 28
30 30
5 5
15 14
11 11
20 19
17 17
6 6
10 10
12 12
23 22
7 7
20 20
39 38
8 6
12 12
11 10
23 22
30 28
16 16
10 9
26 23
3 3
2 2
21 21
10 10
13 13
15 15
14 14
6 6
8 8
21 21
24 23
14 12
7 7
14 13
26 26
10 10
15 15
12 12
19 19
9 9
16 16
11 11
19 19
29 29
9 9
16 14
6 6
9 8
4 4
6 5
17 17
4 4
15 13
22 21
9 9
3 3
23 21
14 14
7 7
22 22
13 13
26 26
25 23
9 9
12 11
7 6
9 9
4 4
6 6
8 8
29 23
15 15
15 15
24 20
19 19
4 4
22 22
7 6
8 8
35 34
10 10
12 12
3 3
22 22
51 50
17 16
14 14
4 4
27 27
23 23
21 20
5 5
13 12
6 6
19 19
25 25
14 14
30 28
11 11
16 15
26 22
11 11
8 8
24 23
25 24
21 20
7 7
10 10
6 6
11 11
34 34
43 40
9 9
22 21
12 12
16 16
35 35
24 24
7 7
28 28
19 19
10 10
7 7
20 18
4 4
6 6
34 32
13 10
21 20
19 18
5 5
9 9
22 21
18 18
9 

In [16]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [17]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

20 100
4 100
13 100
17 100
17 100
18 100
4 100
9 100
28 100
13 100
15 100
16 100
20 100
14 100
26 100
10 100
17 100
3 100
11 100
10 100
16 100
12 100
26 100
11 100
34 100
4 100
32 100
15 100
5 100
10 100
8 100
15 100
18 100
23 100
14 100
11 100
4 100
30 100
34 100
15 100
29 100
12 100
16 100
39 100
16 100
12 100
10 100
8 100
30 100
30 100
5 100
15 100
11 100
20 100
17 100
6 100
10 100
12 100
23 100
7 100
20 100
39 100
8 100
12 100
11 100
23 100
30 100
16 100
10 100
26 100
3 100
2 100
21 100
10 100
13 100
15 100
14 100
6 100
8 100
21 100
24 100
14 100
7 100
14 100
26 100
10 100
15 100
12 100
19 100
9 100
16 100
11 100
19 100
29 100
9 100
16 100
6 100
9 100
4 100
6 100
17 100
4 100
15 100
22 100
9 100
3 100
23 100
14 100
7 100
22 100
13 100
26 100
25 100
9 100
12 100
7 100
9 100
4 100
6 100
8 100
29 100
15 100
15 100
24 100
19 100
4 100
22 100
7 100
8 100
35 100
10 100
12 100
3 100
22 100
51 100
17 100
14 100
4 100
27 100
23 100
21 100
5 100
13 100
6 100
19 100
25 100
14 100
30 100
11 10

In [18]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [19]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [20]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.616 / Recall: 0.579 / Accuracy: 0.637
