# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import ast

import gensim
import numpy as np
import pandas as pd

# Read the four splits in one pass; the path order matches the unpacking order
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_train.csv',
        '../data/X_test.csv',
        '../data/y_train.csv',
        '../data/y_test.csv',
    )
]

### Create word2vec Vectors

In [2]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train, vector_size=100, window=5, min_count=2)

In [3]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key)
X_train_vect = [np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train['clean_text']]
X_test_vect = [np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test['clean_text']]

In [4]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training)
def _average_vectors(message_vectors, dim):
    """Collapse each message's word vectors into one fixed-length vector.

    Args:
        message_vectors: iterable of numpy arrays, one per message, each
            holding the word vectors found for that message (possibly empty).
        dim: length of the zero vector used when a message has no word vectors.

    Returns:
        A list with one 1-D numpy array per message: the column-wise mean of
        its word vectors, or zeros of length `dim` when the array is empty.
    """
    return [
        vecs.mean(axis=0) if vecs.size else np.zeros(dim, dtype=float)
        for vecs in message_vectors
    ]


# Take the fallback length from the trained model so it cannot drift from vector_size
X_train_vect_avg = _average_vectors(X_train_vect, w2v_model.wv.vector_size)
X_test_vect_avg = _average_vectors(X_test_vect, w2v_model.wv.vector_size)

In [5]:
# What does the unaveraged version look like?
# X_train_vect[0] is the array built for the first training message: one row for
# each element of the message that is in the word2vec vocabulary, and each row is
# that element's learned vector (length 100, the vector_size used for training).
X_train_vect[0]

array([[-0.00861969,  0.00366574,  0.00518988, ..., -0.00239151,
        -0.00951009,  0.00450588],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253],
       ...,
       [-0.00861969,  0.00366574,  0.00518988, ..., -0.00239151,
        -0.00951009,  0.00450588],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253]], dtype=float32)

In [6]:
# What does the averaged version look like?
# The rows of the unaveraged array are collapsed with mean(axis=0) into a single
# 1-D vector for the message; a message with no vectors gets 100 zeros instead.
X_train_vect_avg[0]

array([-3.4756670e-03,  1.4834519e-03,  5.1348167e-03,  7.8211511e-03,
       -3.2048158e-03, -6.7716693e-03,  4.5122327e-03,  7.9090949e-03,
       -4.2243819e-03, -4.6397899e-03,  4.5475126e-03, -4.0190993e-03,
       -4.9233041e-03,  6.7542288e-03, -1.8737242e-03,  1.4718686e-03,
        4.3033678e-03,  3.3696441e-03, -6.6502839e-03, -6.2171775e-03,
        5.5068969e-03,  1.5832458e-03,  7.3507982e-03, -3.0993274e-03,
        6.5013450e-03, -1.1072633e-03, -2.3960122e-03,  5.2702515e-03,
       -5.4190694e-03, -6.4289787e-05, -1.1565160e-03, -2.1781882e-03,
        5.8517708e-03, -6.7287926e-03, -8.4826468e-05, -2.1978912e-04,
        7.6460331e-03, -1.5556264e-03,  3.4972839e-03,  3.4704831e-04,
       -3.2393255e-03,  6.4482132e-04, -8.9036869e-03, -2.9241627e-03,
       -1.1495511e-03,  2.6821815e-03, -2.7158530e-03,  5.5564134e-03,
        3.7198416e-03,  6.5265610e-03, -2.3486488e-03, -5.9728726e-04,
       -2.7074234e-03,  1.7862306e-03,  5.0667683e-03,  2.0835015e-04,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [7]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest's bootstrap samples and feature draws are the same
# on every run; without it the fitted model and reported metrics change each time.
rf = RandomForestClassifier(random_state=42)
# y_train is a DataFrame; flatten it to the 1-D label array that fit() expects
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [8]:
# Use the trained model to make predictions on the test data
# (X_test_vect_avg holds one averaged vector per test message, so y_pred holds
# one predicted label per message, in the same order)
y_pred = rf_model.predict(X_test_vect_avg)

In [9]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score every metric against the same 1-D label column rather than mixing the
# whole y_test DataFrame (precision/recall) with y_test['label'] (accuracy)
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# Fraction of test messages whose predicted label equals the true label
accuracy = accuracy_score(y_true, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 0.594 / Recall: 0.27 / Accuracy: 0.875
