# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['better', 'still', 'catch', 'let', 'ask', 'se..."
1,"['yup', 'bathe', 'liao']"
2,"['shop', 'till', 'u', 'drop', 'either', '10k',..."
3,"['cant', 'right', 'second', 'gotta', 'hit', 'p..."
4,"['thts', 'wat', 'wright', 'brother', 'fly']"


### Create TF-IDF Vectors

In [2]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'better': 1400,
 'still': 6851,
 'catch': 1788,
 'let': 4321,
 'ask': 1160,
 'sell': 6342,
 'ltgt': 4499,
 'yup': 8211,
 'bathe': 1320,
 'liao': 4328,
 'shop': 6446,
 'till': 7283,
 'drop': 2592,
 'either': 2682,
 '10k': 257,
 '5k': 609,
 '500': 583,
 'cash': 1776,
 '100': 241,
 'travel': 7419,
 'voucher': 7719,
 'call': 1689,
 '09064011000': 195,
 'ntt': 5159,
 'po': 5589,
 'box': 1531,
 'cr01327bt': 2163,
 'fixedline': 3014,
 'cost': 2133,
 '150ppm': 315,
 'mobile': 4815,
 'vary': 7646,
 'cant': 1738,
 'right': 6118,
 'second': 6309,
 'gotta': 3354,
 'hit': 3601,
 'people': 5465,
 'first': 3006,
 'thts': 7266,
 'wat': 7792,
 'wright': 8025,
 'brother': 1605,
 'fly': 3048,
 'give': 3284,
 'one': 5260,
 'miss': 4781,
 'number': 5165,
 'please': 5572,
 'office': 5215,
 'na': 4957,
 'jus': 4075,
 'finish': 2992,
 'blowing': 1476,
 'hair': 3453,
 'dinner': 2458,
 'already': 992,
 'didnt': 2430,
 'work': 7990,
 'oh': 5231,
 'ok': 5238,
 'goodnight': 3338,
 'ill': 3792,
 'fix': 3012,
 'rea

In [4]:
# How are these vectors stored?
X_test_vect[0]

<1x8241 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [5]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.809 / Accuracy: 0.974
