# Building ML Classifiers: Random Forest on a holdout test set

### Read in & clean text

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
# we're creating a data frame X_features that does not include the label
X_features.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Explore RandomForestClassifier trough Holdout Set

1. SPLIT DATASET TO TRAIN AND TEST SETS

In [2]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)
# commonly used test set size is 20% of original dataset

2. TRAIN MODEL

In [4]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)

rf_model = rf.fit(X_train, y_train)



In [5]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[:10]
# 10 most important features

[(0.0916278268044636, 'body_len'),
 (0.04177451358436703, 7350),
 (0.03378310491000374, 1803),
 (0.026633837613326838, 2031),
 (0.023133634865009557, 3134),
 (0.022880066166507732, 6746),
 (0.022568145375836627, 4796),
 (0.01959748285594811, 5078),
 (0.017834219390027384, 7218),
 (0.01654890009291131, 6285)]

3. PREDICT

In [6]:
y_pred = rf.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')



In [7]:
accuracy = round((y_pred==y_test).sum() / len(y_pred), 3)
print(f'''
Precision: {round(precision, 3)} 
Recall: {round(recall, 3)} 
Accuracy: {accuracy}
''')


Precision: 1.0 
Recall: 0.625 
Accuracy: 0.954



![precision_recall_accuracy.png](precision_recall_accuracy.png)

The amount of spam not identified as spam tells us that the model isn't quite aggressive enough in identifying spam.
Grid search can be used to test different hyperparameter settings to make our model more agressive