# Building Machine Learning Classifiers: Model selection

### Read in & clean text

In [1]:
import pandas as pd
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data['text_len'] = data['text'].apply(lambda x: len(x) - x.count(" "))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

### Split into train/test


In [3]:
from sklearn.model_selection import train_test_split
y= data.target.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(data[['text', 'text_len']], y, test_size=0.2)
print(X_train.shape)
print(y_train.shape)

(6090, 2)
(6090,)


### Vectorize

In [4]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['text'])

tfidf_train = tfidf_vect_fit.transform(X_train['text'])
tfidf_test = tfidf_vect_fit.transform(X_test['text'])

X_train_vect = pd.concat([X_train[['text_len']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['text_len']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,text_len,0,1,2,3,4,5,6,7,8,...,16479,16480,16481,16482,16483,16484,16485,16486,16487,16488
0,82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Evaluation model

In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

### Build our own Grid-search

In [11]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train_vect, y_train)
    y_pred = gb_model.predict(X_test_vect)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        est, max_depth, lr, round(precision, 3), round(recall, 3), 
        round((y_pred==y_test).sum()/len(y_pred), 3)))

In [12]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 0.916 / Recall: 0.144 / Accuracy: 0.611
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.858 / Recall: 0.435 / Accuracy: 0.715
Est: 50 / Depth: 3 / LR: 1 ---- Precision: 0.75 / Recall: 0.669 / Accuracy: 0.752
Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 0.869 / Recall: 0.224 / Accuracy: 0.638
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.845 / Recall: 0.56 / Accuracy: 0.757
Est: 50 / Depth: 7 / LR: 1 ---- Precision: 0.713 / Recall: 0.686 / Accuracy: 0.736
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 0.86 / Recall: 0.305 / Accuracy: 0.666
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.837 / Recall: 0.609 / Accuracy: 0.772
Est: 50 / Depth: 11 / LR: 1 ---- Precision: 0.727 / Recall: 0.692 / Accuracy: 0.746
Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 0.859 / Recall: 0.34 / Accuracy: 0.68
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.823 / Recall: 0.655 / Accuracy: 0.783
Est: 50 / Depth: 15 / LR: 1 ---- Precision: 0.736 / Recall: 0.666 /

### Random Forest model

In [13]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train_vect, y_train)
    y_pred = rf_model.predict(X_test_vect)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label=1, average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [15]:
for n_est in [10, 50, 100,150]:
    for depth in [10, 20, 30, 40, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 0.938 / Recall: 0.177 / Accuracy: 0.626
Est: 10 / Depth: 20 ---- Precision: 0.88 / Recall: 0.345 / Accuracy: 0.685
Est: 10 / Depth: 30 ---- Precision: 0.895 / Recall: 0.412 / Accuracy: 0.715
Est: 10 / Depth: 40 ---- Precision: 0.883 / Recall: 0.465 / Accuracy: 0.733
Est: 10 / Depth: None ---- Precision: 0.803 / Recall: 0.641 / Accuracy: 0.769
Est: 50 / Depth: 10 ---- Precision: 0.991 / Recall: 0.155 / Accuracy: 0.621
Est: 50 / Depth: 20 ---- Precision: 0.983 / Recall: 0.33 / Accuracy: 0.697
Est: 50 / Depth: 30 ---- Precision: 0.951 / Recall: 0.4 / Accuracy: 0.722
Est: 50 / Depth: 40 ---- Precision: 0.926 / Recall: 0.46 / Accuracy: 0.742
Est: 50 / Depth: None ---- Precision: 0.805 / Recall: 0.689 / Accuracy: 0.786
Est: 100 / Depth: 10 ---- Precision: 0.992 / Recall: 0.172 / Accuracy: 0.628
Est: 100 / Depth: 20 ---- Precision: 0.979 / Recall: 0.343 / Accuracy: 0.703
Est: 100 / Depth: 30 ---- Precision: 0.951 / Recall: 0.399 / Accuracy: 0.722
Est: 100 /

In [16]:
for n_est in [10, 50, 100,150,200,250,300]:
    for depth in [ None]:
        train_RF(n_est, depth)

Est: 10 / Depth: None ---- Precision: 0.821 / Recall: 0.628 / Accuracy: 0.772
Est: 50 / Depth: None ---- Precision: 0.811 / Recall: 0.672 / Accuracy: 0.783
Est: 100 / Depth: None ---- Precision: 0.813 / Recall: 0.68 / Accuracy: 0.787
Est: 150 / Depth: None ---- Precision: 0.817 / Recall: 0.694 / Accuracy: 0.793
Est: 200 / Depth: None ---- Precision: 0.822 / Recall: 0.699 / Accuracy: 0.798
Est: 250 / Depth: None ---- Precision: 0.824 / Recall: 0.694 / Accuracy: 0.796
Est: 300 / Depth: None ---- Precision: 0.814 / Recall: 0.688 / Accuracy: 0.79
