# Building ML Classifiers: Create Gradient Boosting model

### Read in & clean text

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [2]:
# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names_out())], axis=1)
X_tfidf_feat.head()

Unnamed: 0,body_len,punct%,Unnamed: 3,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count.toarray(), columns=count_vect.get_feature_names_out())], axis=1)
X_count_feat.head()

Unnamed: 0,body_len,punct%,Unnamed: 3,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135,4.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Explore GradientBoostingClassifier Attributes & Hyperparameters

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

In [5]:
print(dir(GradientBoostingClassifier))

['_SUPPORTED_LOSS', '__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_initialized', '_check_n_features', '_check_params', '_clear_state', '_compute_partial_dependence_recursion', '_estimator_type', '_fit_stage', '_fit_stages', '_get_param_names', '_get_tags', '_init_state', '_is_initialized', '_make_estimator', '_more_tags', '_raw_predict', '_raw_predict_init', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_resize_state', '_staged_raw_predict', '_validate_data', '_validate_estimator', '_validate_y', '_warn_ma

Attributes and methods are almost the same as they are with Random Forest.  
  
As for default attributes passed in *GrdientBoostingClassifier* when creating its object, `max_depth=3` and `n_estimators=100`. In *RandomForestClassifier* it is None and 10, because it is built with a couple of fully grown trees, whereas GradientBoostingClassifier uses a lot of very basic trees.  

Also there is no `n_jobs` parameters which used for parallelizing training in *RandomForestClassifier* (n_jobs=-1).  
  
Additional parameter `learning_rate=0.1` determines how quickly an algorithm optimizes, but it also has performance implications, because it can cause the model to optimize too quickly, without truly finding the best model.

### Test hyperparameters with Grid-search

In [6]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_feat, data['label'], test_size=0.2)

In [8]:
results = pd.DataFrame()

def train_GB(n_est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    accuracy = round((y_pred==y_test).sum() / len(y_pred), 3)

    test_result = {
        'n_estimators': n_est, 
        'max_depth': max_depth, 
        'learning_rate': lr, 
        'precision': precision, 
        'recall': recall,
        'accuracy': accuracy
    }

    global results
    results = results.append(test_result, ignore_index=True)

In [9]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Warnings tell that 3 of the models didn't predict a single text message to be spam, because of that precision could't be calculated and was set to zero.

##### Poorly performing models

In [10]:
results.sort_values(by=['accuracy'])[0:10]

Unnamed: 0,n_estimators,max_depth,learning_rate,precision,recall,accuracy
0,50.0,3.0,0.01,0.0,0.0,0.884
3,50.0,7.0,0.01,0.0,0.0,0.884
6,50.0,11.0,0.01,0.0,0.0,0.884
9,50.0,15.0,0.01,1.0,0.007752,0.885
12,100.0,3.0,0.01,0.952381,0.465116,0.935
24,150.0,3.0,0.01,0.954545,0.488372,0.938
15,100.0,7.0,0.01,0.903614,0.581395,0.944
30,150.0,11.0,0.01,0.873684,0.643411,0.948
27,150.0,7.0,0.01,0.909091,0.620155,0.949
21,100.0,15.0,0.01,0.891304,0.635659,0.949


As we see, the worst performing models have low learning_rate, also a low n_estimators number might have some impact

##### Best performing models

In [11]:
results.sort_values(by=['accuracy'], ascending=False)[0:10]

Unnamed: 0,n_estimators,max_depth,learning_rate,precision,recall,accuracy
25,150.0,3.0,0.1,0.940594,0.736434,0.964
23,100.0,15.0,1.0,0.94,0.728682,0.963
19,100.0,11.0,0.1,0.891892,0.767442,0.962
31,150.0,11.0,0.1,0.877193,0.775194,0.961
34,150.0,15.0,0.1,0.882883,0.75969,0.961
22,100.0,15.0,0.1,0.882883,0.75969,0.961
28,150.0,7.0,0.1,0.883929,0.767442,0.961
7,50.0,11.0,0.1,0.882883,0.75969,0.961
29,150.0,7.0,1.0,0.903846,0.728682,0.96
13,100.0,3.0,0.1,0.946809,0.689922,0.96


Based on the results here the best performing models have learning_rate 0.1 and high number of n_estimators.

### Evaluate Gradient Boosting model performance using Grid-search and Cross-validation

*Grid-search*: Exhaustively search all parameter combinations in a given grid to determine the best model.  
*Cross-validation*: Divide a dataset into k subsets and repeat the holdout method k times where a different subset is used as the holdout set in each iteration.

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

We are going to pass best hyperparameters determined in previous step using only Grid-search

#### TF-IDF models

In [13]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1] # default value, could be skipped
}
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
# n_jobs=-1 means that we're going to train models on different subsets and parameter settings in parallel,
# it is impossible to train sub-models in the same Gradient Boosting model, 
# because they are trained iteratively and each iteration depends on prior iteration

tfidf_cv_fit = gs.fit(X_tfidf_feat, data['label'])
pd.DataFrame(tfidf_cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,332.295099,4.677119,0.206143,0.054555,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.966786,0.977558,0.969452,0.971249,0.966757,0.97036,0.00398,1
1,199.548763,4.430043,0.274043,0.015876,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.969479,0.979354,0.969452,0.96496,0.966757,0.97,0.00498,2
3,321.780042,16.437967,0.450768,0.107292,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.965889,0.977558,0.97035,0.97035,0.96496,0.969821,0.004461,3
4,260.315121,3.449122,0.345161,0.05052,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.963196,0.974865,0.967655,0.968553,0.968553,0.968565,0.003724,4
0,128.354702,0.72414,0.275031,0.018426,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.962298,0.974865,0.966757,0.96496,0.967655,0.967307,0.004199,5


We see that the `mean_fit_time` is significantly larger than for Random Forest where the most consuming model took around 30 seconds to fit.  
  
All Gradient Boosting models getting perfect `mean_train_score` of 1.0 on the training set (not displayed). If the model is overfitting ot the point of just memorizing the training set that's bad because it won't do well generalizing to the test set.  
That's why we really care only about the test score, that's what tells us whether model can generalize to data that it was not trained on.  
  
Best model here uses max_depth 15 and 150 estimators.

#### CountVectorizer models

In [14]:
gb = GradientBoostingClassifier()
param= {
    'n_estimators': [100, 150],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1] # default value, could be skipped
}
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
# n_jobs=-1 means that we're going to train models on different subsets and parameter settings in parallel,
# it is impossible to train sub-models in the same Gradient Boosting model, 
# because they are trained iteratively and each iteration depends on prior iteration

count_cv_fit = gs.fit(X_count_feat, data['label'])
pd.DataFrame(count_cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,294.314402,4.696841,0.350255,0.059377,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.965889,0.976661,0.969452,0.96496,0.973046,0.970001,0.004388,1
4,238.108096,11.34925,0.303271,0.018349,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.962298,0.977558,0.968553,0.964061,0.971249,0.968744,0.005431,2
5,316.758846,3.951857,0.208862,0.02577,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.964093,0.976661,0.969452,0.962264,0.971249,0.968744,0.005159,3
1,205.2492,3.941392,0.313091,0.036486,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.964991,0.979354,0.971249,0.963163,0.96496,0.968743,0.005973,4
2,195.55699,4.022713,0.287574,0.018015,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.964991,0.974865,0.965858,0.962264,0.969452,0.967486,0.004347,5


Results of using count vectorizer are very similar to tfidf (best model with max_depth=11 and n_estimators=150), but test score is slightly lower.