### Gradient boosting model with grid-search

In [78]:
import nltk
import pandas as pd
import string 
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split, GridSearchCV

In [65]:
# criando filtro de palavras irrelavantes, PorterStemmer e o data
stopword = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
data = pd.read_table('SMSSpamCollection.tsv', names=['S_or_H', 'Text'])

In [66]:
# funcao para contar o numero de pontucoes e a porcentagem
def count_punc(text):
    count = sum([1 for punc in text if punc in string.punctuation])
    por_count = round(count / (len(text) - text.count(' ')),3)*100
    return por_count

In [67]:
# criando 2 colunas, uma de total de caracter e outra de porcentagem de pontuacao da msg
data['Total_char'] = data.Text.apply(lambda x: len(x) - x.count(' '))
data['Por_punc'] = data.Text.apply(lambda x: count_punc(x))
data.head(2)

Unnamed: 0,S_or_H,Text,Total_char,Por_punc
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7


In [68]:
# criar a funçnao para limpar o text, tirar espaços e pontuacao, tokenize, stopword, stemmer
# na etapa de stopword e stemmer add um espaco com o join para trasformar tudo em texto limpo
def clean_text(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    tokenize = re.split('\W+', text)
    text = ' '.join([ps.stem(word) for word in tokenize if word not in stopword])
    return text

In [80]:
# usando o Tfidfvector para analizar
# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data.Text)
X_tfidf_features = pd.concat([data['Total_char'], data['Por_punc'], pd.DataFrame(X_tfidf.toarray())],axis=1)
X_tfidf_features.head(2)


Unnamed: 0,Total_char,Por_punc,0,1,2,3,4,5,6,7,...,34,35,36,37,38,39,40,41,42,43
0,160,2.5,0.547168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.055866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.46241,0.319465,0.316844,0.295934,0.0,0.065076,0.201945,0.0,...,0.114424,0.036482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# usando o CountVectorizer para analizar
#CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data.Text)
X_count_features = pd.concat([data['Total_char'],data['Por_punc'], pd.DataFrame(X_count.toarray())],axis=1)
X_count_features.head(2)

Unnamed: 0,Total_char,Por_punc,0,1,2,3,4,5,6,7,...,34,35,36,37,38,39,40,41,42,43
0,160,2.5,17,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,128,4.7,22,5,5,5,0,1,3,0,...,2,1,0,0,0,0,0,0,0,0


### Gradient Boosting Classifier

In [70]:
dir(GradientBoostingClassifier)

['_SUPPORTED_LOSS',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_initialized',
 '_check_n_features',
 '_check_params',
 '_clear_state',
 '_compute_partial_dependence_recursion',
 '_estimator_type',
 '_fit_stage',
 '_fit_stages',
 '_get_param_names',
 '_get_tags',
 '_init_state',
 '_is_initialized',
 '_make_estimator',
 '_more_tags',
 '_raw_predict',
 '_raw_predict_init',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_resize_state',
 '_staged_raw_predict',
 '_valid

In [71]:
# montando os trainos e os testes
X_train, X_test, y_train, y_test = train_test_split(X_features, data['S_or_H'], test_size=0.2)
print(X_train[0:2])
print('______________________')
print(X_test[0:2])
print('______________________')
print(y_train[0:2])
print('______________________')
print(y_test[0:2])

      Total_char  Por_punc         0    1    2    3    4    5    6    7  ...  \
3082          11       0.0  0.598277  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
717           23       4.3  0.289157  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   

       34   35   36   37   38   39   40   41   42   43  
3082  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
717   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[2 rows x 46 columns]
______________________
      Total_char  Por_punc         0    1    2    3    4    5    6    7  ...  \
2607          40      22.5  0.423887  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   
595           71       4.2  0.563644  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   

            34        35   36   37   38   39   40   41   42   43  
2607  0.000000  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
595   0.139475  0.088938  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[2 rows x 46 columns]
______________________
3082    ham
717     ham
Name: S_or_H, dtype: object
__

  print(y_train[0:2])
  print(y_test[0:2])


### building a grad-search

In [72]:
# criando a funcao que ira gerar cada linha no loopfor com o valor estimado,
# profundidade e o valor de precisao
def train_GB(n_est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=n_est,max_depth=max_depth,learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
    print(f'Estimate {n_est}')
    print(f'Depth {max_depth}')
    print(f'Precision{precision}')
    print(f'Recall {recall}')
    print(f'numberOfSpaminSpam {(y_pred == y_test).sum()}')
    print(f'numberOfSpamOutOfSpam {(y_pred != y_test).sum()}' )
    print('---------------------------------------------' )


In [73]:
# criando o loop para fazer o grad-search
for n_est in [50,100,155]:
    for max_depth in [3,7,11,15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est,max_depth,lr)

  _warn_prf(average, modifier, msg_start, len(result))


Estimate 50
Depth 3
Precision0.0
Recall 0.0
numberOfSpaminSpam 957
numberOfSpamOutOfSpam 157
---------------------------------------------




Estimate 50
Depth 3
Precision0.9626865671641791
Recall 0.821656050955414
numberOfSpaminSpam 1081
numberOfSpamOutOfSpam 33
---------------------------------------------




Estimate 50
Depth 3
Precision0.9225352112676056
Recall 0.8343949044585988
numberOfSpaminSpam 1077
numberOfSpamOutOfSpam 37
---------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Estimate 50
Depth 7
Precision0.0
Recall 0.0
numberOfSpaminSpam 957
numberOfSpamOutOfSpam 157
---------------------------------------------




Estimate 50
Depth 7
Precision0.948905109489051
Recall 0.8280254777070064
numberOfSpaminSpam 1080
numberOfSpamOutOfSpam 34
---------------------------------------------




Estimate 50
Depth 7
Precision0.9379310344827586
Recall 0.8662420382165605
numberOfSpaminSpam 1084
numberOfSpamOutOfSpam 30
---------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Estimate 50
Depth 11
Precision0.0
Recall 0.0
numberOfSpaminSpam 957
numberOfSpamOutOfSpam 157
---------------------------------------------




Estimate 50
Depth 11
Precision0.9420289855072463
Recall 0.8280254777070064
numberOfSpaminSpam 1079
numberOfSpamOutOfSpam 35
---------------------------------------------




Estimate 50
Depth 11
Precision0.910958904109589
Recall 0.8471337579617835
numberOfSpaminSpam 1077
numberOfSpamOutOfSpam 37
---------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Estimate 50
Depth 15
Precision0.0
Recall 0.0
numberOfSpaminSpam 957
numberOfSpamOutOfSpam 157
---------------------------------------------




Estimate 50
Depth 15
Precision0.9054054054054054
Recall 0.8535031847133758
numberOfSpaminSpam 1077
numberOfSpamOutOfSpam 37
---------------------------------------------




Estimate 50
Depth 15
Precision0.9375
Recall 0.8598726114649682
numberOfSpaminSpam 1083
numberOfSpamOutOfSpam 31
---------------------------------------------




Estimate 100
Depth 3
Precision0.976
Recall 0.7770700636942676
numberOfSpaminSpam 1076
numberOfSpamOutOfSpam 38
---------------------------------------------




Estimate 100
Depth 3
Precision0.9562043795620438
Recall 0.8343949044585988
numberOfSpaminSpam 1082
numberOfSpamOutOfSpam 32
---------------------------------------------




Estimate 100
Depth 3
Precision0.7103825136612022
Recall 0.8280254777070064
numberOfSpaminSpam 1034
numberOfSpamOutOfSpam 80
---------------------------------------------




Estimate 100
Depth 7
Precision0.9692307692307692
Recall 0.802547770700637
numberOfSpaminSpam 1079
numberOfSpamOutOfSpam 35
---------------------------------------------




Estimate 100
Depth 7
Precision0.9424460431654677
Recall 0.8343949044585988
numberOfSpaminSpam 1080
numberOfSpamOutOfSpam 34
---------------------------------------------




Estimate 100
Depth 7
Precision0.9428571428571428
Recall 0.8407643312101911
numberOfSpaminSpam 1081
numberOfSpamOutOfSpam 33
---------------------------------------------




Estimate 100
Depth 11
Precision0.9407407407407408
Recall 0.8089171974522293
numberOfSpaminSpam 1076
numberOfSpamOutOfSpam 38
---------------------------------------------




Estimate 100
Depth 11
Precision0.9420289855072463
Recall 0.8280254777070064
numberOfSpaminSpam 1079
numberOfSpamOutOfSpam 35
---------------------------------------------




Estimate 100
Depth 11
Precision0.9115646258503401
Recall 0.8535031847133758
numberOfSpaminSpam 1078
numberOfSpamOutOfSpam 36
---------------------------------------------




Estimate 100
Depth 15
Precision0.9154929577464789
Recall 0.8280254777070064
numberOfSpaminSpam 1075
numberOfSpamOutOfSpam 39
---------------------------------------------




Estimate 100
Depth 15
Precision0.9047619047619048
Recall 0.8471337579617835
numberOfSpaminSpam 1076
numberOfSpamOutOfSpam 38
---------------------------------------------




Estimate 100
Depth 15
Precision0.9428571428571428
Recall 0.8407643312101911
numberOfSpaminSpam 1081
numberOfSpamOutOfSpam 33
---------------------------------------------




Estimate 155
Depth 3
Precision0.9692307692307692
Recall 0.802547770700637
numberOfSpaminSpam 1079
numberOfSpamOutOfSpam 35
---------------------------------------------




Estimate 155
Depth 3
Precision0.9632352941176471
Recall 0.8343949044585988
numberOfSpaminSpam 1083
numberOfSpamOutOfSpam 31
---------------------------------------------




Estimate 155
Depth 3
Precision0.7103825136612022
Recall 0.8280254777070064
numberOfSpaminSpam 1034
numberOfSpamOutOfSpam 80
---------------------------------------------




Estimate 155
Depth 7
Precision0.9481481481481482
Recall 0.8152866242038217
numberOfSpaminSpam 1078
numberOfSpamOutOfSpam 36
---------------------------------------------




Estimate 155
Depth 7
Precision0.9424460431654677
Recall 0.8343949044585988
numberOfSpaminSpam 1080
numberOfSpamOutOfSpam 34
---------------------------------------------




Estimate 155
Depth 7
Precision0.9448275862068966
Recall 0.8726114649681529
numberOfSpaminSpam 1086
numberOfSpamOutOfSpam 28
---------------------------------------------




Estimate 155
Depth 11
Precision0.935251798561151
Recall 0.8280254777070064
numberOfSpaminSpam 1078
numberOfSpamOutOfSpam 36
---------------------------------------------




Estimate 155
Depth 11
Precision0.9558823529411765
Recall 0.8280254777070064
numberOfSpaminSpam 1081
numberOfSpamOutOfSpam 33
---------------------------------------------




Estimate 155
Depth 11
Precision0.9097222222222222
Recall 0.8343949044585988
numberOfSpaminSpam 1075
numberOfSpamOutOfSpam 39
---------------------------------------------




Estimate 155
Depth 15
Precision0.9166666666666666
Recall 0.8407643312101911
numberOfSpaminSpam 1077
numberOfSpamOutOfSpam 37
---------------------------------------------




Estimate 155
Depth 15
Precision0.9054054054054054
Recall 0.8535031847133758
numberOfSpaminSpam 1077
numberOfSpamOutOfSpam 37
---------------------------------------------
Estimate 155
Depth 15
Precision0.948905109489051
Recall 0.8280254777070064
numberOfSpaminSpam 1080
numberOfSpamOutOfSpam 34
---------------------------------------------




In [81]:
# aqui vou criar para usar a Tfidfvector
# nesse caso criamos uma variavel com um dicinario para passar para o metodo
gb2 =GradientBoostingClassifier()
param = {
    'n_estimators': [100,150],
    'max_depth': [7,11,15],
    'learning_rate': [0.1],
}
gs = GridSearchCV(gb2, param, cv=5, n_jobs=-1)
cv_fit = gs.fit(X_tfidf_features, data['S_or_H'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:3]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,10.640484,0.222454,0.011601,0.002551,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.973968,0.982944,0.973968,0.975741,0.978437,0.977012,0.003387,1
0,7.069938,0.21635,0.011138,0.002159,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.975763,0.982944,0.973968,0.974843,0.975741,0.976652,0.003215,2
3,15.549178,0.291898,0.013159,0.004078,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.975763,0.980251,0.971275,0.973944,0.97664,0.975575,0.002973,3


In [82]:
# aqui vou criar para usar o CountVectorizer
# nesse caso criamos uma variavel com um dicinario para passar para o metodo
gb2 =GradientBoostingClassifier()
param = {
    'n_estimators': [100,150],
    'max_depth': [7,11,15],
    'learning_rate': [0.1],
}
gs = GridSearchCV(gb2, param, cv=5, n_jobs=-1)
cv_fit = gs.fit(X_count_features, data['S_or_H'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:3]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,6.299937,0.109509,0.010252,0.002163,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.979354,0.981149,0.974865,0.977538,0.979335,0.978448,0.002124,1
1,4.121401,0.124852,0.01081,0.00315,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.979354,0.977558,0.977558,0.978437,0.97664,0.977909,0.000919,2
2,4.168792,0.146312,0.009276,0.002788,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.978456,0.978456,0.975763,0.97664,0.978437,0.97755,0.001136,3
