### building machine learning classifiers building a basic randon forest model  

In [100]:
import pandas as pd
import re
import string
import nltk
# metodos para vetorizar
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# metodos para classificar 
from sklearn.ensemble import RandomForestClassifier
# cross validation
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
# importanto metodos para separar e treinar e testar os dados
from sklearn.metrics import precision_recall_fscore_support as score 
from sklearn.model_selection import train_test_split

In [74]:
# criando o data e preparando os filtros
data = pd.read_table('SMSSpamCollection.tsv', names=['H_or_S', 'Text'])
stopword = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [75]:
# criando funcao para gerar a porcentagem de pontuacao nos textos
def por_punc(text):
    sum_punc = sum([1 for punc in text if punc in string.punctuation])
    calc_por = round(sum_punc/(len(text) - text.count(' ')),3)*100
    return calc_por

In [76]:
# armazenando o calculo da porcentagem e armazenado o valor total de caracter de cada msg
data['punc_por%'] = data.Text.apply(lambda x: por_punc(x))
data['char_total'] = data.Text.apply(lambda x: len(x) - x.count(' '))
data.head()

Unnamed: 0,H_or_S,Text,punc_por%,char_total
0,ham,I've been searching for the right words to tha...,2.5,160
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,4.7,128
2,ham,"Nah I don't think he goes to usf, he lives aro...",4.1,49
3,ham,Even my brother is not like to speak with me. ...,3.2,62
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,7.1,28


In [77]:
# criando a fucnao para limpar os espacos, tokenize, stopwords, stimming
def clean_text(text):
    text = ''.join([word for word in text if word is not string.punctuation])
    tokenize = re.split('\W+', text)
    text = ' '.join([ps.stem(word) for word in tokenize if word not in stopword])
    return text
data.Text = data.Text.apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,H_or_S,Text,punc_por%,char_total
0,ham,i search right word thank breather i promis wo...,2.5,160
1,spam,free entri 2 wkli comp win fa cup final tkt 21...,4.7,128
2,ham,nah i think goe usf live around though,4.1,49
3,ham,even brother like speak they treat like aid pa...,3.2,62
4,ham,i have a date on sunday with will,7.1,28


In [78]:
# agora vamos vetorizar o texto 
tfid_vect = TfidfVectorizer(analyzer=clean_text)
X_tfid_fit = tfid_vect.fit_transform(data.Text)

In [79]:
# criando um novo dataframe sem o label(Spam,Ham) e concatenado as colunas
X_features = pd.concat([data['char_total'], data['punc_por%'], pd.DataFrame(X_tfid_fit.toarray())], axis=1)
X_features.head()

Unnamed: 0,char_total,punc_por%,0,1,2,3,4,5,6,7,...,35,36,37,38,39,40,41,42,43,44
0,160,2.5,0.518939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.492625,0.312595,0.31003,0.289569,0.0,0.063677,0.197602,0.0,...,0.111963,0.038934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,4.1,0.472849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,3.2,0.474479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,7.1,0.372023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.352825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
# agora vamos vetorizar o texto de outra forma
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data.Text)
# criando um novo dataframe sem o label(Spam,Ham) e concatenado as colunas
X_count_feat = pd.concat([data['char_total'],data['punc_por%'], pd.DataFrame(X_count.toarray())],axis=1)
X_count_feat.head()

Unnamed: 0,char_total,punc_por%,0,1,2,3,4,5,6,7,...,35,36,37,38,39,40,41,42,43,44
0,160,2.5,15,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,128,4.7,24,5,5,5,0,1,3,0,...,2,1,0,0,0,0,0,0,0,0
2,49,4.1,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,62,3.2,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28,7.1,2,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [80]:
dir(RandomForestClassifier)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_compute_oob_predictions',
 '_estimator_type',
 '_get_oob_predictions',
 '_get_param_names',
 '_get_tags',
 '_make_estimator',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_set_oob_score_and_attributes',
 '_validate_X_predict',
 '_validate_data',
 '_validate_estimator',
 '_validate_y_class_weight',
 'apply',
 'decision_path',
 'feature_importances_',
 'fit',
 'get_params',
 'n_features_',


In [81]:
RandomForestClassifier()

In [82]:
# criandeo um objeto do RandomForestClassifier
# vamos colocar o argumento com -1 para rodar mais rapido e construir as 
# arvores de deciçnao em pararlelo
rf = RandomForestClassifier(n_jobs=-1)

In [83]:
# esse metodo Kfold usamos para dividir nosso data em quantas partes queremos analizar 
# ele vai intercalando as interações 
# aqui precisamos passar em quantos galhos queremos dividir a arvore
k_fols = KFold(n_splits=5)

In [84]:
# esse methoda nos dara o resultado da analise
cross_val_score(rf, X_features, data['H_or_S'],cv=k_fols, scoring='accuracy', n_jobs=-1)



array([0.97755835, 0.98294434, 0.97935368, 0.97663971, 0.98113208])

### Explore RandomForestClassifier through holdout set

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['H_or_S'], test_size=0.2)

In [86]:
rf2 = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf2.fit(X_train, y_train)
rf_model



In [87]:
# vamos usar o methodo zip para combinar os 2 arrays
# usaremos tbm o sorted para organizar em ordem de importancia 
# e colocamos o Reverse=True para que fique do mais importanta para o menos importante
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)

[(0.17620389236016776, 1),
 (0.1305622405035905, 2),
 (0.08309849965524764, 9),
 (0.08251020907248278, 6),
 (0.06820818327011519, 7),
 (0.05344920705816723, 3),
 (0.05042069195065399, 8),
 (0.04605954906034143, 4),
 (0.03903147333710101, 'char_total'),
 (0.031925395795587626, 5),
 (0.026095436517767658, 35),
 (0.01887463328827279, 10),
 (0.01856342618350757, 0),
 (0.013385983244275194, 14),
 (0.012133923687929931, 19),
 (0.010956089163197479, 27),
 (0.008229035563828544, 29),
 (0.007942485603610971, 16),
 (0.007547667764560313, 32),
 (0.0075040523826247554, 26),
 (0.007356915619168685, 20),
 (0.007237383690345847, 'punc_por%'),
 (0.007237341685402177, 30),
 (0.006974169033996593, 12),
 (0.00682778231876628, 36),
 (0.006448034363148779, 18),
 (0.006400228164520055, 31),
 (0.00625163050743149, 15),
 (0.006117349313562828, 24),
 (0.006031654350514077, 34),
 (0.005983359323637534, 23),
 (0.005963042481045351, 22),
 (0.005825314162964062, 25),
 (0.00570000647966034, 17),
 (0.005253190348326

In [88]:
# vamos usar a funcao .predict para prever o y usando apenas o X_test
y_pred = rf_model.predict(X_test)



In [89]:
# usando o score vamos gerar 4 variaveis, a partir do y_test e do y_pred passando como alvo o Spam
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

In [93]:
# agora vamos imprimir os valores de precisao, do recall e Accurancy, qual apurado está o modelo
# fazemos um test boleano para ver qual acertamos e qual não e com a soma saberemos o numero
# de acertos que conseguimos entre o teste e a previsao
# precisao todos que foram corretamente pra caixa de Spam
# Recall todos os spams que foram pra caixa e a diferença sao os spans que nao foram pra caixa
# accurancy é a porcentagem de acerto 
# target foram os spams que foram pra caixa de spam
# erro sao os spams que passaram batido
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'Accurancy: {(y_pred==y_test).sum() / len(y_pred)}')
print(f'Target: {(y_pred==y_test).sum()}')
print(f'Error: {(y_pred!=y_test).sum()}')

Precision: 0.9318181818181818
Recall: 0.8785714285714286
Accurancy: 0.9766606822262118
Target: 1088
Error: 26


### Build or own Grid search

In [96]:
# criando a funcao que ira criar e avaliar toda interaçnao que será feita no loopfor abaixo 
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam',average='binary')
    print(f'Est: {n_est}')
    print(f'Depth: {depth}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'Accurancy: {(y_pred==y_test).sum() / len(y_pred)}')
    print(f'Target: {(y_pred==y_test).sum()}')
    print(f'Error: {(y_pred!=y_test).sum()}')
    print(f'____________________________________________________')

In [97]:
# criando o grid com a funcao criada acima
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30 ,None]:
        train_RF(n_est, depth)




Est: 10
Depth: 10
Precision: 0.9453125
Recall: 0.8642857142857143
Accurancy: 0.9766606822262118
Target: 1088
Error: 26
____________________________________________________
Est: 10
Depth: 20
Precision: 0.9606299212598425
Recall: 0.8714285714285714
Accurancy: 0.9793536804308797
Target: 1091
Error: 23
____________________________________________________
Est: 10
Depth: 30
Precision: 0.9104477611940298
Recall: 0.8714285714285714
Accurancy: 0.9730700179533214
Target: 1084
Error: 30
____________________________________________________
Est: 10
Depth: None
Precision: 0.952755905511811
Recall: 0.8642857142857143
Accurancy: 0.9775583482944344
Target: 1089
Error: 25
____________________________________________________




Est: 50
Depth: 10
Precision: 0.9461538461538461
Recall: 0.8785714285714286
Accurancy: 0.9784560143626571
Target: 1090
Error: 24
____________________________________________________
Est: 50
Depth: 20
Precision: 0.9457364341085271
Recall: 0.8714285714285714
Accurancy: 0.9775583482944344
Target: 1089
Error: 25
____________________________________________________




Est: 50
Depth: 30
Precision: 0.976
Recall: 0.8714285714285714
Accurancy: 0.981149012567325
Target: 1093
Error: 21
____________________________________________________
Est: 50
Depth: None
Precision: 0.9253731343283582
Recall: 0.8857142857142857
Accurancy: 0.9766606822262118
Target: 1088
Error: 26
____________________________________________________




Est: 100
Depth: 10
Precision: 0.9461538461538461
Recall: 0.8785714285714286
Accurancy: 0.9784560143626571
Target: 1090
Error: 24
____________________________________________________




Est: 100
Depth: 20
Precision: 0.9457364341085271
Recall: 0.8714285714285714
Accurancy: 0.9775583482944344
Target: 1089
Error: 25
____________________________________________________




Est: 100
Depth: 30
Precision: 0.9606299212598425
Recall: 0.8714285714285714
Accurancy: 0.9793536804308797
Target: 1091
Error: 23
____________________________________________________
Est: 100
Depth: None
Precision: 0.9534883720930233
Recall: 0.8785714285714286
Accurancy: 0.9793536804308797
Target: 1091
Error: 23
____________________________________________________




### Exploring parameter setting using GridSearchCV

In [102]:
# para esse caso criamos uma variavel parametro que armazena como dicionario os valores de 
# estimados e de profundidade
# aqui usei o TfidfVectorizer
rf3 = RandomForestClassifier()
param = {
    'n_estimators': [10,150,300],
    'max_depth': [30, 60, 90, None]
}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfid_fit, data['H_or_S'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:10]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,3.882141,0.109441,0.138252,0.017505,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.979354,0.982944,0.978456,0.978437,0.979335,0.979705,0.001669,1
5,3.775304,0.367057,0.125115,0.022763,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.980251,0.982944,0.978456,0.97664,0.979335,0.979525,0.002084,2
4,1.907645,0.031204,0.072499,0.005945,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.979354,0.983842,0.977558,0.977538,0.977538,0.979166,0.002441,3
7,1.917437,0.098307,0.065019,0.005043,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.977558,0.98474,0.978456,0.97664,0.978437,0.979166,0.002866,4
8,3.755039,0.13251,0.120971,0.004576,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.977558,0.982047,0.979354,0.977538,0.977538,0.978807,0.001765,5
1,1.896808,0.080023,0.07831,0.012279,30.0,150,"{'max_depth': 30, 'n_estimators': 150}",0.979354,0.982944,0.977558,0.975741,0.978437,0.978807,0.002388,6
10,2.076454,0.107108,0.065686,0.00751,,150,"{'max_depth': None, 'n_estimators': 150}",0.977558,0.981149,0.978456,0.978437,0.977538,0.978628,0.001323,7
11,2.932963,0.48248,0.110149,0.033415,,300,"{'max_depth': None, 'n_estimators': 300}",0.978456,0.982047,0.977558,0.975741,0.977538,0.978268,0.002085,8
9,0.116276,0.060701,0.031216,0.022564,,10,"{'max_depth': None, 'n_estimators': 10}",0.979354,0.980251,0.976661,0.973046,0.975741,0.977011,0.002586,9
0,0.109434,0.021924,0.021793,0.004388,30.0,10,"{'max_depth': 30, 'n_estimators': 10}",0.978456,0.982047,0.978456,0.969452,0.974843,0.976651,0.00426,10


In [103]:
# para esse caso criamos uma variavel parametro que armazena como dicionario os valores de 
# estimados e de profundidade
# aqui usei o CountVectorizer
rf3 = RandomForestClassifier()
param = {
    'n_estimators': [10,150,300],
    'max_depth': [30, 60, 90, None]
}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data['H_or_S'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.095473,0.0108,0.023042,0.007864,30.0,10,"{'max_depth': 30, 'n_estimators': 10}",0.980251,0.981149,0.981149,0.974843,0.984726,0.980424,0.003186,1
2,1.797102,0.117342,0.143978,0.016101,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.979354,0.98474,0.979354,0.975741,0.979335,0.979705,0.002879,2
1,0.794018,0.113962,0.076048,0.010543,30.0,150,"{'max_depth': 30, 'n_estimators': 150}",0.978456,0.98474,0.977558,0.977538,0.978437,0.979346,0.002727,3
7,1.002928,0.158686,0.071441,0.007756,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.980251,0.985637,0.976661,0.975741,0.978437,0.979345,0.003505,4
10,0.926957,0.08583,0.074777,0.011756,,150,"{'max_depth': None, 'n_estimators': 150}",0.979354,0.985637,0.974865,0.977538,0.978437,0.979166,0.003566,5
