In [61]:
import nltk
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split, GridSearchCV
import time

In [55]:
# criar o filtro, criar o data e Stemmer
stopword = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
data = pd.read_table('SMSSpamCollection.tsv', names=['S_or_h', 'Text'])

In [56]:
# funcao para contar porcentagem de pomtuacao 
def count_punc(text):
    count = sum([1 for char in text if char in string.punctuation])
    por_count = count / (len(text) - text.count(' '))
    return round(por_count,3)*100

In [57]:
data['total_char'] = data.Text.apply(lambda x : len(x) - x.count(' '))
data['por_punc'] = data.Text.apply(lambda x : count_punc(x))
data.head()

Unnamed: 0,S_or_h,Text,total_char,por_punc
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


In [58]:
# criando uma funcao para limpar o text, 1 - tirar pontucao e espaco,
# 2 - tokenize, 3 - stopword, Stemmer, cria espaco pra voltar a ser uma msg
def clean_text(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    tokenize = re.split('\W+',text)
    text = ' '.join([ps.stem(word) for word in tokenize if word not in stopword])
    return text 


In [59]:
# criando o traino e o teste - 80/20
X_train, X_test, y_train, y_test = train_test_split(data[['Text','total_char','por_punc']], data['S_or_h'], test_size=0.2)

In [60]:
# vetorizando o texto usando o treino
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['Text'])

tfidf_train = tfidf_vect_fit.transform(X_train.Text)
tfidf_test = tfidf_vect_fit.transform(X_test.Text)

In [65]:
# agora vamos criar um novo dataframe concatenando 
# X_train com o tfidf_train, X_test com tfidf_test
X_train_vect = pd.concat([X_train[['total_char','por_punc']].reset_index(drop=True), 
                          pd.DataFrame(tfidf_train.toarray())],axis=1)
X_test_vect = pd.concat([X_test[['total_char','por_punc']].reset_index(drop=True), 
                          pd.DataFrame(tfidf_test.toarray())],axis=1)
X_train_vect.head()

Unnamed: 0,total_char,por_punc,0,1,2,3,4,5,6,7,...,34,35,36,37,38,39,40,41,42,43
0,18,5.6,0.450601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,119,5.9,0.642099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.053182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,91,6.6,0.606468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.150692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,109,2.8,0.412974,0.316903,0.0,0.096518,0.0,0.106644,0.223645,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52,5.8,0.682368,0.0,0.0,0.173978,0.0,0.0,0.0,0.0,...,0.0,0.215793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time  = (end - start)

precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')
print(f'precision {precision}')
print(f'recall {recall}')
print(f'Total Spam In Spam{(y_pred == y_test).sum()}')
print(f'Total Spam out spam{( y_pred != y_test).sum()}')
print(f'fit_time {fit_time}')
print(f'pred_time {pred_time}')



precision 0.9673202614379085
recall 0.896969696969697
Total Spam In Spam1092
Total Spam out spam22
fit_time 0.3520619869232178
pred_time 0.034364938735961914




In [71]:
# Gradient Boosting Classifier
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, support = score(y_test, y_pred , pos_label='spam', average='binary')
print(f'precision {precision}')
print(f'recal {recall}')
print(f'Number of Spam in Spam {(y_pred == y_test).sum()}')
print(f'Number of spam out Spam {(y_pred != y_test).sum()}')
print(f'Time fit {fit_time}')
print(f'Time pred {pred_time}')




precision 0.9798657718120806
recal 0.8848484848484849
Number of Spam in Spam 1092
Number of spam out Spam 22
Time fit 9.75463080406189
Time pred 0.009342193603515625


