In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
import numpy as np
import pandas as pd
import scipy
from skopt import forest_minimize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, f1_score, accuracy_score, recall_score
from scipy.sparse import hstack
from lightgbm import LGBMClassifier

In [3]:
# Load the pre-computed sparse question features for the train and validation
# splits. NOTE: both names are rebound by the train_test_split cell further
# down, so these matrices are not what the models below are fitted on.
Xtrain, Xval = map(
    scipy.sparse.load_npz,
    ('./npz/features_questions_train.npz', './npz/features_questions_val.npz'),
)

In [73]:
# Read the cleaned feature table (first CSV column is the index) and discard
# every row that has at least one missing value.
features = (
    pd.read_csv('./csv_files/featuresDF_cleaned.csv', index_col=0)
    .dropna()
)

# Keep the label column aside before it is removed from the feature frame.
target = features['target']

In [74]:
# Remove the columns that are not model inputs (including the label itself).
# Rebinding the name instead of mutating in place keeps `target` untouched.
features = features.drop(
    columns=['answers_cleaned', 'question_len', 'desc_cleaned', 'target']
)

In [75]:
# Show the remaining columns: product_id, questions_cleaned, questions_cleaned_len.
features

Unnamed: 0,product_id,questions_cleaned,questions_cleaned_len
0,0,som alto,8
1,0,entrada auxiliar,16
2,0,caixinha entrada auxiliar,25
3,0,boa tarde autonomia bateria,27
4,0,boa noite caixa carregando pode escutar música,46
...,...,...,...
913,176,conecta tbm ps,14
914,178,android quanto,14
915,178,suporta carta memória gb,24
916,179,normal saida ar boracos mascara furos,37


In [76]:
# Hold out 40% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
Xtrain, Xval, ytrain, yval = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=0,
)

In [77]:
# Text column of each split; this is what the TF-IDF vectorizers consume.
train_strings = Xtrain['questions_cleaned']
val_strings = Xval['questions_cleaned']

In [80]:
# Everything except the text column, i.e. the non-text columns of each split.
train_numbers = Xtrain.drop(columns='questions_cleaned')
val_numbers = Xval.drop(columns='questions_cleaned')

# 1.0 - LGBM

In [391]:
def tune_lgbm(args):
    """Objective function for the hyper-parameter search: TF-IDF + LightGBM.

    Fits a TfidfVectorizer on ``train_strings`` and an LGBMClassifier on the
    resulting matrix, then scores the positive-class probabilities on the
    validation split. Reads ``train_strings``, ``val_strings``, ``ytrain`` and
    ``yval`` from the notebook scope and prints the validation ROC AUC as a
    side effect.

    Parameters
    ----------
    args : sequence
        One candidate point, with at least eight entries in this order:
        learning rate, max_depth, min_child_samples, subsample,
        colsample_bytree, n_estimators, TF-IDF min_df, and the upper bound
        of the TF-IDF n-gram range.

    Returns
    -------
    float
        The negated validation average precision (negated so that a
        minimizer maximizes average precision).
    """
    # LightGBM parameters followed by the two TF-IDF parameters.
    (lr, max_depth, min_child_samples, subsample,
     colsample_bytree, n_estimators, min_df, max_ngram) = args[:8]
    ngram_range = (1, max_ngram)

    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    questions_bow_train = vectorizer.fit_transform(train_strings)
    questions_bow_val = vectorizer.transform(val_strings)

    # Only the TF-IDF matrix is used as model input; the numeric columns are
    # not stacked in.
    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                        min_child_samples=min_child_samples, subsample=subsample,
                        colsample_bytree=colsample_bytree, bagging_freq=1, n_estimators=n_estimators,
                        random_state=0, class_weight='balanced', n_jobs=6)

    mdl.fit(questions_bow_train, ytrain)

    # Both metrics are computed from probabilities, so no hard-label
    # prediction is needed here.
    proba = mdl.predict_proba(questions_bow_val)[:, 1]

    print(roc_auc_score(yval, proba))

    return -average_precision_score(yval, proba)
    

In [392]:
# Search space for forest_minimize; positions match the args[...] indices read in tune_lgbm.
space = [(1e-3, 1e-1, 'log-uniform'), # learning rate (log-uniform prior)
          (1, 10), # max_depth (num_leaves is derived as 2 ** max_depth)
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          (1,5), # TF-IDF min_df
          (1,5)] # upper bound of the TF-IDF ngram_range, used as (1, value)

In [None]:
# Run the search over `space`: 50 evaluations of tune_lgbm in total, the
# first 20 of them at random points. The seed is fixed for reproducibility.
res = forest_minimize(
    tune_lgbm,
    space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

In [394]:
# Best point found by the search, in the same order as the entries of `space`.
parameters = res.x

In [395]:
# Show the selected values (lr, max_depth, min_child_samples, subsample,
# colsample_bytree, n_estimators, min_df, n-gram upper bound).
parameters

[0.021265933990205252,
 6,
 3,
 0.3247384439428554,
 0.13034845542275114,
 505,
 4,
 1]

In [402]:
# Refit TF-IDF + LightGBM using the best point found by the search.
# The first seven entries of `parameters` follow the order of `space`.
(lr, max_depth, min_child_samples, subsample,
 colsample_bytree, n_estimators, min_df) = parameters[:7]

# The last entry is the upper bound of the TF-IDF n-gram range.
ngram_range = (1, parameters[7])

vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=min_df)
questions_bow_train = vectorizer.fit_transform(train_strings)
questions_bow_val = vectorizer.transform(val_strings)

# Only the TF-IDF matrix is used as model input; the numeric columns
# (train_numbers / val_numbers) are not stacked in.
mdl = LGBMClassifier(
    learning_rate=lr,
    max_depth=max_depth,
    num_leaves=2 ** max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    bagging_freq=1,
    colsample_bytree=colsample_bytree,
    n_estimators=n_estimators,
    class_weight='balanced',
    random_state=0,
    n_jobs=6,
)
mdl.fit(questions_bow_train, ytrain)

# Hard labels and positive-class probabilities on the validation split.
p_lgbm = mdl.predict(questions_bow_val)
proba_lgbm = mdl.predict_proba(questions_bow_val)[:, 1]

In [403]:
from sklearn.metrics import classification_report, precision_score, accuracy_score, f1_score, recall_score, confusion_matrix, roc_auc_score, average_precision_score

In [404]:
# Per-class precision / recall / F1 of the LGBM hard predictions on the validation split.
print(classification_report(yval, p_lgbm))

              precision    recall  f1-score   support

         0.0       0.69      0.80      0.74       172
         1.0       0.78      0.67      0.72       186

    accuracy                           0.73       358
   macro avg       0.74      0.73      0.73       358
weighted avg       0.74      0.73      0.73       358



In [405]:
# (ROC AUC, average precision) of the LGBM positive-class probabilities on validation.
roc_auc_score(yval,proba_lgbm), average_precision_score(yval,proba_lgbm)

(0.7883377094273568, 0.7847519333889489)

In [409]:
# Validation metrics for the LGBM model. Threshold-based metrics use the hard
# labels; ranking metrics use the positive-class probabilities.
# end='\n\n' leaves one blank line after each metric.
print('Precision:', precision_score(yval, p_lgbm), end='\n\n')
print('Recall:', recall_score(yval, p_lgbm), end='\n\n')
print('F1-score:', f1_score(yval, p_lgbm), end='\n\n')
print('Accuracy:', accuracy_score(yval, p_lgbm), end='\n\n')
print('Roc Auc:', roc_auc_score(yval, proba_lgbm), end='\n\n')
print('AP:', average_precision_score(yval, proba_lgbm))

Precision: 0.78125

Recall: 0.6720430107526881

F1-score: 0.7225433526011561

Accuracy: 0.7318435754189944

Roc Auc: 0.7883377094273568

AP: 0.7847519333889489


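**baseline** (presumably in the same order as the metrics printed above: precision, recall, F1, accuracy, ROC AUC, AP)  
(0.7296137339055794, 0.8854166666666666, 0.8, 0.7690217391304348, 0.8421075994318181, 0.8410357590877169)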

# 2.0 - Random Forest

In [283]:
# Separate TF-IDF for the random forest: uni- to tri-grams that occur in at
# least 3 training documents. Fitted on the training text only.
vectorizer_rf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=3,
)
questions_bow_train_rf = vectorizer_rf.fit_transform(train_strings)
questions_bow_val_rf = vectorizer_rf.transform(val_strings)

In [284]:
# Numeric columns placed side by side with the TF-IDF matrix.
# NOTE: the random forest below is fitted on the TF-IDF matrix alone, so these
# stacked matrices are not used by it.
Xtrain_stack_rf = hstack((train_numbers, questions_bow_train_rf))
Xval_stack_rf = hstack((val_numbers, questions_bow_val_rf))

In [411]:
# 1000-tree random forest with balanced class weights, fitted on the TF-IDF
# features only. The fit call stays last so the fitted estimator is displayed.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=0,
)
mdl_rf.fit(questions_bow_train_rf, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [412]:
# Random-forest outputs on the validation split: hard labels first, then the
# probability of the positive class (second column of predict_proba).
p_rf = mdl_rf.predict(questions_bow_val_rf)
rf_class_probabilities = mdl_rf.predict_proba(questions_bow_val_rf)
proba_rf = rf_class_probabilities[:, 1]

In [413]:
# Validation metrics for the random forest. Threshold-based metrics use the
# hard labels; ranking metrics use the positive-class probabilities.
# end='\n\n' leaves one blank line after each metric.
print('Precision:', precision_score(yval, p_rf), end='\n\n')
print('Recall:', recall_score(yval, p_rf), end='\n\n')
print('F1-score:', f1_score(yval, p_rf), end='\n\n')
print('Accuracy:', accuracy_score(yval, p_rf), end='\n\n')
print('Roc Auc:', roc_auc_score(yval, proba_rf), end='\n\n')
print('AP:', average_precision_score(yval, proba_rf))

Precision: 0.7446808510638298

Recall: 0.7526881720430108

F1-score: 0.7486631016042781

Accuracy: 0.7374301675977654

Roc Auc: 0.7929638659664916

AP: 0.7802916631894881


# 3.0 - Ensemble

In [414]:
# Correlation between the two models' validation probabilities; a lower value
# would suggest more to gain from blending them.
pd.DataFrame({'rf': proba_rf, 'lgbm': proba_lgbm}).corr()

Unnamed: 0,rf,lgbm
rf,1.0,0.800146
lgbm,0.800146,1.0


In [421]:
# Equal-weight blend of the LGBM and random-forest probabilities, scored with
# the two ranking metrics on the validation split.
proba = 0.5 * (proba_lgbm + proba_rf)
print('Roc Auc:', roc_auc_score(yval, proba), end='\n\n')
print('AP:', average_precision_score(yval, proba))

Roc Auc: 0.807248687171793

AP: 0.7933316193146969


In [None]:
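# Reference scores noted by hand. They differ from the outputs printed above;
# the run that produced them is not shown in this notebook.
#
# LGBM
#   Roc Auc: 0.8434921230307577
#   AP: 0.8541532320183226
#
# RF
#   Roc Auc: 0.8484308577144286
#   AP: 0.8273176969188567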

In [422]:
# Will not use the ensemble?
# Strange model: reasonable metrics and strange answers.

# 4.0 - Little Tests

In [370]:
import string
from nltk.corpus import stopwords
from string import punctuation
import re

In [368]:
def remove_punct(text):
    """Normalize a piece of text for vectorization.

    The input is coerced to ``str``, every ASCII punctuation character is
    deleted (nothing is inserted in its place), each run of digits is replaced
    by a single space, and the result is lower-cased.
    """
    # A translation table that maps every punctuation character to "deleted".
    strip_table = str.maketrans('', '', string.punctuation)
    without_punct = str(text).translate(strip_table)

    without_digits = re.sub('[0-9]+', ' ', without_punct)
    return without_digits.lower()

def remove_stops(text):
    """Drop Portuguese stopwords from a whitespace-separated string.

    Words are compared case-insensitively against NLTK's Portuguese stopword
    list; the surviving words keep their original casing and order and are
    re-joined with single spaces.
    """
    # Build the lookup once per call. Evaluating stopwords.words(...) inside
    # the comprehension condition would rebuild the list for every word.
    portuguese_stops = set(stopwords.words('portuguese'))
    kept = [word for word in text.split() if word.lower() not in portuguese_stops]
    return ' '.join(kept)

In [390]:
# Score a single question typed by the user with the LGBM model.
x = [input()]

# Clean each raw string itself. Passing the whole list to remove_punct would
# stringify the list (brackets and quotes included) rather than the question.
text_clean = [remove_stops(remove_punct(question)) for question in x]

# `mdl` is fitted on the TF-IDF matrix only (no product_id / length columns),
# so the prediction input must be the TF-IDF vector alone. Stacking extra
# numeric columns would give it a feature count it was not trained with.
x_vec = vectorizer.transform(text_clean)

p = mdl.predict(x_vec)
proba = mdl.predict_proba(x_vec)

print('Previsão para a pergunta: ', int(p[0]))
print('Probabilidade: ', proba)

  Se eu comprar 10 peças o frete sai gratis?


Previsão para a pergunta:  0
Probabilidade:  [[0.8110941 0.1889059]]




In [427]:
# Score a single question typed by the user, this time without running it
# through remove_punct / remove_stops first: the raw text goes straight to the
# fitted TF-IDF vectorizer.
x = [input()]
x_vec = vectorizer.transform(x)
p = mdl.predict(x_vec)

# Show the predicted class as an integer.
print('Previsão para a pergunta: ', int(p[0]))

 Esse aparelho funciona no modelo 12345


Previsão para a pergunta:  1
