In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
import numpy as np
import pandas as pd
import scipy
from skopt import forest_minimize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, f1_score, accuracy_score, recall_score, classification_report
from scipy.sparse import hstack
from lightgbm import LGBMClassifier

In [11]:
# Load the cleaned feature table (the first CSV column is the index) and
# discard every row that has a missing value in any column.
features_path = './csv_files/featuresDF_cleaned.csv'
features = pd.read_csv(features_path, index_col=0)
features = features.dropna()

# The labels live in the `target` column of the same table.
target = features['target']

In [12]:
# Keep predictors only: remove the label column and `question_len`.
dropped_columns = ['target', 'question_len']
features = features.drop(columns=dropped_columns)

In [14]:
# Hold out 40% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
Xtrain, Xval, ytrain, yval = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=0,
)

In [256]:
# Column groups used to build the model inputs.
numeric_columns = ['product_id']
text_columns = ['answers_cleaned', 'desc_cleaned', 'questions_cleaned']

# Same column selection applied to the training and validation frames.
train_numeric, val_numeric = Xtrain[numeric_columns], Xval[numeric_columns]
train_text, val_text = Xtrain[text_columns], Xval[text_columns]

In [300]:
def _fit_tfidf(column):
    """Fit a fresh TF-IDF vectorizer on one training text column.

    Parameters
    ----------
    column : str
        Name of a column present in both `train_text` and `val_text`.

    Returns
    -------
    tuple
        (fitted vectorizer, training matrix, validation matrix). The
        vocabulary is learned from the training column only; the validation
        column is merely transformed with it.
    """
    tfidf = TfidfVectorizer(min_df=2, ngram_range=(1,5))
    bow_train = tfidf.fit_transform(train_text[column])
    bow_val = tfidf.transform(val_text[column])
    return tfidf, bow_train, bow_val


# Each column gets its own vectorizer. Refitting one shared instance three
# times would leave `vectorizer` holding the description vocabulary, which
# does not match the question features the models below are trained on.
vectorizer, questions_bow_train, questions_bow_val = _fit_tfidf('questions_cleaned')
answers_vectorizer, answers_bow_train, answers_bow_val = _fit_tfidf('answers_cleaned')
desc_vectorizer, desc_bow_train, desc_bow_val = _fit_tfidf('desc_cleaned')

In [301]:
# Feature blocks fed to the random forest: only the question TF-IDF matrix.
train_blocks = [questions_bow_train]
val_blocks = [questions_bow_val]

Xtrain_stack = hstack(train_blocks)
Xval_stack = hstack(val_blocks)

# Modeling

# Random Forest (RF)

In [302]:
# 1000-tree forest with a fixed seed and 'balanced' class weights.
rf_params = dict(n_estimators=1000, random_state=0, class_weight='balanced')
mdl = RandomForestClassifier(**rf_params)
mdl.fit(Xtrain_stack, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [303]:
# Validation outputs of the random forest: probability of the second class
# (column 1 of predict_proba) and the hard class labels.
proba_rf = mdl.predict_proba(Xval_stack)[:, 1]
p_rf = mdl.predict(Xval_stack)

In [304]:
# Per-class precision / recall / F1 of the random forest on the validation split.
report_rf = classification_report(yval, p_rf)
print(report_rf)

              precision    recall  f1-score   support

         0.0       0.75      0.70      0.72       172
         1.0       0.74      0.78      0.76       186

    accuracy                           0.74       358
   macro avg       0.74      0.74      0.74       358
weighted avg       0.74      0.74      0.74       358



In [305]:
# Score the random forest on its own probabilities (`proba_rf`). The bare name
# `proba` is never assigned at notebook level (it only exists as a local inside
# `tune_lgbm`), so using it here fails on a fresh kernel or silently scores
# whatever stale value is left in the session.
roc_auc_score(yval, proba_rf), average_precision_score(yval, proba_rf)

(0.8516816704176045, 0.8398157487041207)

In [306]:
# Threshold metrics of the random forest, in the order:
# precision, recall, F1, accuracy.
tuple(
    metric(yval, p_rf)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

(0.7397959183673469,
 0.7795698924731183,
 0.7591623036649214,
 0.7430167597765364)

# LightGBM (LGBM)

In [317]:
def tune_lgbm(args):
    """Objective for the hyper-parameter search: fit once, score on validation.

    Parameters
    ----------
    args : sequence
        One candidate configuration, in this order: learning rate, max depth,
        min_child_samples, subsample, colsample_bytree, n_estimators,
        TF-IDF min_df, and the largest n-gram size n (the n-gram range used
        is (1, n)).

    Returns
    -------
    float
        Negative average precision on the validation split, so that a
        minimizer maximizes average precision.

    Notes
    -----
    Prints the validation ROC AUC of every evaluated configuration.
    Reads the notebook-level `train_text`, `val_text`, `train_numeric`,
    `val_numeric`, `ytrain` and `yval`.
    """
    ## LightGBM parameters
    (lr, max_depth, min_child_samples,
     subsample, colsample_bytree, n_estimators) = args[:6]

    ## Tfidf parameters
    min_df, max_ngram = args[6:8]
    ngram_range = (1, max_ngram)

    # The vocabulary is learned on the training questions only.
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    questions_bow_train = vectorizer.fit_transform(train_text.questions_cleaned)
    questions_bow_val = vectorizer.transform(val_text.questions_cleaned)

    # NOTE(review): product_id is part of the features here, whereas the cell
    # that trains the final model with the best parameters stacks the question
    # TF-IDF matrix alone - confirm which feature set is intended.
    Xtrain_stack = hstack([train_numeric,questions_bow_train])
    Xval_stack = hstack([val_numeric,questions_bow_val])

    # num_leaves is tied to max_depth (2 ** max_depth).
    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                        min_child_samples=min_child_samples, subsample=subsample,
                        colsample_bytree=colsample_bytree, bagging_freq=1, n_estimators=n_estimators,
                        random_state=0, class_weight='balanced', n_jobs=6)

    mdl.fit(Xtrain_stack, ytrain)

    # Only probabilities are needed for the ranking metrics below; no hard
    # class predictions are computed.
    proba = mdl.predict_proba(Xval_stack)[:,1]

    print(roc_auc_score(yval,proba))

    return -average_precision_score(yval, proba)
    

In [318]:
# Search dimensions keyed by what they control. The insertion order is the
# positional order in which `tune_lgbm` reads its argument list.
_search_dimensions = {
    'learning_rate': (1e-3, 1e-1, 'log-uniform'),
    'max_depth': (1, 10),
    'min_child_samples': (1, 20),
    'subsample': (0.05, 1.),
    'colsample_bytree': (0.05, 1.),
    'n_estimators': (100, 1000),
    'tfidf_min_df': (1, 5),
    'tfidf_max_ngram': (1, 5),  # upper end n of ngram_range=(1, n)
}
space = list(_search_dimensions.values())

In [None]:
# Minimize `tune_lgbm` over `space`: 50 evaluations in total, 20 of them
# random starts, with a fixed seed and progress output enabled.
search_options = dict(random_state=160745, n_random_starts=20, n_calls=50, verbose=1)
res = forest_minimize(tune_lgbm, space, **search_options)

In [320]:
# Keep the `x` attribute of the search result; the training cell below reads
# it position by position as the best parameters.
parameters = res.x

In [321]:
# Display the selected parameter values (same positional order as `space`).
parameters

[0.006187068367625307, 5, 3, 0.35655474283122957, 0.149808846705096, 925, 1, 1]

In [322]:
# Retrain with the best parameters. Positions follow the order of `space`:
# six LightGBM settings first, then the two TF-IDF settings.
(lr, max_depth, min_child_samples,
 subsample, colsample_bytree, n_estimators) = parameters[:6]
min_df, max_ngram = parameters[6:8]
ngram_range = (1, max_ngram)

# TF-IDF vocabulary is learned on the training questions and reused as-is
# for the validation questions.
vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
questions_bow_train = vectorizer.fit_transform(train_text['questions_cleaned'])
questions_bow_val = vectorizer.transform(val_text['questions_cleaned'])

# Only the question TF-IDF features feed this model.
Xtrain_stack = hstack([questions_bow_train])
Xval_stack = hstack([questions_bow_val])

lgbm_params = dict(
    learning_rate=lr,
    num_leaves=2 ** max_depth,  # tied to max_depth
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight='balanced',
    n_jobs=6,
)
mdl = LGBMClassifier(**lgbm_params)
mdl.fit(Xtrain_stack, ytrain)

# Validation outputs: probability of the second class (column 1) and hard labels.
proba_lgbm = mdl.predict_proba(Xval_stack)[:, 1]
p_lgbm = mdl.predict(Xval_stack)

In [323]:
# Ranking metrics of the LightGBM model on the validation split:
# (ROC AUC, average precision).
tuple(
    metric(yval, proba_lgbm)
    for metric in (roc_auc_score, average_precision_score)
)

(0.7845555138784697, 0.7894329269058277)

In [324]:
# Threshold metrics of the LightGBM model, in the order:
# precision, recall, F1, accuracy.
tuple(
    metric(yval, p_lgbm)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

(0.8344827586206897,
 0.6505376344086021,
 0.7311178247734138,
 0.7513966480446927)

In [325]:
# Per-class precision / recall / F1 of the LightGBM model on the validation split.
report_lgbm = classification_report(yval, p_lgbm)
print(report_lgbm)

              precision    recall  f1-score   support

         0.0       0.69      0.86      0.77       172
         1.0       0.83      0.65      0.73       186

    accuracy                           0.75       358
   macro avg       0.76      0.76      0.75       358
weighted avg       0.77      0.75      0.75       358



# Tests

In [326]:
import string
from nltk.corpus import stopwords
from string import punctuation
import re

In [279]:
def remove_punct(text):
    """Normalize a value into lowercase text without punctuation or digits.

    The input is converted with ``str()``, every character listed in
    ``string.punctuation`` is deleted (not replaced), each run of ASCII
    digits becomes a single space, and the result is lower-cased.
    """
    # A translation table that maps every punctuation character to "deleted".
    strip_table = str.maketrans('', '', string.punctuation)
    without_punct = str(text).translate(strip_table)
    without_digits = re.sub('[0-9]+', ' ', without_punct)
    return without_digits.lower()

def remove_stops(text):
    """Remove Portuguese stopwords from `text`.

    The text is split on whitespace, words whose lowercase form is in the
    NLTK Portuguese stopword list are dropped, and the remaining words are
    joined back with single spaces (original casing is kept).
    """
    # Load the list once per call and use a set for constant-time membership
    # tests; calling stopwords.words() inside the loop would re-invoke it for
    # every single word of the text.
    stops = set(stopwords.words('portuguese'))
    return ' '.join(word for word in text.split() if word.lower() not in stops)

In [245]:
def predict_from_data(product_id,text):
    """Clean a raw question, vectorize it and print the model's predicted class.

    Parameters
    ----------
    product_id
        Value placed in the `product_id` feature column, used only when the
        fitted model was trained with that column.
    text
        Raw question text; it goes through `remove_punct` and `remove_stops`
        before vectorization.

    Relies on the notebook-level `vectorizer` and `mdl`. Prints the cleaned
    text (as a one-element list) and the predicted label; returns None.
    """
    text_clean = remove_stops(remove_punct(text))
    text_list = [text_clean]

    print(text_list)

    text_vec = vectorizer.transform(text_list)

    # The cell that trains the final model fits `mdl` on the question TF-IDF
    # matrix alone, so always prepending product_id yields one column more
    # than the model was trained on. Prepend it only when the fitted model
    # actually expects that extra column.
    n_expected = getattr(mdl, 'n_features_in_', None) or getattr(mdl, 'n_features_', None)
    if n_expected == text_vec.shape[1] + 1:
        numeric = pd.DataFrame({'product_id': [product_id]})
        stack = hstack([numeric,text_vec])
    else:
        stack = text_vec

    p = mdl.predict(stack)

    print('Previsao:', p)


In [340]:
# Manual check of the prediction helper with one sample question and product_id 0.
sample_question = 'Tem entrada para pen drive e se tem a função de trocar de pastas do mesmo? Se não tiver manda o link de uma caixa que tenha essas descrições. Se vc tiver.'
predict_from_data(0, sample_question)

['entrada pen drive função trocar pastas manda link caixa descrições vc']
Previsao: [1.]


In [242]:
# Read one question typed by the user and classify it with the fitted model.
x = [input()]

# The vectorizer vocabulary is learned from the `questions_cleaned` column, so
# the typed question gets the same cleaning that `predict_from_data` applies
# before it is vectorized.
x_clean = [remove_stops(remove_punct(question)) for question in x]
x_vec = vectorizer.transform(x_clean)
p = mdl.predict(x_vec)

print('Previsão para a pergunta: ', int(p[0]))

 Olá quero 3 unidades com fretes grátis para 29903082


Previsão para a pergunta:  1
