In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score

In [26]:
df = pd.read_csv('labeling - qea_nolabels.csv').dropna()
df2 = pd.read_csv('active_labels1_labeled.csv', index_col=0)

In [28]:
df2['new'] = 1 

In [29]:
df3 = pd.concat([df, df2.drop('p',axis=1)]).fillna(0)

In [65]:
X_train, X_val, y_train,y_val = train_test_split(df.questions, df.label, random_state=0, test_size=0.4)

In [66]:
vectorizer = TfidfVectorizer(min_df=3)
X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow = vectorizer.transform(X_val)

In [67]:
mdl = RandomForestClassifier(n_estimators=1000,random_state=0, class_weight='balanced')
mdl.fit(X_train_bow,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [138]:
p = mdl.predict(X_val_bow)
proba = mdl.predict_proba(X_val_bow)[:,0] ## probabilidade para exemplos 0

In [139]:
print(classification_report(y_val, p))

              precision    recall  f1-score   support

         0.0       0.77      0.86      0.81       181
         1.0       0.58      0.42      0.49        80

    accuracy                           0.73       261
   macro avg       0.67      0.64      0.65       261
weighted avg       0.71      0.73      0.71       261



In [140]:
average_precision_score(y_val, proba), roc_auc_score(y_val, proba)

(0.2047524494795143, 0.2415400552486188)

In [141]:
print(confusion_matrix(y_val,p))

[[156  25]
 [ 46  34]]


In [142]:
pred_series = pd.Series(proba)

In [143]:
preds = pred_series.map(lambda x: 0 if x>0.65 else 1)

In [144]:
print(classification_report(y_val,preds))

              precision    recall  f1-score   support

         0.0       0.85      0.67      0.75       181
         1.0       0.50      0.72      0.59        80

    accuracy                           0.69       261
   macro avg       0.67      0.70      0.67       261
weighted avg       0.74      0.69      0.70       261



In [136]:
print(confusion_matrix(y_val, preds))

[[122  59]
 [ 22  58]]


In [163]:
recall_score(y_val, p)

0.425

In [164]:
precision_score(y_val, p)

0.576271186440678

In [87]:
(y_val==0).sum()/len(y_val)

0.6934865900383141

In [None]:
# 70 dos dados sao negativos
# 30% sao positivos

In [113]:
(preds==0).sum()/len(preds)

0.5517241379310345

In [None]:
# preds - 55% dos dados sao negativos
# 45% dos dados positivos

In [111]:
(p==0).sum()/len(p)

0.7739463601532567

In [134]:
x = [input()]
x_vec = vectorizer.transform(x)
p = mdl.predict(x_vec)

print('Previsão para a pergunta: ', int(p[0]))

 Qual cor?


Previsão para a pergunta:  0


In [153]:
from lightgbm import LGBMClassifier

In [158]:
args = [0.08265121231498246, 7, 1, 0.7251351011494334, 0.07547006552546137, 839, 2, 3]
lr = args[0]
max_depth = args[1]
min_child_samples = args[2]
subsample = args[3]
colsample_bytree = args[4]
n_estimators = args[5]

min_df = args[6]
ngram_range = (1, args[7])

vectorizer = TfidfVectorizer(min_df=3)
X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow = vectorizer.transform(X_val)


mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                    min_child_samples=min_child_samples, subsample=subsample,
                    colsample_bytree=colsample_bytree, bagging_freq=1, n_estimators=n_estimators,
                    random_state=0, class_weight='balanced', n_jobs=6)

mdl_lgbm.fit(X_train_bow, y_train)

p_lgbm = mdl_lgbm.predict_proba(X_val_bow)

In [166]:
proba_lgbm = mdl_lgbm.predict_proba(X_val_bow)[:,0]

In [167]:
lgbm_series = pd.Series(proba_lgbm)

In [168]:
lgbm_preds = lgbm_series.map(lambda x: 0 if x>0.65 else 1)

In [160]:
recall_score(y_val, p_lgbm)

0.575

In [161]:
precision_score(y_val, p_lgbm)

0.5822784810126582

In [165]:
print(classification_report(y_val,p_lgbm))

              precision    recall  f1-score   support

         0.0       0.81      0.82      0.82       181
         1.0       0.58      0.57      0.58        80

    accuracy                           0.74       261
   macro avg       0.70      0.70      0.70       261
weighted avg       0.74      0.74      0.74       261



In [169]:
recall_score(y_val, lgbm_preds)

0.625

In [170]:
precision_score(y_val, lgbm_preds)

0.5434782608695652

In [171]:
print(classification_report(y_val,lgbm_preds))

              precision    recall  f1-score   support

         0.0       0.82      0.77      0.79       181
         1.0       0.54      0.62      0.58        80

    accuracy                           0.72       261
   macro avg       0.68      0.70      0.69       261
weighted avg       0.74      0.72      0.73       261

