In [6]:
import itertools as it
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV

### Conjunto de dados

Primeiramente, vamos carregar as bases de dados e separar os rótulos que serão preditos.


In [14]:
# Características
df_sociais = pd.read_csv("data/trabalho5_dados_sociais_4.csv")
df_modulo1 = pd.read_csv("data/trabalho5_dados_modulo1_4.csv")
df_modulo2 = pd.read_csv("data/trabalho5_dados_ateh_modulo2_4.csv")
# Rótulos
y = LabelEncoder().fit(["Sim", "Não"]).transform(df_modulo2["aprovado"])
#df_sociais = df_sociais.drop(["id", "aprovado"], axis=1)
#df_modulo1 = df_modulo1.drop(["id", "aprovado"], axis=1)
#df_modulo2 = df_modulo2.drop(["id", "aprovado"], axis=1)

In [20]:
df_modulo1

Unnamed: 0,id,quesm1,quesm1r,forum1,forum2,forum3,forum4,ativcolm1,ativcolm1r,forum1r,aprovado
0,7,1,0,1,0,1,1,1,0,0,Sim
1,33,1,0,1,1,1,1,1,0,1,Sim
2,45,1,0,1,1,1,1,1,0,0,Sim
3,50,1,0,0,0,1,0,1,0,0,Sim
4,58,0,0,0,0,0,0,0,0,0,Não
...,...,...,...,...,...,...,...,...,...,...,...
995,10859,1,1,1,1,1,1,1,1,1,Sim
996,10872,0,1,1,0,0,0,1,0,1,Sim
997,10873,1,1,0,1,1,1,1,0,1,Sim
998,10876,1,1,1,1,1,1,1,0,1,Sim


A base de dados sociais é a única que possui variáveis multi-categóricas e ordinais não processadas. Vamos pré-processá-las para que possam ser usadas pelos classificadores.


In [8]:
# Ordens das categorias
sn_cat = ["Sim", "Não"]
sexo_cat = ["Feminino", "Masculino"]
escolaridade_cat = [
    "Ensino Médio Completo",
    "Ensino Superior Incompleto",
    "Ensino Superior Completo",
    "Pós-graduação",
]
materialdidatico_cat = ["Adequado", "Muito adequado"]
prazoatividades_cat = [
    "Pouquíssimo flexível",
    "Pouco flexível",
    "Flexível",
    "Muito flexível",
]
interacaopares_cat = ["Importante", "Muito importante"]
organizacaocurso_cat = ["Organizado", "Muito organizado"]
import_ajud_tutor_cat = ["Às vezes", "Sempre"]
autoavaliacao_cat = [
    "Não, não considero",
    "Sim, considero, porém, poderia estar me esforçando mais",
    "Sim, considero",
]
pp_cat = [
    "Discordo totalmente",
    "Discordo",
    "Nem discordo, nem concordo",
    "Concordo",
    "Concordo totalmente",
]

In [9]:
# Variáveis ordinais
cols_ord = [
    "escolaridade",
    "materialdidatico",
    "prazoatividades",
    "interacaopares",
    "import.ajud.tutor",
    "autoavaliacao.x",
] + [f"pp{n + 1:03}" for n in range(37)]
cats_ord = [
    escolaridade_cat,
    materialdidatico_cat,
    prazoatividades_cat,
    interacaopares_cat,
    import_ajud_tutor_cat,
    autoavaliacao_cat,
] + [pp_cat] * 37

# Variáveis nominais
cols_nom = list(set(df_sociais.select_dtypes(object).columns) - set(cols_ord))

# Variáveis numéricas
cols_num = ["idade", "tempodeservico"]

In [10]:
def search_results(X, y, column_combinations, results_path, cache=True):
    try:
        if cache:
            with open(results_path, "rb") as file:
                search_results = pickle.load(file)
        else:
            raise FileNotFoundError()
    except FileNotFoundError:
        search_results = []
        for cols in column_combinations:
            cols_ord_ = list(set(cols_ord) & set(cols))
            cats_ord_ = [cats_ord[cols_ord.index(x)] for x in cols_ord_]
            cols_nom_ = list(set(cols_nom) & set(cols))
            cols_num_ = list(set(cols_num) & set(cols))

            transformers = []
            if cols_ord_:
                transformers.append((OrdinalEncoder(categories=cats_ord_), cols_ord_))
            if cols_nom_:
                transformers.append((OneHotEncoder(), cols_nom_))
            if cols_num_:
                transformers.append((StandardScaler(), cols_num_))

            steps = []
            if transformers:
                steps.append(("transformer", make_column_transformer(*transformers)))
            steps.append(("estimator", LogisticRegression())),  # dummy
            pipe = Pipeline(steps)

            search = GridSearchCV(
                pipe,
                param_grid=[
                    {
                        "estimator": [LogisticRegression()],
                    },
                    {
                        "estimator": [GradientBoostingClassifier()],
                    },
                    {
                        "estimator": [RandomForestClassifier()],
                    },
                ],
                n_jobs=-1,
            )
            _ = search.fit(X[cols], y)

            result = pd.DataFrame(search.cv_results_)
            result["columns"] = [cols] * len(result)
            search_results.append(result)

        search_results = pd.concat(search_results).sort_values(
            by="mean_test_score", ascending=False
        )
        with open(results_path, "wb") as file:
            pickle.dump(search_results, file)

    return search_results

### Dados Sociais


In [11]:
column_combinations = [
    list(cols)
    for n_cols in range(1, len(df_sociais.columns) + 1)
    for cols in it.islice(it.combinations(df_sociais.columns, n_cols), 2)
]
results_sociais = search_results(
    X=df_sociais,
    y=y,
    column_combinations=column_combinations,
    results_path="data/search_results_1.pkl",
    cache=False,
)
results_sociais.head()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,columns
0,0.023737,0.002221,0.006982,0.001544649,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.575,0.57,0.57,0.569,0.003742,1,"[idade, sexo, escolaridade, estadocivil, tempo..."
0,0.013688,0.006172,0.003608,0.0005835306,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.57,0.565,0.57,0.567,0.002449,1,[idade]
0,0.017353,0.004398,0.004588,0.0004883442,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.57,0.565,0.57,0.567,0.002449,1,"[idade, sexo]"
0,0.016556,0.000798,0.004987,1.040336e-06,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.57,0.565,0.57,0.567,0.002449,1,"[idade, sexo, estadocivil]"
0,0.017753,0.002631,0.005984,3.693565e-07,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.57,0.565,0.57,0.567,0.002449,1,"[idade, sexo, escolaridade]"


In [155]:
results_sociais.describe()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
count,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0
mean,0.147937,0.006166,0.012125,0.001450947,0.536677,0.47914,0.49028,0.506516,0.528538,0.50823,0.031404,1.978495
std,0.083695,0.005119,0.004491,0.001284821,0.026427,0.040995,0.036504,0.028118,0.017878,0.017663,0.011985,0.827576
min,0.004801,0.0004,0.0018,9.536743e-08,0.47,0.41,0.415,0.43,0.475,0.471,0.002,1.0
25%,0.056613,0.002136,0.009002,0.0004900961,0.525,0.45,0.46,0.485,0.52,0.498,0.025179,1.0
50%,0.168237,0.004119,0.012603,0.001020039,0.535,0.465,0.49,0.51,0.525,0.51,0.031718,2.0
75%,0.231052,0.009917,0.015204,0.001959927,0.555,0.51,0.515,0.525,0.535,0.515,0.039623,3.0
max,0.296267,0.03649,0.024602,0.009606115,0.605,0.58,0.575,0.57,0.575,0.569,0.061887,3.0


### Dados dos módulos

In [152]:
df_modulo = pd.concat([df_modulo1, df_modulo2], axis=1)

column_combinations = [
    list(cols)
    for n_cols in range(1, len(df_modulo.columns) + 1)
    for cols in it.islice(it.combinations(df_modulo.columns, n_cols), 2)
]
results_modulos = search_results(
    X=df_modulo,
    y=y,
    column_combinations=column_combinations,
    results_path="data/search_results_1.pkl",
    cache=False,
)
results_modulos.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,columns
0,0.008221,0.000398,0.001396,0.0004847232,LogisticRegression(),{'estimator': LogisticRegression()},0.675,0.715,0.74,0.73,0.705,0.713,0.022494,1,"[quesm1, quesm1r, forum1, forum2, forum3, foru..."
0,0.00761,0.000589,0.001609,0.0005820859,LogisticRegression(),{'estimator': LogisticRegression()},0.675,0.715,0.74,0.72,0.715,0.713,0.021119,1,"[quesm1, quesm1r, forum1, forum2, forum3, foru..."
0,0.007499,0.000449,0.0014,0.0004899792,LogisticRegression(),{'estimator': LogisticRegression()},0.64,0.735,0.74,0.715,0.72,0.71,0.036194,1,"[quesm1, quesm1r, forum1, forum2, forum3, foru..."
1,0.111025,0.002098,0.002201,0.0004003764,GradientBoostingClassifier(),{'estimator': GradientBoostingClassifier()},0.675,0.75,0.685,0.745,0.69,0.709,0.031843,1,"[quesm1, quesm1r, forum1, forum2, forum3, foru..."
1,0.120027,0.002608,0.002,4.15697e-07,GradientBoostingClassifier(),{'estimator': GradientBoostingClassifier()},0.675,0.75,0.685,0.745,0.69,0.709,0.031843,1,"[quesm1, quesm1r, forum1, forum2, forum3, foru..."


In [156]:
results_modulos.describe()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
count,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0,159.0
mean,0.080293,0.002761748,0.004076,0.000564,0.642107,0.68827,0.675535,0.686761,0.669151,0.672365,0.023981,1.918239
std,0.05736,0.002477731,0.003544,0.000404,0.043811,0.053796,0.047409,0.046307,0.042252,0.042738,0.008595,0.834149
min,0.003201,1.168008e-07,0.001,0.0,0.52,0.575,0.565,0.585,0.55,0.573,0.002,1.0
25%,0.009002,0.0008001328,0.001497,0.0004,0.6325,0.65,0.65,0.655,0.65,0.6565,0.016793,1.0
50%,0.099223,0.00199995,0.001803,0.00049,0.655,0.705,0.685,0.695,0.69,0.692,0.025962,2.0
75%,0.130429,0.004297512,0.008202,0.000748,0.675,0.73,0.7,0.72,0.7,0.704,0.031401,3.0
max,0.162836,0.01303535,0.012003,0.001624,0.705,0.75,0.765,0.76,0.72,0.713,0.040125,3.0


### Todos os dados

In [153]:
df_all = pd.concat([df_sociais, df_modulo1, df_modulo2], axis=1)

column_combinations = [
    list(cols)
    for n_cols in range(1, len(df_all.columns) + 1)
    for cols in it.islice(it.combinations(df_all.columns, n_cols), 2)
]
results_all = search_results(
    X=df_all,
    y=y,
    column_combinations=column_combinations,
    results_path="data/search_results_1.pkl",
    cache=False,
)
results_all.head()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,columns
0,0.016204,0.001327,0.004201,0.0004003525,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.575,0.57,0.57,0.569,0.003742,1,"[idade, sexo, escolaridade, estadocivil, tempo..."
0,0.005601,0.00049,0.001801,0.0003997088,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.57,0.565,0.57,0.567,0.002449,1,[idade]
0,0.015203,0.0016,0.004001,1.907349e-07,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.57,0.565,0.57,0.567,0.002449,1,"[idade, sexo, escolaridade, estadocivil]"
0,0.011803,0.000749,0.003001,2.861023e-07,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.57,0.565,0.57,0.567,0.002449,1,"[idade, sexo, estadocivil]"
0,0.011203,0.000749,0.003601,0.0004902128,LogisticRegression(),{'estimator': LogisticRegression()},0.565,0.565,0.57,0.565,0.57,0.567,0.002449,1,"[idade, sexo, escolaridade]"


In [154]:
results_all.describe()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
count,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0
mean,0.161358,0.007874,0.01365,0.002036335,0.537041,0.476834,0.485598,0.507632,0.527464,0.506914,0.031957,1.979266
std,0.088646,0.006286,0.004773,0.002236802,0.023796,0.038396,0.035999,0.029031,0.015918,0.016701,0.011113,0.825638
min,0.005601,0.0004,0.0014,1.168008e-07,0.47,0.41,0.405,0.43,0.48,0.47,0.002,1.0
25%,0.059213,0.003213,0.011003,0.0007433807,0.525,0.45,0.46,0.485,0.52,0.496,0.026721,1.0
50%,0.190843,0.006257,0.013803,0.001356853,0.535,0.46,0.485,0.515,0.525,0.511,0.031401,2.0
75%,0.245055,0.011703,0.01747,0.002512795,0.555,0.51,0.51,0.535,0.535,0.514,0.039774,3.0
max,0.377084,0.085963,0.036008,0.0235083,0.6,0.58,0.575,0.57,0.575,0.569,0.062338,3.0
