In [12]:
import pandas as pd
import warnings

# Render floats with two decimal places so describe() output stays readable
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Silence only deprecation-style warnings. A blanket 'ignore' would also hide
# actionable warnings (e.g. scikit-learn convergence or undefined-metric
# warnings) that signal a real problem with a model or a metric.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
# Single StandardScaler instance shared by the notebook; it is fitted on the training split further down
scaler = StandardScaler()

In [13]:
# Load the preprocessed ENEM 2023 data (semicolon-separated, UTF-8 encoded)
enemData = pd.read_csv(
    './Preprocessamento/PreprocessedDataframe.csv',
    sep=';',
    encoding='utf-8',
)

In [14]:
# Show the dataset dimensions as (rows, columns)
enemData.shape

(3933955, 7)

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
enemData.describe()

Unnamed: 0,TP_ST_CONCLUSAO,SG_UF_PROVA_NUM,Q006,Q010,Q011,PRESENCA_COMPLETA,GRUPO_ETARIO
count,3933955.0,3933955.0,3933955.0,3933955.0,3933955.0,3933955.0,3933955.0
mean,1.68,14.86,3.49,0.56,0.27,0.68,1.48
std,0.75,7.53,3.53,0.71,0.52,0.47,0.77
min,1.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,9.0,1.0,0.0,0.0,0.0,1.0
50%,2.0,15.0,2.0,0.0,0.0,1.0,1.0
75%,2.0,21.0,5.0,1.0,0.0,1.0,2.0
max,4.0,27.0,16.0,4.0,4.0,1.0,4.0


In [16]:
# Show basic structural information: index range, column names, dtypes and memory usage
enemData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3933955 entries, 0 to 3933954
Data columns (total 7 columns):
 #   Column             Dtype
---  ------             -----
 0   TP_ST_CONCLUSAO    int64
 1   SG_UF_PROVA_NUM    int64
 2   Q006               int64
 3   Q010               int64
 4   Q011               int64
 5   PRESENCA_COMPLETA  int64
 6   GRUPO_ETARIO       int64
dtypes: int64(7)
memory usage: 210.1 MB


In [17]:
# Compute the pairwise correlation matrix between the columns
corr = enemData.corr()

In [18]:
# Column names are used instead of positional indices so that a change in the
# CSV column order cannot silently swap a feature with the target.
FEATURE_COLUMNS = [
    'TP_ST_CONCLUSAO',
    'SG_UF_PROVA_NUM',
    'Q006',
    'Q010',
    'Q011',
    'GRUPO_ETARIO',
]
TARGET_COLUMN = 'PRESENCA_COMPLETA'

# Features (independent variables)
X = enemData[FEATURE_COLUMNS]

# Target (dependent variable)
y = enemData[TARGET_COLUMN]

In [19]:
# Split the data into training (80%) and test (20%) sets. A fixed random_state
# makes the split, and therefore every metric reported below, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Fit the scaler on the training split only and reuse the fitted parameters on
# the test split, so no test-set statistics leak into training.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
# Sanity-check the split: print the shape of each partition, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(3147164, 6)
(786791, 6)
(3147164,)
(786791,)


In [21]:
# Helper that fits a classifier and reports its test-set metrics
def avaliar_modelo(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training data and print its test-set metrics.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``clf.fit``.
        X_test: Test features passed to ``clf.predict``.
        y_train: Training labels passed to ``clf.fit``.
        y_test: True test labels the predictions are scored against.

    Returns:
        None. Accuracy, precision, recall and F1-score are printed, one per
        line, formatted with two decimal places.
    """
    clf.fit(X_train, y_train)
    previsoes = clf.predict(X_test)

    # All four metrics are computed before anything is printed
    valor_acuracia = accuracy_score(y_test, previsoes)
    valor_precisao = precision_score(y_test, previsoes)
    valor_revocacao = recall_score(y_test, previsoes)
    valor_f1 = f1_score(y_test, previsoes)

    # Emit the report as one print call, one metric per line
    print(
        f"Acurácia: {valor_acuracia:.2f}",
        f"Precisão: {valor_precisao:.2f}",
        f"Revocação: {valor_revocacao:.2f}",
        f"F1-score: {valor_f1:.2f}",
        sep="\n",
    )

In [22]:
# Train and evaluate every candidate classifier with the same routine
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier

# Each entry: (header printed before the report, estimator class, constructor arguments)
candidatos = [
    ("Logistic Regression:", LogisticRegression, {}),
    ("\nDecision Tree:", DecisionTreeClassifier, {"random_state": 0}),
    ("\nRandom Forest:", RandomForestClassifier, {"max_depth": 10, "random_state": 0}),
    ("\nPerceptron:", Perceptron, {"tol": 1e-4, "max_iter": 10000, "random_state": 0}),
    ("\nK-Nearest Neighbors:", KNeighborsClassifier, {"n_neighbors": 3}),
]

for cabecalho, classe_modelo, parametros in candidatos:
    # Build each estimator right before it is used; `clf` ends up bound to
    # the last model evaluated, as in a sequential run
    clf = classe_modelo(**parametros)
    print(cabecalho)
    avaliar_modelo(clf, X_train, X_test, y_train, y_test)


Logistic Regression:
Acurácia: 0.69
Precisão: 0.71
Revocação: 0.91
F1-score: 0.80

Decision Tree:
Acurácia: 0.70
Precisão: 0.72
Revocação: 0.90
F1-score: 0.80

Random Forest:
Acurácia: 0.70
Precisão: 0.72
Revocação: 0.91
F1-score: 0.80

Perceptron:
Acurácia: 0.60
Precisão: 0.68
Revocação: 0.77
F1-score: 0.72

K-Nearest Neighbors:
Acurácia: 0.64
Precisão: 0.72
Revocação: 0.78
F1-score: 0.75
