In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [3]:
file_2019_2 = 'data/Notas-2019-2.xlsx'
file_2020_1 = 'data/Notas-2020-1.xlsx'
# Agregar columna AI a 2020-1 para quitar quienes retiraron el curso
df_2019_2 = pd.read_excel(file_2019_2, index_col=None, na_values=['NA'], nrows=381, usecols='R,AH')
df_2020_1 = pd.read_excel(file_2020_1, index_col=None, na_values=['NA'], nrows=382, usecols='S,AH')

# Rename columns
df_2019_2.columns = ['T00', 'Final']
df_2020_1.columns = ['T00', 'Final']

# Concatenate n stuff
concatenate = [df_2019_2, df_2020_1]
df = pd.concat(concatenate).dropna()
df = df[df.Final != 'P'].apply(pd.to_numeric)
df.index = range(760)
df = df.apply(lambda x: x*100, axis=1)
df

Unnamed: 0,T00,Final
0,100.0,100.000000
1,481.0,486.330556
2,613.0,400.000000
3,720.0,557.628571
4,659.0,739.863492
...,...,...
755,100.0,100.000000
756,550.0,550.000000
757,670.0,700.000000
758,210.0,110.000000


In [4]:
train_data = df[['T00', 'Final']].to_numpy()
train, test = train_test_split(train_data, test_size=0.3)

train_x, train_y  = train[:, [True, False]], train[:, [False, True]]
test_x, test_y = test[:, [True, False]], test[:, [False, True]]

train_x = train_x.astype('int')
train_y = train_y.astype('int')
test_x = test_x.astype('int')
test_y = test_y.astype('int')


gaussian = GaussianNB()
gaussian.fit(train_x, train_y)

pred_y = gaussian.predict(test_x)

pred_y



array([390, 390, 700, 100, 542, 568, 489, 479, 360, 700, 414, 644, 400,
       644, 516, 404, 700, 700, 542, 100, 110, 542, 255, 700, 400, 400,
       700, 390, 700, 420, 400, 100, 400, 301, 700, 170, 507, 700, 400,
       400, 170, 301, 340, 700, 390, 700, 542, 387, 170, 170, 700, 431,
       420, 700, 290, 400, 390, 700, 360, 700, 400, 459, 644, 240, 170,
       700, 170, 700, 459, 420, 568, 516, 700, 404, 516, 475, 700, 170,
       400, 568, 568, 253, 700, 504, 390, 700, 400, 170, 489, 170, 700,
       170, 459, 700, 700, 400, 700, 390, 400, 585, 290, 700, 606, 100,
       700, 617, 489, 390, 290, 507, 644, 431, 390, 700, 414, 489, 170,
       350, 585, 499, 253, 700, 350, 615, 414, 611, 504, 400, 700, 233,
       504, 700, 340, 100, 170, 504, 617, 240, 170, 431, 489, 414, 700,
       350, 700, 617, 489, 700, 542, 568, 700, 360, 400, 700, 617, 542,
       110, 469, 516, 431, 489, 412, 400, 542, 340, 459, 499, 170, 414,
       568, 170, 700, 340, 100, 700, 250, 516, 700, 390, 459, 39

In [5]:
from sklearn.metrics import confusion_matrix

f = lambda x: 1 if x > 395 else 0
bool_test = np.array(list(map(f, test_y)))
bool_pred = np.array(list(map(f, pred_y)))
matriz = confusion_matrix(bool_test, bool_pred)

print('Matriz de confusión')
print(matriz)

Matriz de confusión
[[ 20  22]
 [ 56 130]]


In [6]:
from sklearn.metrics import precision_score

precision = precision_score(bool_test, bool_pred)

precision_real = precision_score(test_y, pred_y, average='micro')

print('Precisión aprobados/reprobados')
print(precision)

print('Precisión de notas')
print(precision_real)

Precisión aprobados/reprobados
0.8552631578947368
Precisión de notas
0.04824561403508772
