In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import statsmodels.api as sm
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, confusion_matrix, classification_report, recall_score, f1_score, precision_score, plot_roc_curve, RocCurveDisplay, auc, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.pipeline import Pipeline
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder
import helpers as hp


### Binarias
- `school`: Escuela del estudiante. (binaria: 'GP' - Gabriel Pereira o 'MS' - Mousinho da Silveira).
- `sex`: Sexo del estudiante. (binaria: 'F' - Mujer o 'M' - Hombre).
- `address`: Ubicación de la casa del estudiante. (binaria: 'U' - urbana o 'R' - rural).
- `famsize`: Tamaño de la familia. (binaria: 'LE3' - less or equal to 3 or 'GT3' - greater than 3).
- `Pstatus`: Estado cohabitacional de los padres. (binaria: 'T' - cohabitando juntos o 'A' - viviendo separados).
- `schoolsup`: Apoyo educacional del colegio. (binaria: si o no).
- `famsup`: Apoyo educacional familiar. (binaria: si o no).
- `paid`: Clases particulares pagadas (matemáticas o portugués) (binaria: sí o no).
- `activities`: Actividades extracurriculares. (binaria: si o no).
- `nursery`: Asistió a guardería infantil. (binaria: si o no).
- `higher`: Desea proseguir estudios superiores (binaria: si o no).
- `internet`: Acceso a internet desde el hogar (binaria: si o no).
- `romantic`: Relación romántica (binaria: si o no).

### Categóricas (Nominales)
- `Mjob`: Ocupación de la madre. (nominal: 'teacher' profesora, 'health' relacionada a salud, 'services' (e.g. administración pública o policía), 'at_home' en casa u 'other' otra).
- `Fjob`: Ocupación del padre (nominal: 'teacher' profesor, 'health' relacionado a salud, 'services' (e.g. administración pública o policía), 'at_home' en casa u 'other' otra).
- `reason`: Razón para escoger la escuela (nominal: 'home' cercano a casa, 'reputation' reputación de la escuela, 'course' preferencia de cursos u 'other' otra).
- `guardian`: Apoderado del estudiante (nominal: 'mother' madre, 'father' padre u 'other' otro).

### Numéricas
- `age`: Edad del estudiante. (numérica: de 15 a 22).
- `Medu`: Nivel educacional de la madre. (numérica: 0 - ninguno, 1 - educación básica (4to), 2 - de 5to a 9, 3 - educación media, o 4 - educación superior).
- `Fedu`: Nivel educacional del padre. (numérica: 0 - ninguno, 1 - educación básica (4to), 2 - de 5to a 9, 3 - educación media, o 4 - educación superior).
- `failures`: Número de clases reprobadas. (numérica: n si 1<=n<3, de lo contrario 4).
- `famrel`: Calidad de las relaciones familiares. (numérica: de 1 - muy malas a 5 - excelentes).
- `freetime`: Tiempo libre fuera del colegio (numérica: de 1 - muy poco a 5 - mucho).
- `goout`: Salidas con amigos (numérica: de 1 - muy pocas a 5 - muchas).
- `Dalc`: Consumo de alcohol en día de semana (numérica: de 1 - muy bajo a 5 - muy alto).
- `Walc`: Consumo de alcohol en fines de semana (numérica: de 1 - muy bajo a 5 - muy alto).
- `health`: Estado de salud actual (numérica: from 1 - muy malo to 5 - muy bueno).
- `absences`: Cantidad de ausencias escolares (numérica: de 0 a 93).
- `traveltime`: Tiempo de viaje entre hogar y colegio. Se debe codificar como:
	* 1 si es menos de 15 min,
	* 2 si es de 15 a 30 min,
	* 3 si es de 30 min. a 1 hora,
	* 4 - si es más de 1 hora).
- `studytime`: Horas semanales dedicadas al estudio. Se debe codificar como:
	* 1 si es menos de 2 horas,
	* 2 si es de 2 a 5 horas,
	* 3 si es de 5 a 10 horas,
	* 4 si es más de 10 horas.

### Target (Numérico)
- `G1`: Notas durante el primer semestre (numérica: de 0 a 20). Este es uno de sus vectores objetivos para el modelo descriptivo.
- `G2`: Notas durante el segundo semestre (numérica: de 0 a 20). Este es uno de sus vectores objetivos para el modelo descriptivo.
- `G3`: Promedio final (numérica: de 0 a 20). Este es uno de sus vectores objetivos para el modelo descriptivo y el vector a predecir en el modelo predictivo.

### Resolución de aspectos adicionales a considerar:
### Parte 1: 
 - Importar correctamente la data con separador indicado
 - Recodificación de valores nulos
 - Reasignación y limpieza de 3 variables númericas ingresadas como string("age", "goout", "health)
 - No esta indicado, pero se imputa los nulos en variables categóricas por moda y para variables númericas por la media.

In [2]:
def load_students():
        df = pd.read_csv('students.csv',sep='|',index_col = 'Unnamed: 0', na_values=['nulidade', 'sem validade', 'zero'])
        df.age=df.age.str.replace('"','')
        df.goout=df.goout.str.replace('"','')
        df.health=df.health.str.replace('"','')
        #df.astype({'age': 'float', 'goout':'float','health':'float'})
        df.age = pd.to_numeric(df.age, errors='coerce')
        df.goout = pd.to_numeric(df.goout, errors='coerce')
        df.health = pd.to_numeric(df.health, errors='coerce')
        return df

In [3]:
df = load_students()
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,,U,GT3,A,4.0,4.0,at_home,teacher,...,4.0,3.0,4.0,1.0,1.0,3.0,6.0,5.0,6,6.0
1,GP,F,17.0,U,GT3,T,1.0,1.0,at_home,other,...,5.0,3.0,3.0,1.0,1.0,3.0,4.0,5.0,5,6.0
2,GP,F,15.0,U,LE3,T,1.0,1.0,at_home,other,...,4.0,3.0,2.0,2.0,3.0,3.0,10.0,,8,10.0
3,GP,F,15.0,U,GT3,T,4.0,2.0,health,services,...,3.0,2.0,2.0,1.0,1.0,5.0,2.0,15.0,14,15.0
4,GP,F,,U,GT3,T,3.0,3.0,other,other,...,4.0,3.0,2.0,1.0,2.0,5.0,4.0,6.0,10,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20.0,U,LE3,A,2.0,2.0,services,services,...,5.0,5.0,4.0,4.0,5.0,4.0,11.0,9.0,9,9.0
391,MS,M,17.0,U,LE3,T,3.0,1.0,services,services,...,2.0,4.0,5.0,3.0,4.0,2.0,3.0,14.0,16,16.0
392,MS,M,21.0,R,GT3,T,1.0,1.0,other,other,...,5.0,5.0,3.0,3.0,3.0,3.0,3.0,10.0,8,7.0
393,MS,M,18.0,R,LE3,T,3.0,2.0,services,other,...,4.0,4.0,1.0,3.0,4.0,5.0,0.0,11.0,12,10.0


#### Modelo Explicativo:

In [4]:
#Definición de dataframes eliminando G1, G2 y G3 según corresponda
df1 = df.drop(["G2", "G3"], axis=1)
df2 = df.drop(["G1", "G3"], axis=1)
df3 = df.drop(["G1", "G2"], axis=1)

In [5]:
#df.select_dtypes(np.number)

In [6]:
oe = OrdinalEncoder(encoding_method="ordered", variables=['Mjob', 'Fjob', 'reason', 'guardian'])

In [7]:
df1.dropna(axis=0, subset=["G1"], inplace=True)
x1 = df1.drop(columns="G1") 
x1_cat = x1.select_dtypes(np.object_).columns.to_list()
x1_num = x1.select_dtypes(np.number).columns.to_list()
#X1 = sm.add_constant(x1) 
y1 = df1.G1
#display(y1.isna().value_counts())

mmi1 = MeanMedianImputer(imputation_method='median')
ci1 = CategoricalImputer(imputation_method='frequent')
oe = OrdinalEncoder(encoding_method="ordered", variables= x1_cat)
sc = SklearnTransformerWrapper(StandardScaler(), variables = x1_num)

X_mmi1 = mmi1.fit_transform(x1)
X_ci1 = ci1.fit_transform(X_mmi1)
X_oe = oe.fit_transform(X_ci1,y1)
X_sc = sc.fit_transform(X_oe)
X_final = sm.add_constant(X_sc)

display(X_final.shape)
modelg1 = sm.OLS(y1,X_final).fit()
modelg1.summary()


  x = pd.concat(x[::order], 1)


(391, 31)

0,1,2,3
Dep. Variable:,G1,R-squared:,0.296
Model:,OLS,Adj. R-squared:,0.238
Method:,Least Squares,F-statistic:,5.055
Date:,"Wed, 03 Aug 2022",Prob (F-statistic):,1.24e-14
Time:,22:51:39,Log-Likelihood:,-954.83
No. Observations:,391,AIC:,1972.0
Df Residuals:,360,BIC:,2095.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.1197,1.227,4.988,0.000,3.707,8.533
school,0.0133,0.549,0.024,0.981,-1.067,1.094
sex,0.8240,0.349,2.360,0.019,0.137,1.511
age,-0.0229,0.186,-0.123,0.902,-0.389,0.343
address,0.2713,0.399,0.680,0.497,-0.513,1.056
famsize,0.3837,0.345,1.111,0.267,-0.295,1.063
Pstatus,-0.0719,0.516,-0.139,0.889,-1.087,0.943
Medu,-0.0637,0.219,-0.290,0.772,-0.495,0.368
Fedu,0.1833,0.203,0.903,0.367,-0.216,0.583

0,1,2,3
Omnibus:,7.56,Durbin-Watson:,2.164
Prob(Omnibus):,0.023,Jarque-Bera (JB):,4.951
Skew:,0.113,Prob(JB):,0.0841
Kurtosis:,2.497,Cond. No.,33.9


In [8]:
df2.dropna(axis=0, subset=["G2"], inplace=True)
x2 = df2.drop(columns="G2") 
x2_cat = x2.select_dtypes(np.object_).columns.to_list()
x2_num = x2.select_dtypes(np.number).columns.to_list()
#X1 = sm.add_constant(x1) 
y2 = df2.G2
#display(y1.isna().value_counts())

mmi2 = MeanMedianImputer(imputation_method='median')
ci2 = CategoricalImputer(imputation_method='frequent')
oe2 = OrdinalEncoder(encoding_method="ordered", variables= x2_cat)
sc2 = SklearnTransformerWrapper(StandardScaler(), variables = x2_num)

X_mmi2 = mmi2.fit_transform(x2)
X_ci2 = ci2.fit_transform(X_mmi2)
X_oe2 = oe2.fit_transform(X_ci2,y2)
X_sc2 = sc2.fit_transform(X_oe2)
X_final2 = sm.add_constant(X_sc2)


modelg2 = sm.OLS(y2,X_final2).fit()
modelg2.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,G2,R-squared:,0.265
Model:,OLS,Adj. R-squared:,0.204
Method:,Least Squares,F-statistic:,4.371
Date:,"Wed, 03 Aug 2022",Prob (F-statistic):,4.45e-12
Time:,22:51:39,Log-Likelihood:,-1022.5
No. Observations:,395,AIC:,2107.0
Df Residuals:,364,BIC:,2230.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.6779,1.422,3.994,0.000,2.882,8.474
school,-0.3971,0.637,-0.624,0.533,-1.649,0.855
sex,0.8313,0.402,2.066,0.040,0.040,1.623
age,-0.1880,0.214,-0.878,0.381,-0.609,0.233
address,0.5920,0.462,1.282,0.201,-0.316,1.500
famsize,0.5510,0.396,1.391,0.165,-0.228,1.330
Pstatus,0.3824,0.594,0.644,0.520,-0.786,1.551
Medu,0.1565,0.272,0.575,0.566,-0.379,0.692
Fedu,0.0403,0.239,0.169,0.866,-0.430,0.511

0,1,2,3
Omnibus:,16.864,Durbin-Watson:,2.05
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19.603
Skew:,-0.422,Prob(JB):,5.54e-05
Kurtosis:,3.692,Cond. No.,35.6


In [9]:
df3.dropna(axis=0, subset=["G3"], inplace=True)
x3 = df3.drop(columns="G3") 
x3_cat = x3.select_dtypes(np.object_).columns.to_list()
x3_num = x3.select_dtypes(np.number).columns.to_list()
#X1 = sm.add_constant(x1) 
y3 = df3.G3
#display(y1.isna().value_counts())

mmi3 = MeanMedianImputer(imputation_method='median')
ci3 = CategoricalImputer(imputation_method='frequent')
oe3 = OrdinalEncoder(encoding_method="ordered", variables= x3_cat)
sc3 = SklearnTransformerWrapper(StandardScaler(), variables = x3_num)

X_mmi3 = mmi3.fit_transform(x3)
X_ci3 = ci3.fit_transform(X_mmi3)
X_oe3 = oe3.fit_transform(X_ci3,y3)
X_sc3 = sc3.fit_transform(X_oe3)
X_final3 = sm.add_constant(X_sc3)


modelg3 = sm.OLS(y3,X_final3).fit()
modelg3.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,G3,R-squared:,0.26
Model:,OLS,Adj. R-squared:,0.199
Method:,Least Squares,F-statistic:,4.231
Date:,"Wed, 03 Aug 2022",Prob (F-statistic):,1.56e-11
Time:,22:51:40,Log-Likelihood:,-1092.1
No. Observations:,392,AIC:,2246.0
Df Residuals:,361,BIC:,2369.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.7846,1.820,2.630,0.009,1.206,8.363
school,-1.1058,0.777,-1.423,0.156,-2.634,0.422
sex,1.1639,0.492,2.365,0.019,0.196,2.132
age,-0.3448,0.262,-1.316,0.189,-0.860,0.170
address,0.5991,0.564,1.063,0.288,-0.509,1.707
famsize,0.6525,0.485,1.345,0.179,-0.301,1.606
Pstatus,0.4204,0.733,0.574,0.566,-1.020,1.861
Medu,0.2876,0.328,0.877,0.381,-0.357,0.933
Fedu,-0.0514,0.289,-0.178,0.859,-0.620,0.517

0,1,2,3
Omnibus:,21.386,Durbin-Watson:,2.101
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.343
Skew:,-0.583,Prob(JB):,8.53e-06
Kurtosis:,3.267,Cond. No.,37.9


In [10]:


#x1 = sm.add_constant(x1)
#x2 = sm.add_constant(x2)
#x3 = sm.add_constant(x3)

#display(x1.dtypes)

predsg1 = modelg1.predict(X_final)
predsg2 = modelg2.predict(X_final2)
predsg3 = modelg3.predict(X_final3)

print("Modelo G1")
hp.evaluation(modelg1, y1, predsg1)

print("\nModelo G2")
hp.evaluation(modelg2, y2, predsg2)

print("\nModelo G3")
hp.evaluation(modelg3, y3, predsg3)

Modelo G1
AIC es : 1971.6680778553684
BIC es : 2094.698012214915
Condition Number: 33.91437742534795
R2: 0.2963971899981841
RMSE: 2.7817834669794537 

Modelo G2
AIC es : 2107.033893987305
BIC es : 2230.37935269924
Condition Number: 35.64164495415677
R2: 0.26484897074892066
RMSE: 3.2210647271747948 

Modelo G3
AIC es : 2246.2843780947796
BIC es : 2369.393495128284
Condition Number: 37.8766813304023
R2: 0.2601610648473476
RMSE: 3.9241057060109825 


In [11]:
#df = pd.read_csv('students.csv',sep='|',index_col = 'Unnamed: 0', na_values=['nulidade', 'sem validade', 'zero'])
#df.head()

In [12]:
#df.columns

In [13]:
#df.shape

In [14]:
#df.age=df.age.str.replace('"','')
#df.goout=df.goout.str.replace('"','')
#df.health=df.health.str.replace('"','')

#df

In [15]:
#df.info()

In [16]:
#Conversión de variables tipo string a variable númerica:
#df.age = pd.to_numeric(df.age, errors='coerce')
# df.goout = pd.to_numeric(df.goout, errors='coerce')
# df.health = pd.to_numeric(df.health, errors='coerce')

In [17]:
#Se comprueba que las 3 variables string fueron convertidas a variables númericas (se mantuvo los valores NaN):
# df.select_dtypes(np.number)

In [18]:
#Visualización de variables categóricas:
# df.select_dtypes(np.object_)

In [19]:
mmi = MeanMedianImputer(imputation_method='median')
df_num_snan = mmi.fit_transform(df.select_dtypes(np.number))
df_num_snan

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,17.0,4.0,4.0,2.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0,3.0,6.0,5.0,6,6.0
1,17.0,1.0,1.0,1.0,2.0,0.0,5.0,3.0,3.0,1.0,1.0,3.0,4.0,5.0,5,6.0
2,15.0,1.0,1.0,1.0,2.0,3.0,4.0,3.0,2.0,2.0,3.0,3.0,10.0,11.0,8,10.0
3,15.0,4.0,2.0,1.0,3.0,0.0,3.0,2.0,2.0,1.0,1.0,5.0,2.0,15.0,14,15.0
4,17.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,2.0,5.0,4.0,6.0,10,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,20.0,2.0,2.0,1.0,2.0,2.0,5.0,5.0,4.0,4.0,5.0,4.0,11.0,9.0,9,9.0
391,17.0,3.0,1.0,2.0,1.0,0.0,2.0,4.0,5.0,3.0,4.0,2.0,3.0,14.0,16,16.0
392,21.0,1.0,1.0,1.0,1.0,3.0,5.0,5.0,3.0,3.0,3.0,3.0,3.0,10.0,8,7.0
393,18.0,3.0,2.0,3.0,1.0,0.0,4.0,4.0,1.0,3.0,4.0,5.0,0.0,11.0,12,10.0


In [20]:
ci = CategoricalImputer(imputation_method='frequent')
df_obj_snan = ci.fit_transform(df.select_dtypes(np.object_))
df_obj_snan

Unnamed: 0,school,sex,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,GP,F,U,GT3,A,at_home,teacher,course,mother,yes,no,no,no,yes,yes,no,no
1,GP,F,U,GT3,T,at_home,other,course,father,no,yes,no,no,no,yes,yes,no
2,GP,F,U,LE3,T,at_home,other,other,mother,yes,no,yes,no,yes,yes,yes,no
3,GP,F,U,GT3,T,health,services,home,mother,no,yes,yes,yes,yes,yes,yes,yes
4,GP,F,U,GT3,T,other,other,home,father,no,yes,yes,no,yes,yes,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,U,LE3,A,services,services,course,other,no,yes,yes,no,yes,yes,no,no
391,MS,M,U,LE3,T,services,services,course,mother,no,no,no,no,no,yes,yes,no
392,MS,M,R,GT3,T,other,other,course,other,no,no,no,no,no,yes,no,no
393,MS,M,R,LE3,T,services,other,course,mother,no,no,no,no,no,yes,yes,no


In [21]:
df = pd.concat([df_obj_snan, df_num_snan], axis=1)
df

Unnamed: 0,school,sex,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,schoolsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,U,GT3,A,at_home,teacher,course,mother,yes,...,4.0,3.0,4.0,1.0,1.0,3.0,6.0,5.0,6,6.0
1,GP,F,U,GT3,T,at_home,other,course,father,no,...,5.0,3.0,3.0,1.0,1.0,3.0,4.0,5.0,5,6.0
2,GP,F,U,LE3,T,at_home,other,other,mother,yes,...,4.0,3.0,2.0,2.0,3.0,3.0,10.0,11.0,8,10.0
3,GP,F,U,GT3,T,health,services,home,mother,no,...,3.0,2.0,2.0,1.0,1.0,5.0,2.0,15.0,14,15.0
4,GP,F,U,GT3,T,other,other,home,father,no,...,4.0,3.0,2.0,1.0,2.0,5.0,4.0,6.0,10,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,U,LE3,A,services,services,course,other,no,...,5.0,5.0,4.0,4.0,5.0,4.0,11.0,9.0,9,9.0
391,MS,M,U,LE3,T,services,services,course,mother,no,...,2.0,4.0,5.0,3.0,4.0,2.0,3.0,14.0,16,16.0
392,MS,M,R,GT3,T,other,other,course,other,no,...,5.0,5.0,3.0,3.0,3.0,3.0,3.0,10.0,8,7.0
393,MS,M,R,LE3,T,services,other,course,mother,no,...,4.0,4.0,1.0,3.0,4.0,5.0,0.0,11.0,12,10.0


### Resolución de aspectos adicionales a considerar:
### Parte 2: 
 - Se asignan 0 y 1 a las variables binarias de acuerdo a la frecuencia de la variable, es decir, a la variable con mayor frecuencia se le asigna 0 y a la menor frecuencia 1.


In [22]:
binaria = df[['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 
    'nursery', 'higher', 'internet', 'romantic']]

In [23]:
for i in binaria:
    df[i]= np.where(df[i] == df[i].unique()[0], 0, 1)

binaria

Unnamed: 0,school,sex,address,famsize,Pstatus,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,GP,F,U,GT3,A,yes,no,no,no,yes,yes,no,no
1,GP,F,U,GT3,T,no,yes,no,no,no,yes,yes,no
2,GP,F,U,LE3,T,yes,no,yes,no,yes,yes,yes,no
3,GP,F,U,GT3,T,no,yes,yes,yes,yes,yes,yes,yes
4,GP,F,U,GT3,T,no,yes,yes,no,yes,yes,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,U,LE3,A,no,yes,yes,no,yes,yes,no,no
391,MS,M,U,LE3,T,no,no,no,no,no,yes,yes,no
392,MS,M,R,GT3,T,no,no,no,no,no,yes,no,no
393,MS,M,R,LE3,T,no,no,no,no,no,yes,yes,no


In [24]:
df

Unnamed: 0,school,sex,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,schoolsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,0,0,0,at_home,teacher,course,mother,0,...,4.0,3.0,4.0,1.0,1.0,3.0,6.0,5.0,6,6.0
1,0,0,0,0,1,at_home,other,course,father,1,...,5.0,3.0,3.0,1.0,1.0,3.0,4.0,5.0,5,6.0
2,0,0,0,1,1,at_home,other,other,mother,0,...,4.0,3.0,2.0,2.0,3.0,3.0,10.0,11.0,8,10.0
3,0,0,0,0,1,health,services,home,mother,1,...,3.0,2.0,2.0,1.0,1.0,5.0,2.0,15.0,14,15.0
4,0,0,0,0,1,other,other,home,father,1,...,4.0,3.0,2.0,1.0,2.0,5.0,4.0,6.0,10,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,1,0,1,0,services,services,course,other,1,...,5.0,5.0,4.0,4.0,5.0,4.0,11.0,9.0,9,9.0
391,1,1,0,1,1,services,services,course,mother,1,...,2.0,4.0,5.0,3.0,4.0,2.0,3.0,14.0,16,16.0
392,1,1,1,0,1,other,other,course,other,1,...,5.0,5.0,3.0,3.0,3.0,3.0,3.0,10.0,8,7.0
393,1,1,1,1,1,services,other,course,mother,1,...,4.0,4.0,1.0,3.0,4.0,5.0,0.0,11.0,12,10.0


In [25]:
#Comprobamos la binarización de algunas variables categoricas binarias descritas en el enunciado a 1 y 0 (se asigno 0 a la clase con mayor frecuencia):
df.dtypes

school          int32
sex             int32
address         int32
famsize         int32
Pstatus         int32
Mjob           object
Fjob           object
reason         object
guardian       object
schoolsup       int32
famsup          int32
paid            int32
activities      int32
nursery         int32
higher          int32
internet        int32
romantic        int32
age           float64
Medu          float64
Fedu          float64
traveltime    float64
studytime     float64
failures      float64
famrel        float64
freetime      float64
goout         float64
Dalc          float64
Walc          float64
health        float64
absences      float64
G1            float64
G2              int64
G3            float64
dtype: object

In [26]:
df.shape

(395, 33)

### Resolución de aspectos adicionales a considerar:
### Parte 3: 
 - Se asigna de 0 a n, a cada una de las clases de las columnas con variables categoricas nominales ("Mjob", "job", "reason", "guardian"), asignado 0 a la clase menos frecuente hasta n a la mas frecuente.

In [27]:
#catnominales = df[["Mjob", "Fjob", "reason", "guardian"]]
types_to_count = {"object", "category", "string"}
result = {
    col: df[col].value_counts()
    for col in df.columns[df.dtypes.isin(types_to_count)]
}
result

{}

In [28]:
cat_cols = df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
Fjob,health,18
Fjob,at_home,20
Fjob,teacher,29
Fjob,services,111
Fjob,other,217
Mjob,health,33
Mjob,teacher,57
Mjob,at_home,59
Mjob,services,102
Mjob,other,144


In [29]:
#Según el acuerdo tomado en clases, la variable minoritaria (con menor frecuencia) será la que tome valor 0 y ahi cada clase ira tomando valor n de acuerdo con el aumento de frecuencia:
df.Fjob = df.Fjob.map({"health":0, "at_home":1,"teacher":2, "services":3, "other":4})
df.Mjob = df.Mjob.map({"health":0, "teacher":1,"at_home":2, "services":3, "other":4})
df.guardian = df.guardian .map({"other":0, "father":1, "mother":3 })
df.reason = df.reason.map({"other":0, "reputation":1,"home":2, "course":3})
df

#El Dataframe se encuentra depurado según las indicaciones


Unnamed: 0,school,sex,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,schoolsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,0,0,0,2,2,3,3,0,...,4.0,3.0,4.0,1.0,1.0,3.0,6.0,5.0,6,6.0
1,0,0,0,0,1,2,4,3,1,1,...,5.0,3.0,3.0,1.0,1.0,3.0,4.0,5.0,5,6.0
2,0,0,0,1,1,2,4,0,3,0,...,4.0,3.0,2.0,2.0,3.0,3.0,10.0,11.0,8,10.0
3,0,0,0,0,1,0,3,2,3,1,...,3.0,2.0,2.0,1.0,1.0,5.0,2.0,15.0,14,15.0
4,0,0,0,0,1,4,4,2,1,1,...,4.0,3.0,2.0,1.0,2.0,5.0,4.0,6.0,10,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,1,0,1,0,3,3,3,0,1,...,5.0,5.0,4.0,4.0,5.0,4.0,11.0,9.0,9,9.0
391,1,1,0,1,1,3,3,3,3,1,...,2.0,4.0,5.0,3.0,4.0,2.0,3.0,14.0,16,16.0
392,1,1,1,0,1,4,4,3,0,1,...,5.0,5.0,3.0,3.0,3.0,3.0,3.0,10.0,8,7.0
393,1,1,1,1,1,3,4,3,3,1,...,4.0,4.0,1.0,3.0,4.0,5.0,0.0,11.0,12,10.0


#### Modelo Explicativo (G1, G2, G3)

In [30]:
#Definición de dataframes eliminando G1, G2 y G3 según corresponda
df1 = df.drop(["G2", "G3"], axis=1)
df2 = df.drop(["G1", "G3"], axis=1)
df3 = df.drop(["G1", "G2"], axis=1)

In [31]:
#Modelo Explicativo N°1-Usando G1:
import statsmodels.api as sm 

x1 = df1.drop(columns="G1") 
X1 = sm.add_constant(x1) 
y1 = df1.G1

modelG1 = sm.OLS(y1,X1).fit() 
modelG1.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,G1,R-squared:,0.27
Model:,OLS,Adj. R-squared:,0.21
Method:,Least Squares,F-statistic:,4.483
Date:,"Wed, 03 Aug 2022",Prob (F-statistic):,1.68e-12
Time:,22:51:41,Log-Likelihood:,-969.92
No. Observations:,395,AIC:,2002.0
Df Residuals:,364,BIC:,2125.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.1427,2.792,3.991,0.000,5.652,16.633
school,-0.0949,0.554,-0.171,0.864,-1.184,0.995
sex,0.7866,0.352,2.232,0.026,0.093,1.480
address,-0.2422,0.409,-0.592,0.554,-1.047,0.562
famsize,0.4682,0.346,1.355,0.176,-0.211,1.148
Pstatus,0.0423,0.519,0.082,0.935,-0.978,1.062
Mjob,-0.2091,0.128,-1.634,0.103,-0.461,0.043
Fjob,-0.3173,0.152,-2.082,0.038,-0.617,-0.018
reason,-0.1838,0.155,-1.184,0.237,-0.489,0.121

0,1,2,3
Omnibus:,7.253,Durbin-Watson:,2.134
Prob(Omnibus):,0.027,Jarque-Bera (JB):,5.198
Skew:,0.152,Prob(JB):,0.0744
Kurtosis:,2.527,Cond. No.,392.0


In [32]:
#Modelo Explicativo N°2-Usando G2:
x2 = df2.drop(columns="G2") 
X2 = sm.add_constant(x2) 
y2 = df2.G2

modelG2 = sm.OLS(y2,X2).fit() 
modelG2.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,G2,R-squared:,0.262
Model:,OLS,Adj. R-squared:,0.202
Method:,Least Squares,F-statistic:,4.316
Date:,"Wed, 03 Aug 2022",Prob (F-statistic):,7.18e-12
Time:,22:51:41,Log-Likelihood:,-1023.2
No. Observations:,395,AIC:,2108.0
Df Residuals:,364,BIC:,2232.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,15.3218,3.195,4.796,0.000,9.039,21.604
school,0.3653,0.634,0.576,0.565,-0.881,1.612
sex,0.8463,0.403,2.098,0.037,0.053,1.640
address,-0.5352,0.468,-1.143,0.254,-1.456,0.385
famsize,0.5696,0.395,1.441,0.151,-0.208,1.347
Pstatus,-0.3912,0.593,-0.659,0.510,-1.558,0.776
Mjob,-0.0839,0.146,-0.573,0.567,-0.372,0.204
Fjob,-0.1894,0.174,-1.086,0.278,-0.532,0.154
reason,-0.3000,0.178,-1.689,0.092,-0.649,0.049

0,1,2,3
Omnibus:,17.808,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20.915
Skew:,-0.435,Prob(JB):,2.87e-05
Kurtosis:,3.716,Cond. No.,392.0


In [33]:
#Modelo Explicativo N°3-Usando G3:
x3 = df3.drop(columns="G3") 
X3 = sm.add_constant(x3) 
y3 = df3.G3

modelG3 = sm.OLS(y3,X3).fit() 
modelG3.summary()

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,G3,R-squared:,0.256
Model:,OLS,Adj. R-squared:,0.195
Method:,Least Squares,F-statistic:,4.182
Date:,"Wed, 03 Aug 2022",Prob (F-statistic):,2.32e-11
Time:,22:51:41,Log-Likelihood:,-1100.0
No. Observations:,395,AIC:,2262.0
Df Residuals:,364,BIC:,2385.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,15.6306,3.881,4.027,0.000,7.999,23.263
school,1.0054,0.770,1.305,0.193,-0.509,2.520
sex,1.2000,0.490,2.449,0.015,0.236,2.164
address,-0.6153,0.569,-1.082,0.280,-1.734,0.503
famsize,0.6425,0.480,1.337,0.182,-0.302,1.587
Pstatus,-0.5134,0.721,-0.712,0.477,-1.931,0.904
Mjob,-0.0456,0.178,-0.256,0.798,-0.395,0.304
Fjob,-0.3152,0.212,-1.488,0.138,-0.732,0.101
reason,-0.5168,0.216,-2.395,0.017,-0.941,-0.092

0,1,2,3
Omnibus:,22.069,Durbin-Watson:,2.109
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.201
Skew:,-0.593,Prob(JB):,5.56e-06
Kurtosis:,3.257,Cond. No.,392.0


In [34]:
from sklearn.metrics import r2_score, mean_squared_error

def evaluation(model, real, preds):
    print(f"AIC es : {model.aic}")
    print(f"BIC es : {model.bic}")
    print(f"Condition Number: {model.condition_number}")
    print(f"R2: {r2_score(real, preds)}")
    print(f"RMSE: {mean_squared_error(real, preds, squared=False)} ")
    

predsG1 = modelG1.predict(X1)
predsG2 = modelG2.predict(X2)
predsG3 = modelG3.predict(X3)

print("Modelo G1")
evaluation(modelG1, y1, predsG1)

print("\nModelo G2")
evaluation(modelG2, y2, predsG2)

print("\nModelo G3")
evaluation(modelG3, y3, predsG3)

Modelo G1
AIC es : 2001.8481749961168
BIC es : 2125.1936337080515
Condition Number: 392.0630808050208
R2: 0.26978822269720903
RMSE: 2.819516695754359 

Modelo G2
AIC es : 2108.3466955478925
BIC es : 2231.6921542598275
Condition Number: 392.0630808050208
R2: 0.26240159609767455
RMSE: 3.226421859139698 

Modelo G3
AIC es : 2262.06739004002
BIC es : 2385.412848751955
Condition Number: 392.0630808050208
R2: 0.2563418512358
RMSE: 3.9194717898102747 


#### Modelo Predictivo  (G3)

In [35]:
#Modelo usando todas las variables (menos G1 y G2):

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

X_train, X_test, y_train, y_test = train_test_split (X3, y3, test_size=0.2, random_state=42)

In [36]:
lr = LinearRegression()
lr.fit(X3,y3)

y_pred = lr.predict(X3)

print("R2:", r2_score (y3,y_pred))
print("RMSE:", mean_squared_error(y3,y_pred,squared=False))

R2: 0.2563418512358
RMSE: 3.9194717898102747


In [37]:
varpval = ["school", "address", "famsize", "Pstatus", "Mjob", "Fjob", "guardian", "schoolsup", "famsup", "paid",
            "activities", "nursery", "higher", "internet", "age", "Medu", "Fedu", "traveltime", "studytime", "famrel", 
            "freetime" "Dalc", "Walc", "health", "absences" ]