In [1]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib as plt
import pandas as pd 
import numpy as n

In [2]:
df = pd.read_csv('./student_performance_prediction.csv')
df.sample(10)

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
29125,S29126,8.2,59.5,39.2,No,Associate,Yes
37437,S37438,15.2,88.7,51.7,Yes,Doctorate,No
20725,S20726,15.0,82.4,63.5,Yes,Associate,Yes
2575,S02576,,124.2,55.5,No,High School,Yes
37849,S37850,7.0,65.8,32.8,No,,No
9551,S09552,10.4,72.2,43.9,No,High School,No
23311,S23312,13.9,97.6,98.6,No,Doctorate,No
21399,S21400,13.1,65.5,51.0,No,Doctorate,Yes
11616,S11617,8.3,72.1,39.1,No,Associate,No
18200,S18201,14.9,85.1,80.8,Yes,Bachelor,No


In [3]:
num_rows, num_columns = df.shape

print("El dataset tiene", num_rows, "filas.")

El dataset tiene 40000 filas.


In [5]:
# Contar valores NaN en cada columna
nan_counts = df.isna().sum()

# Mostrar el conteo de valores NaN por columna
print(nan_counts)

Student ID                                        0
Study Hours per Week                           1995
Attendance Rate                                1992
Previous Grades                                1994
Participation in Extracurricular Activities    2000
Parent Education Level                         2000
Passed                                         2000
dtype: int64


In [6]:
# Crear una copia del DataFrame
df_copy = df.copy()

In [7]:
# Verifica si los DataFrames son iguales
print(df.equals(df_copy))  

True


In [8]:
# Lista de las columnas que deseas llenar con la media
columns_to_fill = ['Study Hours per Week', 'Attendance Rate', 'Previous Grades']

In [9]:
# Llenar los valores nulos en cada columna con la media
for column in columns_to_fill:
    df_copy[column].fillna(df_copy[column].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[column].fillna(df_copy[column].mean(), inplace=True)


In [10]:
# Contar valores NaN en cada columna
nan_counts1 = df_copy.isna().sum()

# Mostrar el conteo de valores NaN por columna
print(nan_counts1)

Student ID                                        0
Study Hours per Week                              0
Attendance Rate                                   0
Previous Grades                                   0
Participation in Extracurricular Activities    2000
Parent Education Level                         2000
Passed                                         2000
dtype: int64


In [11]:
for column in ['Participation in Extracurricular Activities', 'Passed']:
    mode_value = df_copy[column].mode()[0]
    df_copy[column].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[column].fillna(mode_value, inplace=True)


In [12]:
# Contar valores NaN en cada columna
nan_counts1 = df_copy.isna().sum()

# Mostrar el conteo de valores NaN por columna
print(nan_counts1)

Student ID                                        0
Study Hours per Week                              0
Attendance Rate                                   0
Previous Grades                                   0
Participation in Extracurricular Activities       0
Parent Education Level                         2000
Passed                                            0
dtype: int64


In [13]:
# Convertir "Yes" en 1 y "No" en 0 para las columnas 'Participation' y 'Completed'
df_copy['Participation in Extracurricular Activities'] = df_copy['Participation in Extracurricular Activities'].map({'Yes': 1, 'No': 0})
df_copy['Passed'] = df_copy['Passed'].map({'Yes': 1, 'No': 0})

# Verificar la conversión
print(df_copy[['Participation in Extracurricular Activities', 'Passed']].head())

   Participation in Extracurricular Activities  Passed
0                                            1       1
1                                            0       0
2                                            0       0
3                                            1       0
4                                            0       0


In [14]:
# Contar valores NaN en cada columna
nan_counts1 = df_copy.isna().sum()

# Mostrar el conteo de valores NaN por columna
print(nan_counts1)

Student ID                                        0
Study Hours per Week                              0
Attendance Rate                                   0
Previous Grades                                   0
Participation in Extracurricular Activities       0
Parent Education Level                         2000
Passed                                            0
dtype: int64


In [15]:
# Ver los diferentes niveles académicos en la columna 'Parent Education Level'
niveles_educacion = df_copy['Parent Education Level'].unique()

# Mostrar los niveles
print(niveles_educacion)

['Master' 'High School' 'Associate' 'Bachelor' 'Doctorate' nan]


In [16]:
# Calcular la moda (el valor más frecuente) de la columna 'Parent Education Level'
moda_nivel_educacion = df_copy['Parent Education Level'].mode()[0]

# Reemplazar los valores NaN por la moda
df_copy['Parent Education Level'].fillna(moda_nivel_educacion, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['Parent Education Level'].fillna(moda_nivel_educacion, inplace=True)


In [19]:
# Ver los diferentes niveles académicos en la columna 'Parent Education Level'
niveles_educacion = df_copy['Parent Education Level'].unique()

# Mostrar los niveles
print(niveles_educacion)

[4 3 0 1 2]


In [18]:
df_copy['Parent Education Level'] = LabelEncoder().fit_transform(df_copy['Parent Education Level'])

In [20]:
train_cols = [
    'Study Hours per Week', 
    'Attendance Rate', 
    'Previous Grades', 
    'Participation in Extracurricular Activities', 
    'Parent Education Level',  
    'Passed'
]

X = df_copy[train_cols]
y = df_copy['Passed']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=50)

In [22]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test-scaler.transform(X_test)

Unnamed: 0,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
27019,14.289821,78.625201,67.001507,-0.053677,1.395951,0.050304
13856,6.093475,82.139677,66.719984,0.949057,1.395951,0.050304
32445,12.857450,86.983953,77.230176,-0.053677,1.395951,0.050304
24754,13.732788,79.100130,60.151114,0.949057,1.677260,1.052968
2666,13.334907,57.633336,47.670262,0.949057,2.239876,0.050304
...,...,...,...,...,...,...
1016,7.366694,88.123783,86.989640,0.949057,1.395951,1.052968
3920,17.393292,75.278208,65.781574,-0.053677,1.958568,1.052968
22782,14.528549,72.641095,51.423902,-0.053677,2.239876,0.050304
21996,2.751275,51.649229,84.737456,-0.053677,1.395951,0.050304


In [23]:
clf=LogisticRegression(C=1e10)

In [24]:
clf.fit(X_train,y_train)

In [25]:
y_pred = clf.predict(X_test)



In [26]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

[[   0 6280]
 [   0 6920]]


In [27]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

In [29]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)



In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [33]:
y_pred = rf_model.predict(X_test)

In [34]:
# Matriz de confusión
confusion = confusion_matrix(y_test, y_pred)
print("Matriz de confusión:")
print(confusion)

# Precisión del modelo
accuracy = accuracy_score(y_test, y_pred)
print(f"Precisión: {accuracy}")

# Reporte de clasificación
report = classification_report(y_test, y_pred)
print("Reporte de clasificación:")
print(report)

Matriz de confusión:
[[5729    0]
 [   0 6271]]
Precisión: 1.0
Reporte de clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5729
           1       1.00      1.00      1.00      6271

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000



In [35]:
y_test.mean()

np.float64(0.5225833333333333)