In [20]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib as plt
import pandas as pd 
import numpy as n

In [60]:
df = pd.read_csv('./student_performance_prediction.csv')
df.sample(10)

Unnamed: 0,Student ID,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
1126,S01127,11.5,63.7,62.2,,Associate,No
27926,S27927,,104.3,79.2,No,Master,Yes
21938,S21939,11.9,114.2,83.6,Yes,Associate,No
32641,S32642,9.2,45.6,46.1,No,High School,No
39294,S39295,13.7,48.2,76.5,No,Associate,No
31670,S31671,3.7,82.1,71.7,,Master,Yes
1973,S01974,11.0,71.7,73.6,No,Bachelor,Yes
32212,S32213,7.2,56.9,36.5,Yes,Bachelor,Yes
34491,S34492,13.5,91.0,71.6,No,Master,No
27383,S27384,10.6,80.3,77.8,Yes,Bachelor,Yes


In [61]:
num_rows, num_columns = df.shape

print("El dataset tiene", num_rows, "filas.")

El dataset tiene 40000 filas.


In [62]:
# Contar valores NaN en cada columna
nan_counts = df.isna().sum()

# Mostrar el conteo de valores NaN por columna
print(nan_counts)

Student ID                                        0
Study Hours per Week                           1995
Attendance Rate                                1992
Previous Grades                                1994
Participation in Extracurricular Activities    2000
Parent Education Level                         2000
Passed                                         2000
dtype: int64


In [63]:
# Contar valores nulos y vacíos en cada columna
empty_counts = df.apply(lambda x: x.isnull().sum() + (x == '').sum())

# Mostrar el conteo de valores vacíos y nulos por columna
print(empty_counts)

Student ID                                        0
Study Hours per Week                           1995
Attendance Rate                                1992
Previous Grades                                1994
Participation in Extracurricular Activities    2000
Parent Education Level                         2000
Passed                                         2000
dtype: int64


In [64]:
# Obtener moda y frecuencia para columnas categóricas
for column in df.select_dtypes(include=['object']).columns:
    mode = df[column].mode()[0]
    print(f"Column: {column}")
    print(f"Mode: {mode}")
    print(f"Frequency of Mode: {df[column].value_counts().get(mode, 0)}")
    print()

Column: Student ID
Mode: S00001
Frequency of Mode: 1

Column: Participation in Extracurricular Activities
Mode: No
Frequency of Mode: 19028

Column: Parent Education Level
Mode: Bachelor
Frequency of Mode: 7685

Column: Passed
Mode: Yes
Frequency of Mode: 19011



In [65]:
# Obtener estadísticas descriptivas
statistics = df.describe(include='all')  # `include='all'` incluye columnas numéricas y categóricas
print(statistics)

       Student ID  Study Hours per Week  Attendance Rate  Previous Grades  \
count       40000          38005.000000     38008.000000     38006.000000   
unique      40000                   NaN              NaN              NaN   
top        S00001                   NaN              NaN              NaN   
freq            1                   NaN              NaN              NaN   
mean          NaN              9.962744        75.276323        65.440107   
std           NaN              5.031154        20.393418        16.503119   
min           NaN            -12.300000       -14.300000         8.300000   
25%           NaN              6.600000        61.600000        55.100000   
50%           NaN             10.000000        75.300000        65.200000   
75%           NaN             13.400000        88.800000        75.200000   
max           NaN             32.400000       150.200000       200.000000   

       Participation in Extracurricular Activities Parent Education Level  

In [66]:
# Crear una copia del DataFrame
df_copy = df.copy()

In [67]:
# Ejemplo de modificación: rellenar valores nulos con la media
df_copy.fillna(df_copy.mean(numeric_only=True), inplace=True)

In [68]:
# Verifica si los DataFrames son iguales
print(df.equals(df_copy))  

False


In [71]:
# Contar valores nulos y vacíos en cada columna
empty_counts1 = df_copy.apply(lambda x: x.isnull().sum() + (x == '').sum())

# Mostrar el conteo de valores vacíos y nulos por columna
print(empty_counts1)

Student ID                                        0
Study Hours per Week                              0
Attendance Rate                                   0
Previous Grades                                   0
Participation in Extracurricular Activities    2000
Parent Education Level                         2000
Passed                                         2000
dtype: int64


In [72]:
# Rellenar valores nulos con la moda
for column in ['Participation in Extracurricular Activities', 'Parent Education Level']:
    mode_value = df_copy[column].mode()[0]
    df_copy[column].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[column].fillna(mode_value, inplace=True)


In [79]:
# Contar valores nulos y vacíos en cada columna
empty_counts1 = df_copy.apply(lambda x: x.isnull().sum() + (x == '').sum())

# Mostrar el conteo de valores vacíos y nulos por columna
print(empty_counts1)

Student ID                                     0
Study Hours per Week                           0
Attendance Rate                                0
Previous Grades                                0
Participation in Extracurricular Activities    0
Parent Education Level                         0
Passed                                         0
dtype: int64


In [75]:
# Convertir la columna 'Passed' a valores binarios
df_copy['Passed'] = df_copy['Passed'].map({'Yes': 1, 'No': 0})

In [76]:
# Rellenar valores nulos en 'Passed' con la moda
mode_value = df_copy['Passed'].mode()[0]
df_copy['Passed'].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['Passed'].fillna(mode_value, inplace=True)


In [97]:
# Contar valores nulos y vacíos en cada columna
empty_counts1 = df_copy.apply(lambda x: x.isnull().sum() + (x == '').sum())

# Mostrar el conteo de valores vacíos y nulos por columna
print(empty_counts1)

Student ID                                         0
Study Hours per Week                               0
Attendance Rate                                    0
Previous Grades                                    0
Participation in Extracurricular Activities        0
Parent Education Level                             0
Passed                                         40000
dtype: int64


In [114]:
# Convertir 'Passed' a valores binarios
df_copy['Passed'] = df_copy['Passed'].map({'Yes': 1, 'No': 0})

In [118]:
print(df_copy.head())

  Student ID  Study Hours per Week  Attendance Rate  Previous Grades  \
0     S00001                  12.5        75.276323             75.0   
1     S00002                   9.3        95.300000             60.6   
2     S00003                  13.2        75.276323             64.0   
3     S00004                  17.6        76.800000             62.4   
4     S00005                   8.8        89.300000             72.7   

   Participation in Extracurricular Activities  Parent Education Level  Passed  
0                                          NaN                       4     NaN  
1                                          NaN                       3     NaN  
2                                          NaN                       0     NaN  
3                                          NaN                       1     NaN  
4                                          NaN                       4     NaN  


In [110]:
# Convertir 'Participation in Extracurricular Activities' a valores binarios
df_copy['Participation in Extracurricular Activities'] = df_copy['Participation in Extracurricular Activities'].map({'Yes': 1, 'No': 0})

In [100]:
df_copy['Parent Education Level'] = LabelEncoder().fit_transform(df_copy['Parent Education Level'])

In [108]:
# Contar valores nulos y vacíos en cada columna
empty_counts1 = df_copy.apply(lambda x: x.isnull().sum() + (x == '').sum())

# Mostrar el conteo de valores vacíos y nulos por columna
print(empty_counts1)

Student ID                                         0
Study Hours per Week                               0
Attendance Rate                                    0
Previous Grades                                    0
Participation in Extracurricular Activities    40000
Parent Education Level                             0
Passed                                         40000
dtype: int64


In [90]:
train_cols = [
    'Study Hours per Week', 
    'Attendance Rate', 
    'Previous Grades', 
    'Participation in Extracurricular Activities', 
    'Parent Education Level',  
    'Passed'
]

X = df_copy[train_cols]
y = df_copy['Passed']

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=50)

In [106]:
print(X_train.isnull().sum())

AttributeError: 'numpy.ndarray' object has no attribute 'isnull'

In [102]:
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test-scaler.transform(X_test)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,Study Hours per Week,Attendance Rate,Previous Grades,Participation in Extracurricular Activities,Parent Education Level,Passed
27019,14.289821,78.625201,67.001507,-0.053677,1.395951,
13856,6.093475,82.139677,66.719984,0.949057,1.395951,
32445,12.857450,86.983953,77.230176,-0.053677,1.395951,
24754,13.732788,79.100130,60.151114,0.949057,1.677260,
2666,13.334907,57.633336,47.670262,0.949057,2.239876,
...,...,...,...,...,...,...
1016,7.366694,88.123783,86.989640,0.949057,1.395951,
3920,17.393292,75.278208,65.781574,-0.053677,1.958568,
22782,14.528549,72.641095,51.423902,-0.053677,2.239876,
21996,2.751275,51.649229,84.737456,-0.053677,1.395951,


In [103]:
clf=LogisticRegression(C=1e10)

In [104]:
clf.fit(X_train,y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values