In [116]:
import sys, os
sys.path.append(os.path.abspath("../src"))
from load_dataset import load_dataset
from utils import *


In [117]:
df_raw = load_dataset()
df = df_raw.copy()
df.columns = df.columns.str.strip()

# Splitting Data into training and testing set

In [118]:
X = df.drop(columns=["Exam_Score"])
y = df["Exam_Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data Cleaning

In [119]:
# Ordinal (fixed orders)
ord_maps = {
    "Parental_Involvement": ["Low","Medium","High"],
    "Access_to_Resources": ["Low","Medium","High"],
    "Extracurricular_Activities": ["No","Yes"],
    "Motivation_Level": ["Low","Medium","High"],
    "Internet_Access": ["No","Yes"],
    "School_Type": ["Public","Private"],
    "Family_Income": ["Low","Medium","High"],
    "Learning_Disabilities":["No","Yes"],
    "Teacher_Quality": ["Low","Medium","High"],
    "Peer_Influence": ["Negative","Neutral","Positive"],
    "Parental_Education_Level": ["High School","College","Postgraduate"],  # note Title() case
    "Distance_from_Home": ["Near","Moderate","Far"],
    "Gender":["Male","Female"]
}

Attendance: No cleaning required since the distribution is reasonable.


In [120]:
df["Sleep_Hours"].describe()

count    6607.00000
mean        7.02906
std         1.46812
min         4.00000
25%         6.00000
50%         7.00000
75%         8.00000
max        10.00000
Name: Sleep_Hours, dtype: float64

Parental_Involvement: No cleaning required since the distribution is reasonable. No null values.

- Mapping: "Low": 0, "Medium": 1, "High": 2

In [121]:
_train = X_train.copy(); X_test = X_test.copy()

X_train["Parental_Involvement"] = (
    X_train["Parental_Involvement"].astype(str).str.strip().str.title()
           .map({"Low": 0, "Medium": 1, "High": 2})
)

X_test["Parental_Involvement"] = (
    X_test["Parental_Involvement"].astype(str).str.strip().str.title()
          .map({"Low": 0, "Medium": 1, "High": 2})
)

Parental_Involvement: No cleaning required since the distribution is reasonable. No null values.

- Mapping: "Low": 0, "Medium": 1, "High": 2

In [122]:
df["Parental_Involvement"].describe()
df["Parental_Involvement"].isna().sum()

0

In [123]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Parental_Involvement"] = (
    X_train["Parental_Involvement"].astype(str).str.strip().str.title()
           .map({"Low": 0, "Medium": 1, "High": 2})
)

X_test["Parental_Involvement"] = (
    X_test["Parental_Involvement"].astype(str).str.strip().str.title()
          .map({"Low": 0, "Medium": 1, "High": 2})
)

In [124]:
X_train["Parental_Involvement"].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: Parental_Involvement, dtype: float64

Access_to_Resources: No cleaning required since the distribution is reasonable. No null values.

Mapping: "Low": 0, "Medium": 1, "High": 2

In [125]:
df["Access_to_Resources"].describe()
#df["Access_to_Resources"].isna().sum()

count                     6607
unique                       3
top        Medium             
freq                      3319
Name: Access_to_Resources, dtype: object

In [126]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Access_to_Resources"] = (
    X_train["Access_to_Resources"].astype(str).str.strip().str.title()
           .map({"Low": 0, "Medium": 1, "High": 2})
)

X_test["Access_to_Resources"] = (
    X_test["Access_to_Resources"].astype(str).str.strip().str.title()
          .map({"Low": 0, "Medium": 1, "High": 2})
)

In [127]:
X_train["Access_to_Resources"].describe()

count    5285.000000
mean        1.098013
std         0.697572
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: Access_to_Resources, dtype: float64

Extracurricular_Activities: No cleaning required since the distribution is reasonable. No null values.

Mapping: "No": 0, "Yes": 1

In [128]:
df["Extracurricular_Activities"].describe()
df["Extracurricular_Activities"].isna().sum()

0

In [129]:
X_train["Extracurricular_Activities"] = (
    X_train["Extracurricular_Activities"].astype(str).str.strip().str.title()
           .map({"No": 0, "Yes": 1})
)

X_test["Extracurricular_Activities"] = (
    X_test["Extracurricular_Activities"].astype(str).str.strip().str.title()
           .map({"No": 0, "Yes": 1})
)

In [130]:
X_train["Extracurricular_Activities"].describe()

count    5285.000000
mean        0.598486
std         0.490251
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Extracurricular_Activities, dtype: float64

Sleep hours: No cleaning required since the distribution is reasonable.

In [131]:
df["Sleep_Hours"].describe()

count    6607.00000
mean        7.02906
std         1.46812
min         4.00000
25%         6.00000
50%         7.00000
75%         8.00000
max        10.00000
Name: Sleep_Hours, dtype: float64

Previous Scores: No cleaning required as the distribution is reasonable. No null values.

In [132]:
df["Previous_Scores"].describe()
df["Family_Income"].isna().sum()

0

Motivation Level:
- No cleaning required as the distribution is reasonable.

- Ordinal Mapping

In [133]:
X_train["Motivation_Level"].describe()

count                  5285
unique                    3
top        Medium          
freq                   2688
Name: Motivation_Level, dtype: object

In [134]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Motivation_Level"] = (
    X_train["Motivation_Level"].astype(str).str.strip().str.title()
           .map({"Low": 0, "Medium": 1, "High": 2})
)

X_test["Motivation_Level"] = (
    X_test["Motivation_Level"].astype(str).str.strip().str.title()
          .map({"Low": 0, "Medium": 1, "High": 2})
)

In [135]:
X_train["Motivation_Level"].describe()

count    5285.000000
mean        0.901798
std         0.694146
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         2.000000
Name: Motivation_Level, dtype: float64

Internet_Access: No cleaning required as the distribution is reasonable. No null values.

- Mapping: "No" -> 0 , "Yes" -> 1

In [136]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Internet_Access"] = (
    X_train["Internet_Access"].astype(str).str.strip().str.title()
           .map({"No": 0, "Yes": 1})
)

X_test["Internet_Access"] = (
    X_test["Internet_Access"].astype(str).str.strip().str.title()
          .map({"No": 0, "Yes": 1})
)

In [137]:
X_train["Parental_Involvement"].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: Parental_Involvement, dtype: float64

In [138]:
X_train["Internet_Access"].describe()
X_train["Internet_Access"].isna().sum()

0

Tutoring_Sessions: No cleaning required as the distribution is reasonable. No null values.

In [139]:
X_train["Tutoring_Sessions"].isna().sum()
X_train["Tutoring_Sessions"].describe()

count    5285.000000
mean        1.499527
std         1.233464
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         8.000000
Name: Tutoring_Sessions, dtype: float64

Family_Income: No cleaning required as the distribution is reasonable. No null values.

- Mapping: "Low" -> 0 , "Medium" -> 1 , "High" -> 1

In [140]:
X_train["Family_Income"].describe()
X_train["Family_Income"].isna().sum()
X_test["Family_Income"].describe()
X_test["Family_Income"].isna().sum()

0

In [141]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Family_Income"] = (
    X_train["Family_Income"].astype(str).str.strip().str.title()
           .map({"Low": 0, "Medium": 1, "High": 2})
)

X_test["Family_Income"] = (
    X_test["Family_Income"].astype(str).str.strip().str.title()
          .map({"Low": 0, "Medium": 1, "High": 2})
)

In [142]:
X_train["Family_Income"].describe()

count    5285.000000
mean        0.785241
std         0.742519
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         2.000000
Name: Family_Income, dtype: float64

Teacher_Quality: No cleaning required as the distribution is reasonable. Found 60 null values , null values imputed to "Medium"

- Mapping: "Low" -> 0 , "Medium" -> 1 , "High" -> 1

In [143]:
X_train["Teacher_Quality"].describe()
#X_train["Teacher_Quality"].isna().sum()
# X_test["Teacher_Quality"].describe()
# X_test["Teacher_Quality"].isna().sum()

count                 5285
unique                   4
top        Medium         
freq                  3141
Name: Teacher_Quality, dtype: object

In [144]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Teacher_Quality"] = (
    X_train["Teacher_Quality"]
      .astype("string")
      .str.strip()
      .str.title()
      .fillna("Medium")
      .map({"Low": 0, "Medium": 1, "High": 2})
)

X_test["Teacher_Quality"] = (
    X_test["Teacher_Quality"]
      .astype("string")
      .str.strip()
      .str.title()
      .fillna("Medium")
      .map({"Low": 0, "Medium": 1, "High": 2})
)


In [145]:
X_train["Teacher_Quality"].describe()
#X_train["Teacher_Quality"].isna().sum()

count    5225.000000
mean        1.200957
std         0.598779
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: Teacher_Quality, dtype: float64

School_Type: No cleaning required as the distribution is reasonable. No null values.

- Mapping: "Public" -> 0 , "Private" -> 1

In [146]:
X_train["School_Type"].describe()
#X_train["School_Type"].isna().sum()
# X_test["School_Type"].describe()
# X_test["School_Type"].isna().sum()

count             5285
unique               2
top        Public     
freq              3683
Name: School_Type, dtype: object

In [147]:
X_train = X_train.copy(); X_test = X_test.copy()
X_train["School_Type"] = X_train["School_Type"].astype(str).str.strip().str.title().map({"Public":0, "Private":1})
X_test["School_Type"]  = X_test["School_Type"].astype(str).str.strip().str.title().map({"Public":0, "Private":1})

In [148]:
X_train["School_Type"].describe()

count    5285.000000
mean        0.303122
std         0.459651
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: School_Type, dtype: float64

Peer_Influence: No cleaning required as the distribution is reasonable. No null values.

In [149]:
X_train["Peer_Influence"].describe()
#X_train["Peer_Influence"].isna().sum()
# X_test["Peer_Influence"].describe()
# X_test["Peer_Influence"].isna().sum()

count                5285
unique                  3
top        Positive      
freq                 2152
Name: Peer_Influence, dtype: object

In [150]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Peer_Influence"] = (X_train["Peer_Influence"].astype("string").str.strip().str.title().map({"Negative": 0, "Neutral": 1, "Positive": 2}))
X_test["Peer_Influence"] = (X_test["Peer_Influence"].astype("string").str.strip().str.title().map({"Negative": 0, "Neutral": 1, "Positive": 2}))

In [151]:
X_train["Peer_Influence"].describe()

count    5285.000000
mean        1.197919
std         0.759868
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: Peer_Influence, dtype: float64

Physical_Activity: No cleaning required as the distribution is reasonable. No null values. No mapping required.

- Numerical Representation: 0 - 6

In [152]:
X_train["Physical_Activity"].describe()
X_train["Physical_Activity"].isna().sum()
# X_test["Physical_Activity"].describe()
# X_test["Physical_Activity"].isna().sum()

0

Learning_Disabilities: No cleaning required as the distribution is reasonable. No null values.

- Mapping: "No" -> 0 , "Yes" -> 1

In [153]:
X_train["Learning_Disabilities"].describe()
#X_train["Learning_Disabilities"].isna().sum()
# X_test["Learning_Disabilities"].describe()
# X_test["Learning_Disabilities"].isna().sum()

count                       5285
unique                         2
top        No                   
freq                        4704
Name: Learning_Disabilities, dtype: object

In [154]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Learning_Disabilities"] = (
    X_train["Learning_Disabilities"].astype(str).str.strip().str.title()
           .map({"No": 0, "Yes": 1})
)

X_test["Learning_Disabilities"] = (
    X_test["Learning_Disabilities"].astype(str).str.strip().str.title()
          .map({"No": 0, "Yes": 1})
)

In [155]:
X_train["Learning_Disabilities"].describe()


count    5285.000000
mean        0.109934
std         0.312837
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Learning_Disabilities, dtype: float64

Parental_Education_Level: No cleaning required as the distribution is reasonable. Found 69 null values in train, null values imputed to "High school"

- Mapping: "High School" -> 0, "College" -> 1, "Postgraduate" -> 2

In [156]:
X_train["Parental_Education_Level"].describe()
#X_train["Parental_Education_Level"].isna().sum()
# X_test["Parental_Education_Level"].describe()
# X_test["Parental_Education_Level"].isna().sum()

count                          5285
unique                            4
top        High School             
freq                           2591
Name: Parental_Education_Level, dtype: object

In [157]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Parental_Education_Level"] = (
    X_train["Parental_Education_Level"]
      .astype("string")
      .str.strip()
      .str.title()
      .fillna("High School")
      .map({"High School": 0, "College": 1, "Postgraduate": 2})
)

X_test["Parental_Education_Level"] = (
    X_test["Parental_Education_Level"]
      .astype("string")
      .str.strip()
      .str.title()
      .fillna("High School")
      .map({"High School": 0, "College": 1, "Postgraduate": 2})
)

In [158]:
X_train["Parental_Education_Level"].describe()
X_train["Parental_Education_Level"].isna().sum()


69

 Distance_from_Home: No cleaning required as the distribution is reasonable. Found 51 null values in train, null values imputed to "Near"

Mapping: "Near" -> 0, "Moderate" -> 1, "Far" -> 2

In [159]:
X_train["Distance_from_Home"].describe()
X_train["Distance_from_Home"].isna().sum()
# X_test["Distance_from_Home"].describe()
# X_test["Distance_from_Home"].isna().sum()

0

In [160]:
X_train = X_train.copy(); X_test = X_test.copy()

X_train["Distance_from_Home"] = (
    X_train["Distance_from_Home"]
      .astype("string")
      .str.strip()
      .str.title()
      .fillna("Near")
      .map({"Near": 0, "Moderate": 1, "Far": 2})
)

X_test["Distance_from_Home"] = (
    X_test["Distance_from_Home"]
      .astype("string")
      .str.strip()
      .str.title()
      .fillna("Near")
      .map({"Near": 0, "Moderate": 1, "Far": 2})
)

In [161]:
X_train["Distance_from_Home"].describe()
X_train["Distance_from_Home"].isna().sum()
# X_test["Distance_from_Home"].describe()
# X_test["Distance_from_Home"].isna().sum()

51

Gender: No cleaning required as the distribution is reasonable.

- Mapping: Male -> 0 , Female -> 1

In [162]:
X_train["Gender"].describe()
X_train["Gender"].isna().sum()
# X_test["Gender"].describe()
# X_test["Gender"].isna().sum()

0

In [163]:
X_train = X_train.copy(); X_test = X_test.copy()
X_train["Gender"] = X_train["Gender"].astype(str).str.strip().str.title().map({"Male":0, "Female":1})
X_test["Gender"]  = X_test["Gender"].astype(str).str.strip().str.title().map({"Male":0, "Female":1})

In [164]:
X_train["Gender"].describe()

count    5285.000000
mean        0.422138
std         0.493947
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Gender, dtype: float64

In [165]:
df.to_csv("../data/Student_Performance_Cleaned.csv", index=False)
print("Cleaned dataset saved successfully!")


Cleaned dataset saved successfully!
