In [3]:
import pandas as pd
import numpy as np


In [280]:
np.random.seed(12)

data = {
    "Student_ID": range(1, 51),
    "Age": np.random.randint(17, 25, 50),
    "Study_Hours": np.random.normal(5, 2, 50),   # average 5 hrs/day
    "Attendance": np.random.randint(60, 90, 50),
    "Internal_Marks": np.random.randint(20, 50, 50),
    "Final_Marks": np.random.randint(30, 100, 50),
    "Gender": np.random.choice(["Male", "Female"], 50)
}

df = pd.DataFrame(data)

In [282]:
# Introduce missing values
import random
random.seed(40)
for i in range(0, 50):
    x = random.randint(0, 101)
    if (x>50):
        col = 1 + x%5
        df.iloc[i, col] = np.nan

In [284]:
# Introduce inconsistency (negative study hours)
df.loc[20, "Study_Hours"] = -3
df.loc[23, "Study_Hours"] = 25
df.loc[16, "Attendance"] = 120
df.loc[16, "Attendance"] = -20

In [286]:
display(df)

Unnamed: 0,Student_ID,Age,Study_Hours,Attendance,Internal_Marks,Final_Marks,Gender
0,1,20.0,4.319627,77.0,,81.0,Female
1,2,20.0,9.054597,86.0,35.0,,Male
2,3,23.0,3.83784,,39.0,93.0,Female
3,4,22.0,6.74627,64.0,29.0,56.0,Female
4,5,18.0,6.821131,87.0,34.0,34.0,Female
5,6,19.0,7.095291,79.0,31.0,60.0,Male
6,7,,2.620723,78.0,35.0,53.0,Female
7,8,20.0,,84.0,34.0,83.0,Male
8,9,,5.110604,71.0,25.0,62.0,Female
9,10,17.0,5.424422,72.0,21.0,57.0,Male


In [288]:
df.isnull().sum()

Student_ID        0
Age               6
Study_Hours       5
Attendance        7
Internal_Marks    3
Final_Marks       4
Gender            0
dtype: int64

# Handling missing values
* Replace nan values in study_hours with mean
* Replace nan values in attendance with mean
* Replace nan values in age with median
* Drop row if internal or final marks is 0

In [291]:
# replace nan values in study_hours with mean
df["Study_Hours"] = df["Study_Hours"].fillna(df["Study_Hours"].mean())

# replace nan values in attendance with mean
df["Attendance"] = df["Attendance"].fillna(df["Attendance"].mean())

# replace nan values in age with median
df["Age"] = df["Age"].fillna(df["Age"].median())

# drop row if internal or final marks is 0
df = df.dropna()

In [293]:
df.isnull().sum()

Student_ID        0
Age               0
Study_Hours       0
Attendance        0
Internal_Marks    0
Final_Marks       0
Gender            0
dtype: int64

# Handling inconsistencies
* Rule 1: Study hours should be between 0 and 12
* Rule 2: Attendance should be between 0 and 100

Inconsistent values are replaced by median

In [296]:
df.loc[df["Study_Hours"] < 0, "Study_Hours"] = df["Study_Hours"].median()
df.loc[df["Study_Hours"] > 12, "Study_Hours"] = df["Study_Hours"].median()
df.loc[df["Attendance"] > 100, "Attendance"] = df["Attendance"].median()
df.loc[df["Attendance"] < 0, "Attendance"] = df["Attendance"].median()

# Handle outliers by IQR method

In [321]:
# Introduce outliers
df.loc[2, "Attendance"] = 10
df.loc[4, "Internal_Marks"] = 3
df.loc[5, "Final_Marks"] = 4
df.loc[8, "Age"] = 30
df.loc[10, "Study_Hours"] = 11

In [323]:
numeric_cols = ["Age", "Study_Hours", "Attendance", "Internal_Marks", "Final_Marks"]

for col in numeric_cols:
    Q1 = np.percentile(df[col], 25, interpolation = 'midpoint') 
    Q2 = np.percentile(df[col], 50, interpolation = 'midpoint') 
    Q3 = np.percentile(df[col], 75, interpolation = 'midpoint') 
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print(col, lower_bound, upper_bound)


    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])


Age 12.5 28.5
Study_Hours 0.2817704416073248 9.664597964496064
Attendance 49.5 93.5
Internal_Marks 7.75 57.75
Final_Marks 7.75 129.75


In [325]:
display(df)


Unnamed: 0,Student_ID,Age,Study_Hours,Attendance,Internal_Marks,Final_Marks,Gender
2,3,23.0,3.83784,49.5,39.0,93.0,Female
3,4,22.0,6.74627,64.0,29.0,56.0,Female
4,5,18.0,6.821131,87.0,7.75,34.0,Female
5,6,19.0,7.095291,79.0,31.0,7.75,Male
6,7,20.0,2.620723,78.0,35.0,53.0,Female
7,8,20.0,4.88629,84.0,34.0,83.0,Male
8,9,28.5,5.110604,71.0,25.0,62.0,Female
9,10,17.0,5.424422,72.0,21.0,57.0,Male
10,11,23.0,9.664598,89.0,35.0,96.0,Male
11,12,18.0,7.404327,71.44186,42.0,98.0,Male
