AMAN VERMA

1753

FY AI-ML

# Student Performance Prediction – Data Pre-Processing Focus

Objective: Apply Unit-1 concepts on a real dataset.

In [None]:
import pandas as pd

# Toy student records, one list per column. The None entries in Marks and
# Attendance are deliberate gaps; pandas stores them as NaN.
data = dict(
    Student=["S1", "S2", "S3", "S4", "S5"],
    Gender=["Male", "Female", "Male", "Female", "Male"],
    Marks=[65, 70, None, 85, 90],
    Attendance=[75, None, 80, 90, 95],
    StudyHours=[2, 3, 1, 4, 5],
)

# Build the working frame that every later cell reads and updates.
df = pd.DataFrame(data)

# Write a copy to disk; index=False keeps the row index out of the CSV.
df.to_csv(path_or_buf="students.csv", index=False)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Student,Gender,Marks,Attendance,StudyHours
0,S1,Male,65.0,75.0,2
1,S2,Female,70.0,,3
2,S3,Male,,80.0,1
3,S4,Female,85.0,90.0,4
4,S5,Male,90.0,95.0,5


## 1. Identify missing values

In [None]:
# Per-column count of missing entries: isna() flags each NaN cell and
# sum() adds the flags up column by column.
missing_values = df.isna().sum()

print(missing_values)

Student       0
Gender        0
Marks         1
Attendance    1
StudyHours    0
dtype: int64


## 2. Apply mean/median imputation

In [None]:
from sklearn.impute import SimpleImputer

# Columns grouped by the statistic used to fill their gaps.
mean_cols = ["Attendance", "StudyHours"]
median_cols = ["Marks"]

# NOTE(review): both imputers learn their statistics from every row, which
# happens before the train/test split in section 6 — confirm this is intended.

# Mean imputation: learn each column's mean, then fill the gaps with it.
mean_imputer = SimpleImputer(strategy="mean").fit(df[mean_cols])
df[mean_cols] = mean_imputer.transform(df[mean_cols])

# Median imputation: same two steps, using the column median instead.
median_imputer = SimpleImputer(strategy="median").fit(df[median_cols])
df[median_cols] = median_imputer.transform(df[median_cols])

# Show the dataset after imputation
print(df)


  Student  Gender  Marks  Attendance  StudyHours
0      S1    Male   65.0        75.0         2.0
1      S2  Female   70.0        85.0         3.0
2      S3    Male   77.5        80.0         1.0
3      S4  Female   85.0        90.0         4.0
4      S5    Male   90.0        95.0         5.0


## 3. Encode categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct Gender labels first; the fitted encoder stays available
# as `le`.
le = LabelEncoder().fit(df["Gender"])

# Overwrite the text labels with their integer codes.
df["Gender"] = le.transform(df["Gender"])

# Show the dataset with the encoded column
print(df)


  Student  Gender  Marks  Attendance  StudyHours
0      S1       1   65.0        75.0         2.0
1      S2       0   70.0        85.0         3.0
2      S3       1   77.5        80.0         1.0
3      S4       0   85.0        90.0         4.0
4      S5       1   90.0        95.0         5.0


## 4. Apply feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardize; the outlier step in section 5 reuses this list.
numerical_cols = ["Marks", "Attendance", "StudyHours"]

# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split in section 6, and the list includes Marks, which section 6 uses as
# the target — confirm this is intended.
scaler = StandardScaler().fit(df[numerical_cols])

# Rescale each listed column to zero mean and unit standard deviation.
df[numerical_cols] = scaler.transform(df[numerical_cols])

# Show the standardized dataset
print(df)


  Student  Gender     Marks  Attendance  StudyHours
0      S1       1 -1.355815   -1.414214   -0.707107
1      S2       0 -0.813489    0.000000    0.000000
2      S3       1  0.000000   -0.707107   -1.414214
3      S4       0  0.813489    0.707107    0.707107
4      S5       1  1.355815    1.414214    1.414214


## 5. Detect and handle outliers

In [None]:
import numpy as np  # not used below; kept so `np` stays defined for any later cell

# Quartiles for all numeric columns at once (one value per column).
numeric = df[numerical_cols]
q1 = numeric.quantile(0.25)
q3 = numeric.quantile(0.75)

# Interquartile range and the 1.5 * IQR fences on either side of it.
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr

# Cap values outside the fences; axis=1 matches each fence to its own column.
df[numerical_cols] = numeric.clip(lower=lower_fence, upper=upper_fence, axis=1)

# Show the dataset after capping
print(df)


  Student  Gender     Marks  Attendance  StudyHours
0      S1       1 -1.355815   -1.414214   -0.707107
1      S2       0 -0.813489    0.000000    0.000000
2      S3       1  0.000000   -0.707107   -1.414214
3      S4       0  0.813489    0.707107    0.707107
4      S5       1  1.355815    1.414214    1.414214


## 6. Perform train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Target: the Marks column.
y = df["Marks"]

# Features: every column except the target and the Student identifier.
X = df.drop(columns=["Marks", "Student"])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each feature partition
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (4, 3)
X_test shape: (1, 3)
