# 1. Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# 2. Load dataset

In [4]:
df = pd.read_csv('student_performance_dataset.csv')  

# 3. Explore the data

In [6]:
print(df.head())
print(df.info())
print(df.describe())
print(df.isnull().sum())  

  Student_ID  Gender  Study_Hours_per_Week  Attendance_Rate  Past_Exam_Scores  \
0       S147    Male                    31        68.267841                86   
1       S136    Male                    16        78.222927                73   
2       S209  Female                    21        87.525096                74   
3       S458  Female                    27        92.076483                99   
4       S078  Female                    37        98.655517                63   

  Parental_Education_Level Internet_Access_at_Home Extracurricular_Activities  \
0              High School                     Yes                        Yes   
1                      PhD                      No                         No   
2                      PhD                     Yes                         No   
3                Bachelors                      No                         No   
4                  Masters                      No                        Yes   

   Final_Exam_Score Pass_F

# 4. Handle missing values

In [8]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

## Impute numerical columns with mean

In [10]:
num_imputer = SimpleImputer(strategy='mean')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

## Impute categorical columns with most frequent

In [12]:
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# 5. Encode categorical variables

In [14]:
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# 6. Feature Scaling

In [16]:
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# 7. Split dataset into features (X) and target (y)

In [18]:
X = df.drop('Study_Hours_per_Week', axis=1)  
y = df['Study_Hours_per_Week']

# 8. Train-test split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 9. Final check

In [22]:
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (566, 509)
Testing data shape: (142, 509)
