# 🎓 Student Performance Analysis and Prediction (Theory & Practical)
### Exploratory Data Analysis and Visualization (U21ADP05)
**Student:** Surya J  |  **Roll No:** 23AD059  |  **Department:** AI & DS

This notebook explores a student performance dataset through EDA, visualization, and modeling using a Multilayer Perceptron (MLP) neural network.

In [None]:
# ==============================================================
# STUDENT PERFORMANCE ANALYSIS AND PREDICTION PROJECT
# Exploratory Data Analysis and Visualization (U21ADP05)
# Author: Surya J | Roll No: 23AD059 | Dept: AI & DS | Year/Sem: V / Odd Sem
# ==============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw CSV into the working DataFrame and report its dimensions
DATA_FILE = 'student_performance.csv'
df = pd.read_csv(DATA_FILE)
loaded_shape = df.shape
print('✅ Dataset Loaded! Shape:', loaded_shape)
# Left as the final expression so the first rows are shown as the cell output.
df.head()

In [None]:
# Column dtypes / non-null counts, followed by per-column summary statistics
df.info()
# include='all' keeps non-numeric columns in the summary; transposed so each
# original column becomes one row of the table.
summary_table = df.describe(include='all').transpose()
summary_table

In [None]:
# Check missing values and duplicates, then clean the data
print('Missing Values:\n', df.isna().sum())
print('\nDuplicate Rows:', df.duplicated().sum())

# Drop exact duplicates BEFORE imputing: otherwise rows that only become
# identical after gaps are filled would be removed as well, and the medians
# would be computed with duplicated rows counted more than once. This also
# keeps the number of rows removed equal to the count printed above.
df = df.drop_duplicates().reset_index(drop=True)

# Numeric gaps -> column median.
# NOTE(review): this fills every numeric column, including whichever column is
# later used as the prediction target — confirm that imputing the target is
# acceptable, or drop those rows instead.
df = df.fillna(df.median(numeric_only=True))

# Categorical gaps -> most frequent value, so no NaN reaches the encoder.
for col in df.select_dtypes(include=['object', 'category']).columns:
    if not df[col].isna().any():
        continue
    most_frequent = df[col].mode(dropna=True)
    # An all-NaN column has no mode; leave it untouched rather than guess.
    if not most_frequent.empty:
        df[col] = df[col].fillna(most_frequent.iloc[0])

In [None]:
# Choose the column to predict and separate it from the features
preferred_target = 'theory_score'
if preferred_target in df.columns:
    target_col = preferred_target
else:
    # Fallback: use the last numeric column of the frame as the target.
    numeric_names = df.select_dtypes(include=[np.number]).columns
    target_col = numeric_names[-1]

X = df.drop(columns=[target_col])
y = df[target_col]

In [None]:
# Visualizations
sns.set(style='whitegrid', palette='muted')

# 1. Histogram (with KDE overlay) of the target column
sns.histplot(df[target_col], kde=True)
plt.title('Distribution of Theory Scores')
plt.show()

# 2. Boxplot of the target by gender (skipped when the column is absent)
if 'gender' in df.columns:
    sns.boxplot(x='gender', y=target_col, data=df)
    plt.title('Theory Score by Gender')
    plt.show()

# 3. Correlation heatmap over the numeric columns only
numeric_frame = df.select_dtypes(include=[np.number])
sns.heatmap(numeric_frame.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# 4. Pairplot, limited to the first five numeric columns
pair_cols = numeric_frame.columns[:5]
sns.pairplot(df[pair_cols])
plt.suptitle('Pairwise Relationships', y=1.02)
plt.show()

# 5. Scatter plot of the target against the practical score.
# Uses target_col rather than a hard-coded 'theory_score': when the target
# fell back to another column, 'theory_score' does not exist in df and the
# plot would fail. Also skipped when the target *is* 'practical_score', since
# plotting a column against itself carries no information.
if 'practical_score' in df.columns and target_col != 'practical_score':
    sns.scatterplot(x='practical_score', y=target_col, data=df)
    plt.title('Theory vs Practical Scores')
    plt.show()

In [None]:
# Preprocessing and data split
#
# The split is performed BEFORE the preprocessor is fitted, and the
# preprocessor is fitted on the training rows only. Fitting the scaler/encoder
# on the full dataset would leak validation/test statistics into training.

# Column groups are taken from the feature frame X, so the target column can
# never end up in either list.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current spelling
# first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
categorical_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# Split data: 15% test, then 17.65% of the remaining 85% as validation
# (0.1765 * 0.85 ~= 0.15), i.e. roughly 70 / 15 / 15.
X_train_val_raw, X_test_raw, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_train_val_raw, y_train_val, test_size=0.1765, random_state=42)

# Learn scaling statistics and category vocabularies from the training rows
# only, then apply the fitted transformer everywhere else.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so the names defined by the original cells stay available; both are
# produced with the train-fitted preprocessor.
X_train_val = preprocessor.transform(X_train_val_raw)
X_prepared = preprocessor.transform(X)

print('✅ Preprocessing Complete! Shape:', X_prepared.shape)
print('Train:', X_train.shape, 'Val:', X_val.shape, 'Test:', X_test.shape)

In [None]:
# Build MLP Model
def build_mlp(input_dim, lr=0.001, hidden=(128, 64), dropout=0.2):
    """Build and compile a fully connected regression network.

    Each entry of ``hidden`` adds a ReLU ``Dense`` layer followed by a
    ``Dropout`` layer; the network ends in a single linear output unit.

    Args:
        input_dim: Number of input features per sample.
        lr: Learning rate passed to the Adam optimizer.
        hidden: Iterable with the unit count of each hidden layer, in order.
            The default is a tuple (not a list) so the default value cannot
            be mutated and shared between calls.
        dropout: Dropout rate applied after every hidden layer.

    Returns:
        The compiled ``keras.Sequential`` model, using MSE loss and reporting
        RMSE and MAE under the metric names ``'rmse'`` and ``'mae'``.
    """
    model = keras.Sequential()
    # keras.Input(shape=...) is accepted by both Keras 2 and Keras 3, whereas
    # InputLayer(input_shape=...) is deprecated in Keras 3.
    model.add(keras.Input(shape=(input_dim,)))
    for units in hidden:
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.Dropout(dropout))
    model.add(layers.Dense(1))
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=[
            keras.metrics.RootMeanSquaredError(name='rmse'),
            keras.metrics.MeanAbsoluteError(name='mae'),
        ],
    )
    return model


# Input width is the number of columns of the prepared training matrix.
model = build_mlp(X_train.shape[1])
model.summary()

In [None]:
# Fit the network, stopping early once validation loss stops improving
stopping_options = dict(monitor='val_loss', patience=10, restore_best_weights=True)
early_stop = keras.callbacks.EarlyStopping(**stopping_options)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=2,
)

In [None]:
# Learning curves: one figure per tracked quantity, train vs validation
curve_specs = (
    # (history key, train legend label, validation legend label, figure title)
    ('loss', 'Train Loss', 'Val Loss', 'Loss vs Epoch'),
    ('mae', 'Train MAE', 'Val MAE', 'MAE vs Epoch'),
)

for metric_key, train_label, val_label, figure_title in curve_specs:
    plt.plot(history.history[metric_key], label=train_label)
    # Validation series are stored under the same key with a 'val_' prefix.
    plt.plot(history.history['val_' + metric_key], label=val_label)
    plt.title(figure_title)
    plt.legend()
    plt.show()

In [None]:
# Score the trained model on the held-out test split
raw_predictions = model.predict(X_test)
# Flattened to 1-D before being compared with y_test.
y_pred = raw_predictions.flatten()

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'📊 MSE: {mse:.3f} | RMSE: {rmse:.3f} | MAE: {mae:.3f} | R2: {r2:.3f}')

# Predicted value against the true value for every test sample
sns.scatterplot(x=y_test, y=y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted')
plt.show()

# Distribution of the residuals (actual minus predicted)
residuals = y_test - y_pred
sns.histplot(residuals, kde=True)
plt.title('Error Distribution')
plt.show()

In [None]:
# Persist the trained network to disk
MODEL_FILE = 'mlp_student_performance_model.h5'
model.save(MODEL_FILE)
print('✅ Model Saved Successfully!')