**1. Dataset Understanding & EDA**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the dataset (semicolon-delimited CSV expected next to the notebook)
DATA_PATH = 'student-mat.csv'
try:
    df = pd.read_csv(DATA_PATH, sep=';')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() would end the interpreter/kernel
    # session rather than surface the problem, whereas an exception stops
    # "Run All" at this cell with a readable message and the session intact.
    raise FileNotFoundError(
        f"'{DATA_PATH}' not found. "
        "Please download the dataset and place it in the same directory."
    ) from err
else:
    # Only announce success when the read actually completed.
    print("--- Dataset loaded successfully! ---")

In [None]:
# First look at the data: sample rows, schema, and descriptive statistics
print("\n--- Initial Data Overview ---")

print("First 5 Rows:")
first_rows = df.head()
print(first_rows)

print("\nDataset Info:")
# info() writes its report itself, so it is not wrapped in print()
df.info()

print("\nSummary Statistics:")
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Count null entries per column
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Banner marking the start of the visual EDA cells
print("\n--- Performing Exploratory Data Analysis (EDA) ---")

In [None]:
# Histogram of the final grade (G3) with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['G3'], kde=True, bins=20, ax=ax)
ax.set_title('Distribution of Final Grades (G3)')
ax.set_xlabel('Final Grade')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Correlation heatmap, restricted to numeric columns
numeric_cols = df.select_dtypes(include=np.number).columns
corr_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Numeric Features')
plt.show()

# Banner for the next notebook section
print("\n--- Starting Data Preprocessing ---")

**2. Data Preprocessing**

In [None]:
# One-hot encode every object-typed column. The encoding is applied to a copy
# so the original dataframe `df` stays untouched; the first level of each
# categorical is dropped (drop_first=True).
categorical_cols = df.select_dtypes(include=['object']).columns
df_processed = pd.get_dummies(
    df.copy(),
    columns=categorical_cols,
    drop_first=True,
)

print("Shape of dataframe after one-hot encoding:", df_processed.shape)
print("First 5 rows of preprocessed data:", df_processed.head(), sep="\n")

In [None]:
# Import necessary libraries for modeling and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Create target variables for regression and classification.
# For classification, 'pass' is defined as G3 >= 10 (a common passing grade).
df_processed['pass_fail'] = (df_processed['G3'] >= 10).astype(int)

# Separate features (X) and target variables (y)
X = df_processed.drop(['G3', 'pass_fail'], axis=1)
y_reg = df_processed['G3']  # Target for regression (predicting the grade)
y_class = df_processed['pass_fail']  # Target for classification (predicting pass/fail)

# Split BEFORE scaling so the scaler never sees test rows (avoids data leakage).
# One call splits the features and both targets with the same row partition;
# with the same random_state and test_size this is the partition the two
# separate per-task splits would produce.
(
    X_train_raw, X_test_raw,
    y_train_reg, y_test_reg,
    y_train_class, y_test_class,
) = train_test_split(X, y_reg, y_class, test_size=0.2, random_state=42)

# Scale features.
# Note: one-hot encoded columns are already 0/1, so strictly only the original
# numeric features need scaling; scaling the entire feature set is common
# practice and generally safe.
scaler = StandardScaler()
X_train_reg = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test_reg = scaler.transform(X_test_raw)        # test rows reuse the training statistics

# Both tasks share the same feature matrices; only the targets differ.
X_train_class, X_test_class = X_train_reg, X_test_reg

# Full feature matrix scaled with the training-set statistics, kept for later
# cells that reference X_scaled.
X_scaled = scaler.transform(X)

print("\n--- Data split and scaled successfully. Ready for model building. ---")

**3. Model Building & Evaluation: REGRESSION**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)

print("\n--- Task 1: REGRESSION (Predicting Final Grade) ---")

# Multiple linear regression fitted on the training split
mlr = LinearRegression()
mlr.fit(X=X_train_reg, y=y_train_reg)

In [None]:
# Predict grades for the held-out split
y_pred_reg = mlr.predict(X_test_reg)

# Error metrics on the test split
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

# Assemble the report and emit it in one go
report_lines = [
    "\nMultiple Linear Regression Results:",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"R-squared (R²) Score: {r2:.2f}",
]
print("\n".join(report_lines))

In [None]:
# Scatter of actual vs. predicted grades with a y = x reference line
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_reg, y_pred_reg, alpha=0.7)
ax.plot([0, 20], [0, 20], '--', color='red', lw=2)  # perfect-prediction diagonal
ax.set_title('Actual vs. Predicted Final Grades')
ax.set_xlabel('Actual Grades')
ax.set_ylabel('Predicted Grades')
ax.grid(True)
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

print("\n--- Task 2: CLASSIFICATION (Predicting Pass/Fail) ---")

# Logistic regression: fit on the training split, predict the test split
log_reg = LogisticRegression(random_state=42).fit(X_train_class, y_train_class)
y_pred_log_reg = log_reg.predict(X_test_class)

# Decision tree: same procedure
dec_tree = DecisionTreeClassifier(random_state=42).fit(X_train_class, y_train_class)
y_pred_dec_tree = dec_tree.predict(X_test_class)
# Function to evaluate classification models
def evaluate_classifier(y_true, y_pred, model_name):
    """Print headline metrics and plot the confusion matrix for a binary classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (fail) / 1 (pass).
    y_pred : array-like
        Predicted labels, using the same 0/1 encoding.
    model_name : str
        Name used in the printed header and in the plot title.

    Returns
    -------
    None
        Accuracy, precision, recall and F1 are printed, and a confusion-matrix
        heatmap is shown.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"\n--- {model_name} Results ---")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

    # Plotting Confusion Matrix
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # Fail/Pass tick labels, even if one class is absent from y_true and y_pred.
    class_names = ['Fail', 'Pass']
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
# Report metrics for each fitted classifier, in the same order as training
for clf_label, clf_predictions in (
    ("Logistic Regression", y_pred_log_reg),
    ("Decision Tree", y_pred_dec_tree),
):
    evaluate_classifier(y_test_class, clf_predictions, clf_label)

**4. Insights & Key Factors**

In [None]:
print("\n--- Identifying Key Factors for Student Performance ---")

# The linear model's coefficients could be inspected instead, but with many
# one-hot encoded features the decision tree's importances are easier to read.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': dec_tree.feature_importances_,
})
feature_importances = importance_table.sort_values(by='importance', ascending=False)

top_features = feature_importances.head(10)
print("\nTop 10 Most Important Features (from Decision Tree):")
print(top_features)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

print("\n--- Performing Cross-Validation for Decision Tree ---")
dec_tree_cv = DecisionTreeClassifier(random_state=42)

# Scale inside the pipeline and cross-validate on the raw features X, so the
# scaler is re-fitted on each fold's training portion only. Cross-validating
# on a matrix scaled with all rows would leak held-out statistics into training.
cv_pipeline = make_pipeline(StandardScaler(), dec_tree_cv)
cv_scores = cross_val_score(cv_pipeline, X, y_class, cv=5, scoring='accuracy')

print(f"Accuracy scores for each of the 5 folds: {cv_scores}")
print(f"Average cross-validation accuracy: {cv_scores.mean():.2f}")