In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Evaluation
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# If it's a regular CSV
df = pd.read_csv('/StudentPerformanceFactors.csv')

# If it's like the UCI dataset (semicolon-separated)
# df = pd.read_csv('student-mat.csv', sep=';')

print(df.head())

   Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0             23          84                  Low                High   
1             19          64                  Low              Medium   
2             24          98               Medium              Medium   
3             29          89                  Low              Medium   
4             19          92               Medium              Medium   

  Extracurricular_Activities  Sleep_Hours  Previous_Scores Motivation_Level  \
0                         No            7               73              Low   
1                         No            8               59              Low   
2                        Yes            7               91           Medium   
3                        Yes            8               98           Medium   
4                        Yes            6               65           Medium   

  Internet_Access  Tutoring_Sessions Family_Income Teacher_Quality  \
0             Ye

In [5]:
# See column names, non-null counts, and data types (e.g., object, int64)
print(df.info())

# Get statistical summary for numerical columns (count, mean, std, etc.)
print(df.describe())

# Check for missing values in each column
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [8]:
# Clean the single data-entry error (score > 100)
df_cleaned = df[df['Exam_Score'] <= 100].copy()

# Define features (X) and target (y)
X = df_cleaned.drop('Exam_Score', axis=1)
y = df_cleaned['Exam_Score']

In [9]:
# Identify numerical and categorical columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

print(f"Numerical Features: {list(numerical_features)}")
print(f"Categorical Features: {list(categorical_features)}")

Numerical Features: ['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Tutoring_Sessions', 'Physical_Activity']
Categorical Features: ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']


In [10]:
# Create a pipeline for numerical features
# (We use StandardScaler to put all numbers on the same scale)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create a pipeline for categorical features
# 1. SimpleImputer: Fills missing data (like in 'Teacher_Quality') with the most frequent value
# 2. OneHotEncoder: Converts text ('Gender', 'School_Type', etc.) into 0s and 1s
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine these pipelines into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply the preprocessor
# .fit_transform() learns from the training data
X_train_processed = preprocessor.fit_transform(X_train)

# .transform() applies the *same* rules to the test data
X_test_processed = preprocessor.transform(X_test)

# Check the results. The number of columns will be much larger after one-hot encoding!
print(f"Original X_train shape: {X_train.shape}")
print(f"Processed X_train shape: {X_train_processed.shape}")
print(f"Original X_test shape: {X_test.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")

Original X_train shape: (5284, 19)
Processed X_train shape: (5284, 40)
Original X_test shape: (1322, 19)
Processed X_test shape: (1322, 40)


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1. Initialize the model
linear_model = LinearRegression()

# 2. Train the model on the processed training data
print("Training Linear Regression model...")
linear_model.fit(X_train_processed, y_train)

print("Linear Regression trained!")

Training Linear Regression model...
Linear Regression trained!


In [13]:
from sklearn.ensemble import RandomForestRegressor

# 1. Initialize the model
# n_estimators=100 means it will build 100 decision trees
# random_state=42 ensures you get the same results every time you run it
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# 2. Train the model
# n_jobs=-1 tells Colab to use all available CPU cores to speed up training
print("\nTraining Random Forest model (this may take a moment)...")
rf_model.fit(X_train_processed, y_train)

print("Random Forest trained!")


Training Random Forest model (this may take a moment)...
Random Forest trained!


In [15]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# 1. Make predictions with both models
y_pred_linear = linear_model.predict(X_test_processed)
y_pred_rf = rf_model.predict(X_test_processed)

# 2. Evaluate Linear Regression
print("\n--- Linear Regression Results ---")
mse_linear = mean_squared_error(y_test, y_pred_linear)  # Calculate MSE
rmse_linear = np.sqrt(mse_linear)                      # Calculate RMSE from MSE
r2_linear = r2_score(y_test, y_pred_linear)

print(f"Mean Squared Error (MSE): {mse_linear:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_linear:.2f}")
print(f"R-squared (R2): {r2_linear:.2f}")

# 3. Evaluate Random Forest
print("\n--- Random Forest Results ---")
mse_rf = mean_squared_error(y_test, y_pred_rf)  # Calculate MSE
rmse_rf = np.sqrt(mse_rf)                      # Calculate RMSE from MSE
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.2f}")
print(f"R-squared (R2): {r2_rf:.2f}")


--- Linear Regression Results ---
Mean Squared Error (MSE): 2.31
Root Mean Squared Error (RMSE): 1.52
R-squared (R2): 0.82

--- Random Forest Results ---
Mean Squared Error (MSE): 3.87
Root Mean Squared Error (RMSE): 1.97
R-squared (R2): 0.71


In [16]:
import pandas as pd

# 1. Get all the feature names from the preprocessor
# -----------------------------------------------------------------
# Get the numerical feature names (they don't change)
num_features = numerical_features.tolist()

# Get the new one-hot encoded categorical feature names
cat_features = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
cat_features = cat_features.tolist()

# Combine them in the same order the model saw them
all_feature_names = num_features + cat_features


# 2. Create a DataFrame of features and their coefficients
# -----------------------------------------------------------------
# Get the coefficients from the trained linear model
coefficients = linear_model.coef_

# Create a new DataFrame to see them clearly
feature_importance = pd.DataFrame({
    'Feature': all_feature_names,
    'Coefficient': coefficients
})

# Calculate the absolute value of the coefficient for sorting
feature_importance['Absolute_Coefficient'] = feature_importance['Coefficient'].abs()

# Sort by the absolute value to see the most impactful features
feature_importance = feature_importance.sort_values(by='Absolute_Coefficient', ascending=False)


# 3. Display the results
# -----------------------------------------------------------------
print("--- Most Impactful Features (Top 10) ---")
print(feature_importance.head(10))

print("\n--- Top 10 Features that INCREASE Exam Score ---")
print(feature_importance.sort_values(by='Coefficient', ascending=False).head(10))

print("\n--- Top 10 Features that DECREASE Exam Score ---")
print(feature_importance.sort_values(by='Coefficient', ascending=True).head(10))

--- Most Impactful Features (Top 10) ---
                      Feature  Coefficient  Absolute_Coefficient
1                  Attendance     2.284465              2.284465
0               Hours_Studied     1.743948              1.743948
10    Access_to_Resources_Low    -1.050347              1.050347
9    Access_to_Resources_High     1.044108              1.044108
6   Parental_Involvement_High     1.034178              1.034178
7    Parental_Involvement_Low    -0.985444              0.985444
3             Previous_Scores     0.703278              0.703278
4           Tutoring_Sessions     0.594487              0.594487
14      Motivation_Level_High     0.546707              0.546707
15       Motivation_Level_Low    -0.533469              0.533469

--- Top 10 Features that INCREASE Exam Score ---
                      Feature  Coefficient  Absolute_Coefficient
1                  Attendance     2.284465              2.284465
0               Hours_Studied     1.743948              1.743948

In [17]:
import joblib

# 1. Save the Linear Regression model
joblib.dump(linear_model, 'student_performance_model.joblib')

# 2. Save the Preprocessor
joblib.dump(preprocessor, 'student_data_preprocessor.joblib')

print("Model and preprocessor saved successfully!")

Model and preprocessor saved successfully!


In [18]:
import joblib
import pandas as pd

# 1. Load the saved model and preprocessor
loaded_model = joblib.load('student_performance_model.joblib')
loaded_preprocessor = joblib.load('student_data_preprocessor.joblib')

print("Model and preprocessor loaded!")

# 2. Define a new student's data
# This must be a DataFrame because the preprocessor expects it
# I'm using the original 19 feature columns (before Exam_Score)
new_student_data = pd.DataFrame({
    'Hours_Studied': [18],
    'Attendance': [85],
    'Parental_Involvement': ['High'],
    'Access_to_Resources': ['Good'],
    'Extracurricular_Activities': ['Yes'],
    'Sleep_Hours': [7],
    'Previous_Scores': [82],
    'Motivation_Level': ['High'],
    'Internet_Access': ['Yes'],
    'Tutoring_Sessions': [1],
    'Family_Income': ['Medium'],
    'Teacher_Quality': ['Good'],
    'School_Type': ['Public'],
    'Peer_Influence': ['Positive'],
    'Physical_Activity': [3],
    'Learning_Disabilities': ['No'],
    'Parental_Education_Level': ["Bachelor's"],
    'Distance_from_Home': ['Short'],
    'Gender': ['Female']
})

# 3. Process the new data using the LOADED preprocessor
new_student_processed = loaded_preprocessor.transform(new_student_data)

# 4. Make a prediction with the LOADED model
prediction = loaded_model.predict(new_student_processed)

print(f"\nPredicted Exam Score for the new student: {prediction[0]:.2f}")

Model and preprocessor loaded!

Predicted Exam Score for the new student: 69.95
