In [None]:
import os

import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Location of the raw Excel workbook. Set the STUDENT_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing runs behave exactly as before.
DEFAULT_DATA_PATH = r'C:\Users\swetha\Downloads\DATA SCIENTIST\Student marks\Task____students_performance_dataset.xlsx'
data_path = os.environ.get("STUDENT_DATA_PATH", DEFAULT_DATA_PATH)

# Read the workbook into the working frame and show a quick sanity check.
df = pd.read_excel(data_path)
print("Dataset loaded successfully")
print(f"Shape: {df.shape}")
print(f"\nFirst few rows:")
print(df.head())

Dataset loaded successfully
Shape: (300, 20)

First few rows:
  Student_ID  Gender  Study_Hours_per_Week  Attendance_Percentage  \
0      S1000    Male             36.330635              86.628815   
1      S1001  Female              9.582476              73.504255   
2      S1002    Male              5.795795              65.453321   
3      S1003    Male             19.578110              94.381263   
4      S1004    Male             39.426018              46.671849   

   Previous_Sem_Score Parental_Education Internet_Access  Family_Income  \
0           38.182863       Postgraduate              No   32727.500387   
1           95.788249        High School              No   42332.567312   
2           73.939564        High School             Yes   44142.273093   
3           53.443393           Graduate             Yes   64498.434188   
4           39.749045       Postgraduate              No   42105.884774   

  Tutoring_Classes Sports_Activity Extra_Curricular School_Type  Sleep_H

In [None]:
# Quick schema overview: column names, dtype per column, and null counts.
column_names = df.columns.tolist()
column_dtypes = df.dtypes
missing_per_column = df.isnull().sum()

print("\nColumn names:")
print(column_names)
print(f"\nData types:\n{column_dtypes}")
print(f"\nMissing values:\n{missing_per_column}")


Column names:
['Student_ID', 'Gender', 'Study_Hours_per_Week', 'Attendance_Percentage', 'Previous_Sem_Score', 'Parental_Education', 'Internet_Access', 'Family_Income', 'Tutoring_Classes', 'Sports_Activity', 'Extra_Curricular', 'School_Type', 'Sleep_Hours', 'Travel_Time', 'Test_Anxiety_Level', 'Peer_Influence', 'Teacher_Feedback', 'Motivation_Level', 'Library_Usage_per_Week', 'Final_Score']

Data types:
Student_ID                 object
Gender                     object
Study_Hours_per_Week      float64
Attendance_Percentage     float64
Previous_Sem_Score        float64
Parental_Education         object
Internet_Access            object
Family_Income             float64
Tutoring_Classes           object
Sports_Activity            object
Extra_Curricular           object
School_Type                object
Sleep_Hours               float64
Travel_Time               float64
Test_Anxiety_Level        float64
Peer_Influence            float64
Teacher_Feedback           object
Motivation_Leve

In [None]:
# Remove the Student_ID column from the working frame.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
df = df.drop(columns='Student_ID', errors='ignore')
print("Student_ID column dropped")
print(f"Remaining shape: {df.shape}")

Student_ID column dropped
Remaining shape: (300, 19)


In [None]:
# Columns holding string labels; these are integer-encoded later in the notebook.
categorical_cols = [
    'Gender',
    'Parental_Education',
    'Internet_Access',
    'Tutoring_Classes',
    'Sports_Activity',
    'Extra_Curricular',
    'School_Type',
    'Teacher_Feedback',
]

# Continuous feature columns; these are standardised later in the notebook.
numeric_cols = [
    'Study_Hours_per_Week',
    'Attendance_Percentage',
    'Previous_Sem_Score',
    'Family_Income',
    'Sleep_Hours',
    'Travel_Time',
    'Test_Anxiety_Level',
    'Peer_Influence',
    'Motivation_Level',
    'Library_Usage_per_Week',
]

# Report how many columns fall into each group.
print("Categorical columns: {}".format(len(categorical_cols)))
print("Numeric columns: {}".format(len(numeric_cols)))

Categorical columns: 8
Numeric columns: 10


In [None]:
# Integer-encode every categorical column, keeping one fitted encoder per
# column. Reusing a single LabelEncoder across columns overwrites its classes_
# on each fit, so the label -> integer mapping of every column except the last
# would be lost and new data could not be encoded consistently.
label_encoders = {}
for col in categorical_cols:
    # A fresh encoder per column; after the loop `label_encoder` still refers
    # to the encoder of the last column, as it did before.
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder
print("Categorical columns encoded")

Categorical columns encoded


In [None]:
# Standardise the numeric feature columns, overwriting them in the working frame.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so test rows contribute to the scaling
# statistics. Re-running this cell would also refit the scaler on values that
# are already scaled.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

print("Numeric columns scaled")
print("\nScaled data sample:")
print(df.head())

Numeric columns scaled

Scaled data sample:
   Gender  Study_Hours_per_Week  Attendance_Percentage  Previous_Sem_Score  \
0       1              1.357920               0.975685           -1.269179   
1       0             -0.904118               0.223635            1.551475   
2       1             -1.224350              -0.237692            0.481652   
3       1             -0.058807               1.419909           -0.521946   
4       1              1.619691              -1.313890           -1.192491   

   Parental_Education  Internet_Access  Family_Income  Tutoring_Classes  \
0                   2                0      -0.878555                 0   
1                   1                0      -0.491311                 0   
2                   1                1      -0.418350                 0   
3                   0                1       0.402342                 0   
4                   2                0      -0.500450                 0   

   Sports_Activity  Extra_Curricular

In [None]:
# Separate the target column (Final_Score) from the feature matrix.
y = df['Final_Score']
X = df.drop(columns='Final_Score')

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"\nFeature columns: {X.columns.tolist()}")

X shape: (300, 18)
y shape: (300,)

Feature columns: ['Gender', 'Study_Hours_per_Week', 'Attendance_Percentage', 'Previous_Sem_Score', 'Parental_Education', 'Internet_Access', 'Family_Income', 'Tutoring_Classes', 'Sports_Activity', 'Extra_Curricular', 'School_Type', 'Sleep_Hours', 'Travel_Time', 'Test_Anxiety_Level', 'Peer_Influence', 'Teacher_Feedback', 'Motivation_Level', 'Library_Usage_per_Week']


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

Training set: 240 samples
Test set: 60 samples


In [None]:
# Fit an ordinary least-squares regression on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
print("Model trained successfully")

Model trained successfully


In [None]:
# Predict on both splits; y_test_pred is reused by the sample-predictions cell.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Report MAE, RMSE and R² for each split using the same three lines of output.
for heading, actual_values, predicted_values in (
    ("Training Performance:", y_train, y_train_pred),
    ("\nTest Performance:", y_test, y_test_pred),
):
    mae = mean_absolute_error(actual_values, predicted_values)
    rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))
    r2 = r2_score(actual_values, predicted_values)
    print(heading)
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²: {r2:.4f}")

Training Performance:
MAE: 4.0420
RMSE: 4.9818
R²: 0.7505

Test Performance:
MAE: 5.1840
RMSE: 6.3486
R²: 0.5706


In [None]:
# Show a handful of actual-vs-predicted pairs from the test split.
print("Sample Predictions:")
print("-" * 50)

# Derive evenly spaced positions from the test-set size. The previous fixed
# indices (0, 50, 100, 150, 200) mostly fell outside the test split and were
# silently skipped, so fewer samples than intended were printed.
n_samples_to_show = 5
step = max(1, len(y_test) // n_samples_to_show)
sample_indices = list(range(0, len(y_test), step))[:n_samples_to_show]

for idx in sample_indices:
    # Both lookups are positional: y_test_pred is ordered like y_test.
    pred = y_test_pred[idx]
    actual = y_test.iloc[idx]
    print(f"Actual: {actual:.2f} | Predicted: {pred:.2f} | Error: {abs(actual-pred):.2f}")

Sample Predictions:
--------------------------------------------------
Actual: 55.90 | Predicted: 44.69 | Error: 11.21
Actual: 40.09 | Predicted: 32.76 | Error: 7.33


In [None]:
# Persist the fitted model together with the preprocessing objects, each to its
# own file in the current working directory.
artifacts = {
    "student_model.pkl": model,
    "student_scaler.pkl": scaler,
    "student_feature_order.pkl": X.columns,
    "student_numeric_columns.pkl": numeric_cols,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
print("Model and preprocessing objects saved successfully!")

Model and preprocessing objects saved successfully!
