In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [60]:
df=pd.read_csv('C:/Users/san/Downloads/StudentsPerformance.csv')

In [61]:
display(df.head(2))
display(df.shape)
display(df.info())
display(df.describe())


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88


(1000, 8)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


None

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [62]:
#Standardize column names for easier coding:d
    
df.columns = df.columns.str.lower().str.replace(" ", "_")
df.columns

Index(['gender', 'race/ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')

In [63]:
#find avg score and from that try to find the performance level of each student --Feature engineering

df['average_score'] = df[['math_score', 'reading_score', 'writing_score']].mean(axis=1)

df['performance_level'] = pd.cut(df['average_score'],
                                 bins=[0, 60, 80, 100],
                                 labels=['Low', 'Medium', 'High'])

In [64]:
df.head(2)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,average_score,performance_level
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667,Medium
1,female,group C,some college,standard,completed,69,90,88,82.333333,High


In [65]:
df=df.drop('average_score',axis=1)
df.head(2)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,performance_level
0,female,group B,bachelor's degree,standard,none,72,72,74,Medium
1,female,group C,some college,standard,completed,69,90,88,High


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       1000 non-null   object  
 1   race/ethnicity               1000 non-null   object  
 2   parental_level_of_education  1000 non-null   object  
 3   lunch                        1000 non-null   object  
 4   test_preparation_course      1000 non-null   object  
 5   math_score                   1000 non-null   int64   
 6   reading_score                1000 non-null   int64   
 7   writing_score                1000 non-null   int64   
 8   performance_level            1000 non-null   category
dtypes: category(1), int64(3), object(5)
memory usage: 63.7+ KB


In [None]:
cat_cols=[feat for feat in df.columns if df[feat].dtypes!='int64']

for i in cat_cols:
    
    print(f"the unique values of {i} is {df[i].unique()}")

the unique values of gender is ['female' 'male']
the unique values of race/ethnicity is ['group B' 'group C' 'group A' 'group D' 'group E']
the unique values of parental_level_of_education is ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
the unique values of lunch is ['standard' 'free/reduced']
the unique values of test_preparation_course is ['none' 'completed']
the unique values of performance_level is ['Medium', 'High', 'Low']
Categories (3, object): ['Low' < 'Medium' < 'High']


In [68]:
y=df['performance_level']
X=df.drop('performance_level',axis=1)

In [69]:
Xtr,Xte,ytr,yte=train_test_split(X,y,test_size=0.25,random_state=42)

In [70]:
cat_cols=[feat for feat in df.columns if df[feat].dtypes=='O']
num_cols=[feat for feat in df.columns if df[feat].dtypes=='int64']
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols),
                  ('scaler', StandardScaler(), num_cols)],
    remainder='passthrough')

In [16]:
cat_cols

['gender',
 'race/ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [29]:
num_cols

['math_score', 'reading_score', 'writing_score']

In [71]:
from sklearn.pipeline import Pipeline
model = Pipeline([
    ('preprocess', ct),
    ('classifier',RandomForestClassifier())
])

In [72]:
model.fit(Xtr,ytr)
ypred=model.predict(Xte)

acc_score=accuracy_score(yte,ypred)
print(f"acc score {acc_score}")
print(f"classification report {classification_report(yte,ypred)}")

acc score 0.988
classification report               precision    recall  f1-score   support

        High       1.00      1.00      1.00        39
         Low       0.96      1.00      0.98        78
      Medium       1.00      0.98      0.99       133

    accuracy                           0.99       250
   macro avg       0.99      0.99      0.99       250
weighted avg       0.99      0.99      0.99       250



In [73]:
test_data = pd.DataFrame({
'gender' : ['Female','Male','Male','Female','Male'],
'race/ethnicity' : ['group B','group C','group A','group D','group E'], 
'parental_level_of_education' : ["bachelor's degree", 'some college', "master's degree" ,"associate's degree",'high school'],
'lunch' : ['standard','free/reduced','standard','free/reduced','standard'],
'test_preparation_course': ['none','completed','completed','none','completed'],
'math_score' : [45,78,23,98,67],
'reading_score' : [34,56,78,90,86],
'writing_score' : [65,75,83,25,97],

})
test_data

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,Female,group B,bachelor's degree,standard,none,45,34,65
1,Male,group C,some college,free/reduced,completed,78,56,75
2,Male,group A,master's degree,standard,completed,23,78,83
3,Female,group D,associate's degree,free/reduced,none,98,90,25
4,Male,group E,high school,standard,completed,67,86,97


In [74]:
ypred1=model.predict(test_data)
print(f" the performance values for the above given test data are: {ypred1}")

 the performance values for the above given test data are: ['Low' 'Medium' 'Medium' 'Medium' 'High']


In [21]:
import joblib

# 💾 Save the trained pipeline (preprocessing + model)
joblib.dump(model, 'stud_model1.joblib')
print("✅ Model saved!")

✅ Model saved!


In [22]:
# 📦 Load the saved model
loaded_model = joblib.load('stud_model1.joblib')

In [None]:
test_data1= pd.DataFrame({
'gender' : ['Female'],
'race/ethnicity' : ['group B'], 
'parental_level_of_education' : ["bachelor's degree"],
'lunch' : ['standard'],
'test_preparation_course': ['completed'],
'math_score' : [67],
'reading_score' : [34],
'writing_score' : [97],

})

In [24]:
prediction=loaded_model.predict(test_data1)

print(f"prediction from loaded model {prediction}")

prediction from loaded model ['Medium']


In [25]:
import pickle

# 🎯 Save model as .pkl
with open('stud_modelpkl.pkl', 'wb') as f:
    pickle.dump(model, f)

print("✅ Model saved as .pkl!")

✅ Model saved as .pkl!


In [26]:
# 📦 Load the saved model
with open('stud_modelpkl.pkl', 'rb') as file:
    loaded_model_pickl = pickle.load(file)

In [27]:
test_data2= pd.DataFrame({
'gender' : ['male'],
'race/ethnicity' : ['group D'], 
'parental_level_of_education' : ["bachelor's degree"],
'lunch' : ['standard'],
'test_preparation_course': ['completed'],
'math_score' : [78],
'reading_score' : [79],
'writing_score' : [97],

})

In [28]:
prediction=loaded_model.predict(test_data2)

print(f"prediction from loaded model {prediction}")

prediction from loaded model ['Medium']
