In [2]:
# Analysis stack for this notebook: pandas for the tabular data; numpy and
# matplotlib are imported here for numeric work and plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
import warnings
# NOTE(review): this suppresses every warning category for the whole kernel
# session, including deprecation and convergence warnings from libraries.
# Consider narrowing it to the specific category/module that is noisy.
warnings.filterwarnings('ignore')

In [4]:
# Read the student scores CSV into the working frame; the path is resolved
# relative to the directory the notebook kernel was started in.
df = pd.read_csv(filepath_or_buffer='data/stud.csv')

In [5]:
# Preview the first five rows to sanity-check the column names and values.
df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [8]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [7]:
# Number of fully duplicated rows; the first occurrence of each row is not
# counted as a duplicate.
df.duplicated(keep='first').sum()

0

In [8]:
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [4]:
# Frequency of each gender value in the data set.
df.loc[:, 'gender'].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [10]:
# Print the distinct values of each categorical column, one labelled line per
# column, in the same order as the columns appear in the frame.
for label, column in [
    ("gender catogories", 'gender'),
    ("race catogories", 'race_ethnicity'),
    ("parents edu catogories", 'parental_level_of_education'),
    ("lunch catogories", 'lunch'),
    ("test preparation catogories", 'test_preparation_course'),
]:
    print(label, df[column].unique())


gender catogories ['female' 'male']
race catogories ['group B' 'group C' 'group A' 'group D' 'group E']
parents edu catogories ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
lunch catogories ['standard' 'free/reduced']
test preparation catogories ['none' 'completed']


In [5]:
# Partition the columns by dtype: numeric columns go to the scaling pipeline,
# every other column is treated as categorical.
# select_dtypes is used instead of comparing the dtype to 'O' so that text
# columns are still classified as categorical when pandas stores them with a
# dedicated string dtype rather than object.
numerical_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(exclude='number').columns.tolist()

# The labels promise a count ("no of"), so report the count and then the names.
print(f"no of numerical features {len(numerical_features)}: {numerical_features}")
print(f"no of categorical features {len(categorical_features)}: {categorical_features}")

no of numerical features ['math_score', 'reading_score', 'writing_score']
no of categorical features [['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']]


In [12]:
# Derive two per-student aggregates from the three subject scores:
# the summed score and its mean over the three subjects.
math_plus_reading = df['math_score'].add(df['reading_score'])
df['total_score'] = math_plus_reading.add(df['writing_score'])
df['average'] = df['total_score'].div(3)
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [13]:
# Count the students with a reading score of exactly 100 by counting the
# non-null 'average' values among those rows.
perfect_reading = df['reading_score'].eq(100)
toper = df.loc[perfect_reading, 'average'].count()
toper

17

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: fill missing values with the column median, then
# standardise each column.
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent level,
# one-hot encode, then scale each indicator column by its standard deviation.
cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("OneHotEncoder", OneHotEncoder()),
        # OneHotEncoder returns a sparse matrix by default. with_mean=False is
        # what allows StandardScaler to accept it: centering a sparse matrix
        # is not supported, so only the variance scaling is applied.
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

# Route each column group through its pipeline. Columns not listed in either
# group are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipe, numerical_features),
        ("cat_pipeline", cat_pipe, categorical_features),
    ],
    # Always return a dense array. With the default threshold (0.3) the result
    # silently becomes a sparse matrix whenever the stacked output is sparse
    # enough, which would break wrapping `t` in a labelled DataFrame.
    sparse_threshold=0,
)

# Fit on the full frame and keep the transformed feature matrix.
t = preprocessor.fit_transform(df)


In [18]:
# Show the transformed matrix as a DataFrame whose columns carry the feature
# names generated by the fitted preprocessor.
pd.DataFrame(data=t, columns=preprocessor.get_feature_names_out())

Unnamed: 0,num_pipeline__math_score,num_pipeline__reading_score,num_pipeline__writing_score,cat_pipeline__gender_female,cat_pipeline__gender_male,cat_pipeline__race_ethnicity_group A,cat_pipeline__race_ethnicity_group B,cat_pipeline__race_ethnicity_group C,cat_pipeline__race_ethnicity_group D,cat_pipeline__race_ethnicity_group E,cat_pipeline__parental_level_of_education_associate's degree,cat_pipeline__parental_level_of_education_bachelor's degree,cat_pipeline__parental_level_of_education_high school,cat_pipeline__parental_level_of_education_master's degree,cat_pipeline__parental_level_of_education_some college,cat_pipeline__parental_level_of_education_some high school,cat_pipeline__lunch_free/reduced,cat_pipeline__lunch_standard,cat_pipeline__test_preparation_course_completed,cat_pipeline__test_preparation_course_none
0,0.390024,0.193999,0.391492,2.001297,0.000000,0.00000,2.549064,0.000000,0.000000,0.000000,0.000000,3.099736,0.000000,0.000000,0.000000,0.0,0.000000,2.089806,0.000000,2.085888
1,0.192076,1.427476,1.313269,2.001297,0.000000,0.00000,0.000000,2.145513,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.390976,0.0,0.000000,2.089806,2.085888,0.000000
2,1.577711,1.770109,1.642475,2.001297,0.000000,0.00000,2.549064,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.244037,0.000000,0.0,0.000000,2.089806,0.000000,2.085888
3,-1.259543,-0.833899,-1.583744,0.000000,2.001297,3.51193,0.000000,0.000000,0.000000,0.000000,2.406211,0.000000,0.000000,0.000000,0.000000,0.0,2.089806,0.000000,0.000000,2.085888
4,0.653954,0.605158,0.457333,0.000000,2.001297,0.00000,0.000000,2.145513,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.390976,0.0,0.000000,2.089806,0.000000,2.085888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.445746,2.044215,1.774157,2.001297,0.000000,0.00000,0.000000,0.000000,0.000000,2.881952,0.000000,0.000000,0.000000,4.244037,0.000000,0.0,0.000000,2.089806,2.085888,0.000000
996,-0.269803,-0.970952,-0.859491,0.000000,2.001297,0.00000,0.000000,2.145513,0.000000,0.000000,0.000000,0.000000,2.519091,0.000000,0.000000,0.0,2.089806,0.000000,0.000000,2.085888
997,-0.467751,0.125472,-0.201079,2.001297,0.000000,0.00000,0.000000,2.145513,0.000000,0.000000,0.000000,0.000000,2.519091,0.000000,0.000000,0.0,2.089806,0.000000,2.085888,0.000000
998,0.126093,0.605158,0.589015,2.001297,0.000000,0.00000,0.000000,0.000000,2.274161,0.000000,0.000000,0.000000,0.000000,0.000000,2.390976,0.0,0.000000,2.089806,2.085888,0.000000
