In [1]:
import pandas as pd

df = pd.read_csv('StudentsPerformance.csv')

df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [2]:
df.tail()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [4]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [None]:
# Checking missing values
df.isnull().sum() 

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [8]:
df['gender'].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [None]:
# Create lables

# 1. For regression

df['average_score'] = df[['math score', 'reading score', 'writing score']].mean(axis=1)
df.drop(['math score', 'reading score', 'writing score'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   object 
 5   average_score                1000 non-null   float64
dtypes: float64(1), object(5)
memory usage: 47.0+ KB


In [None]:
# 2. For classification
df["pass"] = (df["average_score"] >= 50).astype(int)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   object 
 5   average_score                1000 non-null   float64
 6   pass                         1000 non-null   int32  
dtypes: float64(1), int32(1), object(5)
memory usage: 50.9+ KB


In [47]:
# Spliting into x and y  for regression
X = df.drop(["average_score", "pass"], axis=1)
y_lin = df["average_score"]
y_class = df["pass"]


In [48]:
# TRain- Test Split
from sklearn.model_selection import train_test_split

X_lin_train, X_lin_test, y_lin_train, y_lin_test = train_test_split(X, y_lin, test_size=0.3, random_state=42)

X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(X, y_class, test_size=0.3, random_state=42)


In [49]:
# Bulding pipline 
char_fetures = X.select_dtypes(include=['object', 'string']).columns
num_fetures = X.select_dtypes(include=['int64', 'float64']).columns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
char_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# combine them 
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_fetures),
    ('char', char_pipeline, char_fetures)
])

X_lin_test_prepared = full_pipeline.fit_transform(X_lin_test)
X_lin_train_prepared = full_pipeline.fit_transform(X_lin_train)

X_class_train_prepared = full_pipeline.fit_transform(X_class_train)
X_class_test_train = full_pipeline.fit_transform(X_class_test)

In [51]:
# Training the Model
# 1. regression
from sklearn.linear_model import LinearRegression
reg_model = LinearRegression()
reg_model.fit(X_lin_train_prepared, y_lin_train)

y_lin_pred = reg_model.predict(X_lin_test_prepared)

In [52]:
# Evaluate Regression Model
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

mse_lin = mean_squared_error(y_lin_test, y_lin_pred)
rmse_lin = np.sqrt(mse)
mae_lin = mean_absolute_error(y_lin_test, y_lin_pred)

print(f"RMSE: {rmse_lin:.2f}")
print(f"MAE: {mae_lin:.2f}")


RMSE: 13.27
MAE: 10.57


In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# For regression
scores = cross_val_score(reg_model, X_lin_train_prepared, y_lin_train, cv=5, scoring="accuracy") # type: ignore
print('\n')
print("Cross-validation scores:", scores)
print("Mean Accuracy:", scores.mean())
print("Standard Deviation:", scores.std())



Cross-validation scores: [nan nan nan nan nan]
Mean Accuracy: nan
Standard Deviation: nan


Traceback (most recent call last):
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py", line 123, in _check_targets
    raise ValueError("{0} is not

In [53]:
# classifications 
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X_class_train_prepared, y_class_train) 
y_class_pred = clf.predict(X_class_test_train)


In [54]:
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_class_test, y_class_pred))
print(classification_report(y_class_test, y_class_pred))


[[  0  43]
 [  0 257]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        43
           1       0.86      1.00      0.92       257

    accuracy                           0.86       300
   macro avg       0.43      0.50      0.46       300
weighted avg       0.73      0.86      0.79       300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# For regression
scores = cross_val_score(reg_model, X_lin_train_prepared, y_lin_train, cv=5, scoring="accuracy")
print('\n')
print("Cross-validation scores:", scores)
print("Mean Accuracy:", scores.mean())
print("Standard Deviation:", scores.std())
# For classifier
scores = cross_val_score(clf, X_class_train_prepared, y_class_train, cv=5, scoring="accuracy")
print("Cross-validation scores:", scores)
print("Mean Accuracy:", scores.mean())
print("Standard Deviation:", scores.std())



Cross-validation scores: [0.91428571 0.91428571 0.91428571 0.91428571 0.91428571]
Mean Accuracy: 0.9142857142857143
Standard Deviation: 0.0


Cross-validation scores: [nan nan nan nan nan]
Mean Accuracy: nan
Standard Deviation: nan


Traceback (most recent call last):
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\yuvra\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py", line 123, in _check_targets
    raise ValueError("{0} is not