## Data Prep 

In [1]:
import pandas as pd
import numpy as np

# Fix the seed of NumPy's global random number generator.
SEED = 42
np.random.seed(SEED)

In [3]:
# Load the raw dataset from a CSV file in the working directory.
csv_path = 'high_school.csv'
data = pd.read_csv(csv_path)

In [5]:
# Preview the first rows of the loaded data.
data.head()

Unnamed: 0,Gender,ParentEdu,ParentMaritalStatus,ExtraCurricular,IsFirstChild,Siblings,Transportation,AvgReadingScore,AvgWritingScore,traveltime,studytime,internet,freetime,absences,CGPA
0,female,bachelor's degree,married,regularly,yes,3.0,school_bus,71,74,2,2,no,3,6,C
1,female,some college,married,sometimes,yes,0.0,,90,88,1,2,yes,3,4,D
2,female,master's degree,single,sometimes,yes,4.0,school_bus,93,91,1,2,yes,3,10,B
3,male,associate's degree,married,never,no,1.0,,56,42,1,3,yes,2,2,F
4,male,some college,married,sometimes,yes,0.0,school_bus,78,75,1,2,no,3,4,C


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The seed is passed explicitly so the
# partition does not depend on the current state of NumPy's global RNG:
# re-running this cell, or running cells out of order, always yields the
# same train/test split.
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Separate the label column from the feature columns for both splits.
target_column = 'CGPA'

train_y, test_y = train[target_column], test[target_column]

train_inputs = train.drop(columns=[target_column])
test_inputs = test.drop(columns=[target_column])

In [8]:
# Show the dtype of every feature column (the next cell groups columns by dtype).
train_inputs.dtypes

Gender                  object
ParentEdu               object
ParentMaritalStatus     object
ExtraCurricular         object
IsFirstChild            object
Siblings               float64
Transportation          object
AvgReadingScore          int64
AvgWritingScore          int64
traveltime               int64
studytime                int64
internet                object
freetime                 int64
absences                 int64
dtype: object

In [9]:
# Columns with a numeric dtype
numeric_frame = train_inputs.select_dtypes(include='number')
numeric_columns = numeric_frame.columns.to_list()

# Columns with the object dtype, treated as categorical
categorical_frame = train_inputs.select_dtypes(include=['object'])
categorical_columns = categorical_frame.columns.to_list()

In [10]:
# Numeric features: impute missing values with the median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)


In [11]:
# Categorical features: replace missing values with the constant 'unknown',
# then one-hot encode (handle_unknown='ignore' for categories not seen in fit).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [12]:
# Send each column group through its own pipeline; columns that belong to
# neither group are dropped (remainder='drop').
column_pipelines = [
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_pipelines, remainder='drop')

# Note: remainder='passthrough' is an optional alternative; you don't have to use it.

In [13]:
# Fit the preprocessor on the training inputs and transform them in one call.
train_x = preprocessor.fit_transform(train_inputs)

# Display the transformed training matrix.
train_x

array([[-0.07311637, -1.35881531, -1.65738705, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.62359018, -0.26725329, -0.04930921, ...,  0.        ,
         0.        ,  1.        ],
       [-0.76982292, -0.54014379, -0.56389412, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.76982292,  0.41497297,  0.46527569, ...,  0.        ,
         0.        ,  1.        ],
       [-1.46652948, -1.97281894, -1.40009459, ...,  1.        ,
         0.        ,  1.        ],
       [-0.76982292,  0.41497297,  0.27230635, ...,  0.        ,
         0.        ,  1.        ]])

In [14]:
# Transform the test inputs with the preprocessor fitted on the training data
# (transform only; the preprocessor is not refitted here).
test_x = preprocessor.transform(test_inputs)

# Display the transformed test matrix.
test_x

array([[-0.07311637,  1.43831236,  0.91553748, ...,  0.        ,
         0.        ,  1.        ],
       [-0.07311637,  0.55141822,  0.07933701, ...,  0.        ,
         0.        ,  1.        ],
       [-0.07311637, -0.33547592,  0.0150139 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.62359018, -2.17748682, -2.17197195, ...,  1.        ,
         0.        ,  1.        ],
       [-0.76982292, -1.56348318, -1.65738705, ...,  0.        ,
         0.        ,  1.        ],
       [-0.76982292, -1.42703793, -1.20712525, ...,  0.        ,
         0.        ,  1.        ]])

## Find the Baseline 

In [15]:
from sklearn.dummy import DummyClassifier

# Baseline model: configured with the "most_frequent" strategy.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X=train_x, y=train_y)

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
# Accuracy of the dummy baseline on the training set
dummy_train_pred = dummy_clf.predict(X=train_x)
baseline_train_acc = accuracy_score(y_true=train_y, y_pred=dummy_train_pred)

print(f'Baseline Train Accuracy: {baseline_train_acc}')

Baseline Train Accuracy: 0.33293124246079614


In [18]:
# Accuracy of the dummy baseline on the held-out test set
dummy_test_pred = dummy_clf.predict(X=test_x)
baseline_test_acc = accuracy_score(y_true=test_y, y_pred=dummy_test_pred)

print(f'Baseline Test Accuracy: {baseline_test_acc}')

Baseline Test Accuracy: 0.3263009845288326


## SVM Model 1:

In [176]:
from sklearn.svm import SVC
# SVM model 1: linear kernel with C=15.
svm_clf = SVC(
    C=15,
    kernel="linear",
    decision_function_shape='ovr',
)
svm_clf.fit(X=train_x, y=train_y)

In [177]:
# Linear SVM: predictions and accuracy on the training set
train_y_pred = svm_clf.predict(X=train_x)
accuracy_score(y_true=train_y, y_pred=train_y_pred)

0.6827503015681544

In [178]:
# Linear SVM: predictions and accuracy on the test set
test_y_pred = svm_clf.predict(X=test_x)
accuracy_score(y_true=test_y, y_pred=test_y_pred)

0.6343178621659634

In [33]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the linear SVM on the test set (we usually create the
# confusion matrix on the test set). Predictions are recomputed from svm_clf
# instead of being read from the shared test_y_pred variable, which the other
# model sections overwrite; otherwise this cell can silently report a
# different model's predictions when cells are run out of order.
confusion_matrix(test_y, svm_clf.predict(test_x))

array([[ 37,  14,   1,   0,   0],
       [ 12,  41,  34,   2,   1],
       [  2,  23,  87,  37,   2],
       [  0,   3,  51,  92,  40],
       [  0,   0,   1,  39, 192]])

## SVM Model 2:

In [95]:
# SVM model 2: degree-3 polynomial kernel with coef0=1 and C=10.
pol_svm2 = SVC(
    C=10,
    kernel="poly",
    degree=3,
    coef0=1,
    decision_function_shape='ovr',
)
pol_svm2.fit(X=train_x, y=train_y)

In [96]:
# Polynomial SVM: predictions and accuracy on the training set
train_y_pred = pol_svm2.predict(X=train_x)
accuracy_score(y_true=train_y, y_pred=train_y_pred)

0.9728588661037394

In [97]:
# Polynomial SVM: predictions and accuracy on the test set
test_y_pred = pol_svm2.predict(X=test_x)
accuracy_score(y_true=test_y, y_pred=test_y_pred)

0.5246132208157525

In [87]:
# Confusion matrix of the polynomial SVM on the test set. Predictions are
# recomputed from pol_svm2 rather than read from the shared test_y_pred
# variable, which the other model sections overwrite, so the matrix always
# corresponds to this model.
confusion_matrix(test_y, pol_svm2.predict(test_x))

array([[ 22,  22,   8,   0,   0],
       [  5,  37,  42,   5,   1],
       [  0,  10,  96,  42,   3],
       [  0,   0,  60,  80,  46],
       [  0,   0,   2,  34, 196]])

In [152]:
# Refit the polynomial SVM with a much smaller C (0.055); this rebinds pol_svm2.
pol_svm2 = SVC(
    C=0.055,
    kernel="poly",
    degree=3,
    coef0=1,
    decision_function_shape='ovr',
)
pol_svm2.fit(X=train_x, y=train_y)

In [153]:
# Polynomial SVM (C=0.055): training accuracy
train_y_pred = pol_svm2.predict(X=train_x)
accuracy_score(y_true=train_y, y_pred=train_y_pred)

0.6899879372738239

In [154]:
# Polynomial SVM (C=0.055): test accuracy
test_y_pred = pol_svm2.predict(X=test_x)
accuracy_score(y_true=test_y, y_pred=test_y_pred)

0.6118143459915611

## SVM Model 3:

In [54]:
# SVM model 3: RBF kernel with C=10 and gamma=0.1.
rbf_svm = SVC(
    C=10,
    gamma=0.1,
    kernel="rbf",
    decision_function_shape='ovr',
)
rbf_svm.fit(X=train_x, y=train_y)

In [55]:
# RBF SVM: predictions and accuracy on the training set
train_y_pred = rbf_svm.predict(X=train_x)
accuracy_score(y_true=train_y, y_pred=train_y_pred)

0.974065138721351

In [56]:
# RBF SVM: predictions and accuracy on the test set
test_y_pred = rbf_svm.predict(X=test_x)
accuracy_score(y_true=test_y, y_pred=test_y_pred)

0.5330520393811533

In [57]:
# Confusion matrix of the RBF SVM on the test set. Predictions are recomputed
# from rbf_svm rather than read from the shared test_y_pred variable, which
# the other model sections overwrite, so the matrix always corresponds to
# this model.
confusion_matrix(test_y, rbf_svm.predict(test_x))

array([[ 29,  20,   3,   0,   0],
       [ 15,  35,  27,  13,   0],
       [  5,  25,  72,  45,   4],
       [  1,   8,  67,  65,  45],
       [  0,   0,   9,  45, 178]])

In [170]:
# Refit the RBF SVM with smaller C (1.5) and gamma (0.01); this rebinds rbf_svm.
rbf_svm = SVC(
    C=1.5,
    gamma=0.01,
    kernel="rbf",
    decision_function_shape='ovr',
)
rbf_svm.fit(X=train_x, y=train_y)

In [171]:
# RBF SVM (C=1.5, gamma=0.01): training accuracy
train_y_pred = rbf_svm.predict(X=train_x)
accuracy_score(y_true=train_y, y_pred=train_y_pred)

0.6483715319662243

In [172]:
# RBF SVM (C=1.5, gamma=0.01): test accuracy
test_y_pred = rbf_svm.predict(X=test_x)
accuracy_score(y_true=test_y, y_pred=test_y_pred)

0.5921237693389592

In [179]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the RBF-kernel SVM.
param_grid = [
    # try 4 (2x2) combinations of hyperparameters: 2 values of C x 2 values of gamma
    {'C': [.5, 5], 
     'gamma': [0.01, 0.05]}
  ]

# NOTE: this rebinds rbf_svm to a new SVC instance, replacing the model that
# was bound to this name by the earlier cells.
rbf_svm = SVC(kernel="rbf", decision_function_shape='ovr')

# cv=10: train across 10 folds, that's a total of 4*10=40 rounds of training
# (not counting any final refit on the full training set)
grid_search = GridSearchCV(rbf_svm, param_grid, cv=10,
                           scoring='accuracy', return_train_score=True)

grid_search.fit(train_x, train_y)

In [None]:
# Show the hyperparameter combination selected by the grid search.
grid_search.best_params_

## SGD Model 1:

In [59]:
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

In [188]:
# SGD model 1: L2 (Ridge) penalty, trained on the preprocessed features.
sgd_model_1 = SGDClassifier(
    penalty='l2',
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)
sgd_model_1.fit(X=train_x, y=train_y)


In [189]:

# SGD model 1: training accuracy
train_y_pred_sgd1 = sgd_model_1.predict(X=train_x)
accuracy_score(y_true=train_y, y_pred=train_y_pred_sgd1)


0.5615199034981906

In [190]:
# SGD model 1: test accuracy
test_y_pred_sgd1 = sgd_model_1.predict(X=test_x)
accuracy_score(y_true=test_y, y_pred=test_y_pred_sgd1)

0.559774964838256

## SGD Model 2:

In [191]:
# Expand the preprocessed features with degree-2 polynomial terms (no bias
# column). The expansion is fitted on the training matrix and then applied
# unchanged to the test matrix.
poly = PolynomialFeatures(include_bias=False, degree=2)
train_x_poly, test_x_poly = poly.fit_transform(train_x), poly.transform(test_x)


In [192]:
# SGD model 2: L1 penalty, trained on the polynomial features.
sgd_model_2 = SGDClassifier(
    penalty='l1',
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)
sgd_model_2.fit(X=train_x_poly, y=train_y)

In [193]:
# SGD model 2 (L1): training accuracy on the polynomial features
train_y_pred_sgd2 = sgd_model_2.predict(X=train_x_poly)
accuracy_score(y_true=train_y, y_pred=train_y_pred_sgd2)

0.8317249698431846

In [194]:
# SGD model 2 (L1): test accuracy on the polynomial features
test_y_pred_sgd2 = sgd_model_2.predict(X=test_x_poly)
accuracy_score(y_true=test_y, y_pred=test_y_pred_sgd2)

0.5344585091420534

In [211]:
from sklearn.model_selection import GridSearchCV

# Search grid for the SGD classifier on the polynomial features:
# 2 penalties x 3 alphas x 3 iteration caps x 2 tolerances = 36 candidates.
param_grid = dict(
    penalty=['l1', 'l2'],
    alpha=[0.0001, 0.001, 0.01],  # regularization strength
    max_iter=[500, 1000, 1500],
    tol=[1e-3, 1e-2],
)

# 5-fold cross-validated search; this rebinds grid_search and param_grid.
grid_search = GridSearchCV(
    estimator=SGDClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X=train_x_poly, y=train_y)

# Selected hyperparameter combination
grid_search.best_params_

{'alpha': 0.001, 'max_iter': 500, 'penalty': 'l1', 'tol': 0.01}

In [230]:
# Elastic-net SGD classifier on the polynomial features. The same estimator
# object is bound to both sgd_model_elasticnet and sgd_model_2 (the latter
# replaces the L1 model previously held under that name).
sgd_model_elasticnet = SGDClassifier(
    penalty='elasticnet',
    alpha=0.075,
    l1_ratio=0.3,
    max_iter=1000,
    tol=0.01,
    random_state=42,
)
sgd_model_2 = sgd_model_elasticnet
sgd_model_2.fit(X=train_x_poly, y=train_y)


In [231]:
# Elastic-net SGD: training accuracy on the polynomial features
train_y_pred_sgd2 = sgd_model_2.predict(X=train_x_poly)
accuracy_score(y_true=train_y, y_pred=train_y_pred_sgd2)

0.5180940892641737

In [232]:
# Elastic-net SGD: test accuracy on the polynomial features
test_y_pred_sgd2 = sgd_model_2.predict(X=test_x_poly)
accuracy_score(y_true=test_y, y_pred=test_y_pred_sgd2)

0.5035161744022504

## LogisticRegression Model:

In [77]:
# Logistic regression on the preprocessed (non-polynomial) features.
logistic_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
logistic_model.fit(X=train_x, y=train_y)


In [78]:
# Logistic regression: training accuracy
train_y_pred_logistic = logistic_model.predict(X=train_x)
accuracy_score(y_true=train_y, y_pred=train_y_pred_logistic)

0.6767189384800965

In [79]:
# Logistic regression: test accuracy
test_y_pred_logistic = logistic_model.predict(X=test_x)
accuracy_score(y_true=test_y, y_pred=test_y_pred_logistic)

0.6272855133614628