In [41]:
import kagglehub
import pandas as pd
from pathlib import Path

# Download latest version
path = kagglehub.dataset_download("mohansacharya/graduate-admissions")
df = pd.read_csv(Path(path, "Admission_Predict.csv"))

print("Path to dataset files:", path)
display(df.head())
display(df.info())

Path to dataset files: C:\Users\zhatz\.cache\kagglehub\datasets\mohansacharya\graduate-admissions\versions\2


Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         400 non-null    int64  
 1   GRE Score          400 non-null    int64  
 2   TOEFL Score        400 non-null    int64  
 3   University Rating  400 non-null    int64  
 4   SOP                400 non-null    float64
 5   LOR                400 non-null    float64
 6   CGPA               400 non-null    float64
 7   Research           400 non-null    int64  
 8   Chance of Admit    400 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 28.2 KB


None

### Data Imputation Preprocessing

In [42]:
## In your analysis of the data consider the following: ##
# 1) Understand the nature of your data (categorical, numerical, time-series, etc.).
# 2) Identify the pattern of missingness (Missing Completely at Random, Missing at Random, Missing Not at Random). 

import pandas as pd
import numpy as np
import random

df_na = df.copy()
random.seed(3)
# replace 10% of data in each column, except "y" columns, with NaN to simulate a data set with missing data
for col in df_na.columns:
    # avoid creating nans in y column "Chance of Admit"
    if col != "Chance of Admit ":
        random_float = random.uniform(0.07, 0.12)
        df_na.loc[df_na.sample(frac=random_float).index, col] = np.nan

print("Path to dataset files:", path)
display(df_na.head())
display(df_na.info())

Path to dataset files: C:\Users\zhatz\.cache\kagglehub\datasets\mohansacharya\graduate-admissions\versions\2


Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
1,2.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
2,3.0,316.0,104.0,3.0,3.0,3.5,8.0,1.0,0.72
3,4.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.8
4,5.0,,103.0,2.0,2.0,3.0,8.21,0.0,0.65


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         367 non-null    float64
 1   GRE Score          361 non-null    float64
 2   TOEFL Score        365 non-null    float64
 3   University Rating  360 non-null    float64
 4   SOP                359 non-null    float64
 5   LOR                371 non-null    float64
 6   CGPA               372 non-null    float64
 7   Research           355 non-null    float64
 8   Chance of Admit    400 non-null    float64
dtypes: float64(9)
memory usage: 28.2 KB


None

#### Simple Imputer for Handling Missing Data

In [43]:
class SimpleImputer:

    def __init__(self, data: pd.DataFrame, method: str = "mean"):
        self.data = data
        self.method = method
        self.numeric_cols = self.data.select_dtypes(include=["number"])
        self.object_cols = self.data.select_dtypes(include=["object"])
    
    def mean_meadian_mode(self):
        for col in self.numeric_cols:
            if self.method == "mean":
                self.data.loc[self.data[col].isna(), col] =  np.nanmean(self.data[col])
            else:
                self.data.loc[self.data[col].isna(), col] =  np.nanmedian(self.data[col])
        
        for col in self.object_cols:
            self.data.loc[self.data[col].isna(), col] =  self.data[col].mode().values[0]
        
        return self.data

In [44]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

imputer = SimpleImputer(df_na.copy(), method="mean")
df_na_imputed = imputer.mean_meadian_mode()

X = df_na_imputed.loc[:, df_na_imputed.columns != "Chance of Admit "]
y = df_na_imputed.loc[:, df_na_imputed.columns == "Chance of Admit "]

regressor = LinearRegression()

simple_only = np.mean(cross_val_score(estimator=regressor, X=X, y=y, scoring="r2"))
simple_only

0.7469658184856826

### Standardization of the Data

In [45]:
from sklearn.preprocessing import StandardScaler

df_scaler = df_na.copy().dropna()

X = df_scaler.loc[:, df_scaler.columns != "Chance of Admit "]
y = df_scaler.loc[:, df_scaler.columns == "Chance of Admit "]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

regressor = LinearRegression()

standard_only = np.mean(cross_val_score(estimator=regressor, X=X_scaled, y=y, scoring="r2"))
standard_only

0.7380096763702925

### Feature Selection

In [46]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

class RecursiveFeatureSelection:

    def __init__(self, X: pd.DataFrame, y: pd.Series, num_features: int, scaled: bool = False):
        self.X = X
        self.y = y
        self.columns = list(self.X.columns)
        self.num_features = num_features
        self.scaled = scaled
        
    def evaluate(self):
        # Create the pipeline
        len_col = len(self.X.columns)
        iterations = (len_col - self.num_features) if self.num_features < len_col else 1
        
        if not self.scaled:
            pipeline = Pipeline([
                ('scaler', StandardScaler()),  # Step 1: Standardize the features
                ('regressor', LinearRegression()) # Step 2: Apply Linear Regression
            ])
        else:
            pipeline = Pipeline([
                ('regressor', LinearRegression()) # Step 2: Apply Linear Regression
            ])

        for _ in range(iterations):
            X = self.X.values
            y = self.y.values
            reg = pipeline.fit(X, y)
            coef = reg["regressor"].coef_
            min_val = np.argmin(coef)
            remove_col = self.columns[min_val]
            self.X = self.X.loc[:, self.X.columns != remove_col]
            self.columns.remove(remove_col)
        
        X = self.X.values
        y = self.y.values
        reg = pipeline.fit(X, y)
        coef = reg["regressor"].coef_
        
        return self.X, coef



In [47]:
df_scaler = df_na.copy().dropna()

X = df_scaler.loc[:, df_scaler.columns != "Chance of Admit "]
y = df_scaler.loc[:, df_scaler.columns == "Chance of Admit "]

selector = RecursiveFeatureSelection(X=X, y=y, num_features=int(X.shape[1]/2), scaled=True)
X_selected, _ = selector.evaluate()

regressor = LinearRegression()

feature_only = np.mean(cross_val_score(estimator=regressor, X=X_selected, y=y, scoring="r2"))
feature_only

0.7406547255177995

In [48]:
df_combined = df_na_imputed.copy().dropna()

X = df_combined.loc[:, df_combined.columns != "Chance of Admit "]
y = df_combined.loc[:, df_combined.columns == "Chance of Admit "]

selector = RecursiveFeatureSelection(X=X, y=y, num_features=int(X.shape[1]/2), scaled=False)
X_selected, _ = selector.evaluate()

regressor = LinearRegression()

all_methods_1 = np.mean(cross_val_score(estimator=regressor, X=X_selected, y=y, scoring="r2"))
all_methods_1

0.7404837750439835

### Advanced Data Imputation

In [49]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

class RandomForestImputer:

    def __init__(self, data: pd.DataFrame, random_state: int = 42):
        self.data = data.copy()
        self.random_state = random_state
        self.encoders = {}   # store label encoders for categorical columns

    def fit_transform(self):
        df_imputed = self.data.copy()

        for col in df_imputed.columns:
            
            if df_imputed[col].isna().sum() == 0:
                continue  # no missing values in this column

            print(f"Imputing column: {col}")

            # Split into observed and missing
            observed = df_imputed[df_imputed[col].notna()]
            missing = df_imputed[df_imputed[col].isna()]

            # Features = all other columns
            features = df_imputed.columns.drop(col)
            X_train = observed[features]
            X_test = missing[features]

            # Encode categoricals for ML
            # observed_enc = self._encode_categoricals(observed[features])
            # missing_enc = self._encode_categoricals(missing[features])

            # Target
            y = observed[col]

            model = RandomForestRegressor(n_estimators=100, random_state=self.random_state)

            # Train
            model.fit(X_train, y)

            # Predict missing values
            preds = model.predict(X_test)

            # Fill in missing values
            df_imputed.loc[df_imputed[col].isna(), col] = preds

        return df_imputed



In [50]:
df_simple = df_na.copy()
df_forest = df_na.copy()

imputer = SimpleImputer(df_simple)
simple_imputed = imputer.mean_meadian_mode()
print("Mean/Median/Mode Imputer:")
display(simple_imputed.describe())

imputer_forest = RandomForestImputer(df_forest)
forest_imputed = imputer_forest.fit_transform()
print("Random Forest Imputer:")
display(forest_imputed.describe())

Mean/Median/Mode Imputer:


Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,200.381471,316.711911,107.243836,3.111111,3.399721,3.46496,8.602715,0.552113,0.72435
std,111.785852,10.943239,5.818323,1.094021,0.951703,0.865728,0.576027,0.469057,0.142609
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,106.75,309.75,103.0,2.0,3.0,3.0,8.2175,0.0,0.64
50%,200.381471,316.711911,107.243836,3.0,3.5,3.5,8.602715,0.552113,0.73
75%,292.25,324.0,111.0,4.0,4.0,4.0,9.04,1.0,0.83
max,400.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


Imputing column: Serial No.
Imputing column: GRE Score
Imputing column: TOEFL Score
Imputing column: University Rating
Imputing column: SOP
Imputing column: LOR 
Imputing column: CGPA
Imputing column: Research
Random Forest Imputer:


Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,201.447045,316.774998,107.247595,3.105748,3.403997,3.462395,8.601608,0.551275,0.72435
std,112.043066,11.248247,5.976368,1.118662,0.969764,0.878817,0.586965,0.479173,0.142609
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,106.75,309.125201,103.0,2.0,2.658819,3.0,8.2175,0.0,0.64
50%,204.5,317.0,107.0,3.0,3.5,3.5,8.57,0.965,0.73
75%,292.25,325.0,111.0,4.0,4.0,4.0,9.060202,1.0,0.83
max,400.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [51]:
df_combined = simple_imputed.copy()

X = df_combined.loc[:, df_combined.columns != "Chance of Admit "]
y = df_combined.loc[:, df_combined.columns == "Chance of Admit "]

selector = RecursiveFeatureSelection(X=X, y=y, num_features=int(X.shape[1]), scaled=False)
X_selected, _ = selector.evaluate()

regressor = LinearRegression()

simple_w_all = np.mean(cross_val_score(estimator=regressor, X=X_selected, y=y, scoring="r2"))
simple_w_all

0.7479969272439024

In [52]:
df_combined = forest_imputed.copy()

X = df_combined.loc[:, df_combined.columns != "Chance of Admit "]
y = df_combined.loc[:, df_combined.columns == "Chance of Admit "]

selector = RecursiveFeatureSelection(X=X, y=y, num_features=int(X.shape[1]), scaled=False)
X_selected, _ = selector.evaluate()

regressor = LinearRegression()

advanced_w_all = np.mean(cross_val_score(estimator=regressor, X=X_selected, y=y, scoring="r2"))
advanced_w_all

0.7703095028448063

In [53]:
results = {
    "Simple Imputation": simple_only,
    "Standardization (after row deletion)": standard_only,
    "Feature Selection": feature_only,
    "Advanced Imputation (Random Forest)": advanced_w_all,
    "All Methods Combined": all_methods_1,
}
pd.DataFrame.from_dict(results, orient="index", columns=["R2 Score"]).round(2)

Unnamed: 0,R2 Score
Simple Imputation,0.75
Standardization (after row deletion),0.74
Feature Selection,0.74
Advanced Imputation (Random Forest),0.77
All Methods Combined,0.74
