In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [51]:
# Load the admissions dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("university_admission.csv")

In [52]:
# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [53]:
# Summary statistics per numeric column (count, mean, std, quartiles, min/max).
df.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,200.5,316.8075,107.41,3.0875,3.4,3.4525,8.598925,0.5475,0.72435
std,115.614301,11.473646,6.069514,1.143728,1.006869,0.898478,0.596317,0.498362,0.142609
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,100.75,308.0,103.0,2.0,2.5,3.0,8.17,0.0,0.64
50%,200.5,317.0,107.0,3.0,3.5,3.5,8.61,1.0,0.73
75%,300.25,325.0,112.0,4.0,4.0,4.0,9.0625,1.0,0.83
max,400.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [54]:
# Column dtypes and non-null counts, used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         400 non-null    int64  
 1   GRE Score          400 non-null    int64  
 2   TOEFL Score        400 non-null    int64  
 3   University Rating  400 non-null    int64  
 4   SOP                400 non-null    float64
 5   LOR                400 non-null    float64
 6   CGPA               400 non-null    float64
 7   Research           400 non-null    int64  
 8   Chance of Admit    400 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 28.2 KB


In [55]:
# List the exact column labels. Gotcha: 'LOR ' and 'Chance of Admit ' end with a
# trailing space, so later cells must spell them with that space included.
df.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

In [56]:
# Target: the admission probability. The trailing space is part of the column label.
y = df.loc[:, 'Chance of Admit ']
# Features: every column except the row identifier and the target.
X = df.drop(columns=['Serial No.', 'Chance of Admit '])

In [57]:
# Display the target series as a quick sanity check (pandas truncates long output).
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
395    0.82
396    0.84
397    0.91
398    0.67
399    0.95
Name: Chance of Admit , Length: 400, dtype: float64

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [59]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler

In [60]:
# Apply MinMaxScaler to every feature column.
# 'LOR ' is spelled with its trailing space to match the DataFrame's column label.
clf_ct = make_column_transformer(
    (
        MinMaxScaler(),
        [
            'GRE Score',
            'TOEFL Score',
            'University Rating',
            'SOP',
            'LOR ',
            'CGPA',
            'Research',
        ],
    )
)

In [61]:
# Fit the scaler on the training split only, so the test split does not
# influence the scaling parameters.
clf_ct.fit(X_train)

In [62]:
# Apply the scaling learned from the training split to both splits.
X_train_normal = clf_ct.transform(X_train)
X_test_normal = clf_ct.transform(X_test)

Model 1: Simple Linear Regression

In [63]:
from sklearn.linear_model import LinearRegression
# Baseline model: linear regression with default settings, trained on the scaled features.
model1 = LinearRegression()
model1.fit(X_train_normal, y_train)

In [64]:
# Predict on the held-out test split.
y_pred_model1 = model1.predict(X_test_normal)

In [65]:
# Side-by-side table of predictions and true values for manual inspection.
# NOTE(review): this frame is not referenced again in the visible cells.
df_y_pred_test = pd.DataFrame(data={'predictions': y_pred_model1, 'actual': y_test})

In [66]:
from sklearn.metrics import max_error, mean_absolute_error, r2_score

In [67]:
def metrics_summary(y_true, y_pred):
  '''Print the main evaluation metrics for a regression model's predictions.

  Args:
    y_true: Ground-truth target values.
    y_pred: Predicted target values, in the same order as ``y_true``.

  Prints the maximum error, mean squared error (MSE), mean absolute error
  (MAE) and R2 score, one per line. Nothing is returned.
  '''
  # Imported locally so the function is self-contained: the notebook-level
  # metrics import does not include mean_squared_error.
  from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error, r2_score
  print("Maximum Error:", max_error(y_true, y_pred))
  print("MSE:", mean_squared_error(y_true, y_pred))
  print("MAE:", mean_absolute_error(y_true, y_pred))
  print("R2-Score:", r2_score(y_true, y_pred))



In [68]:
# Test-set metrics for the linear regression baseline.
metrics_summary(y_test, y_pred_model1)

Maximum Error: 0.2700883272163071
MSE: 0.004617003377285013
MAE: 0.04795673362091198
R2-Score: 0.8212082591486991


Model 2
Decision Tree

In [69]:
from sklearn.tree import DecisionTreeRegressor

# A depth-limited regression tree; the fixed seed makes the feature sub-sampling repeatable.
model2 = DecisionTreeRegressor(
    criterion="squared_error",
    splitter='best',
    max_depth=5,
    min_samples_split=10,
    max_features=6,
    random_state=42,
)
model2.fit(X_train_normal, y_train)

In [70]:
# Predict on the held-out test split with the decision tree.
y_pred_model2 = model2.predict(X_test_normal)

In [71]:
# Test-set metrics for the decision tree, to compare against the linear baseline.
metrics_summary(y_true=y_test, y_pred=y_pred_model2)

Maximum Error: 0.2960000000000001
MSE: 0.005408997133305893
MAE: 0.0519019969158801
R2-Score: 0.7905385951239781


Model 3: Random Forest

In [158]:
from sklearn.ensemble import RandomForestRegressor

# A small forest of shallow trees; the seed fixes bootstrap and feature sampling.
model3 = RandomForestRegressor(
    n_estimators=10,
    max_depth=5,
    max_features=6,
    min_samples_leaf=5,
    min_samples_split=5,
    random_state=42,
)

In [159]:
# Train the random forest on the scaled training features.
model3.fit(X_train_normal, y_train)

In [160]:
# Predict on the held-out test split with the random forest.
y_pred_model3 = model3.predict(X_test_normal)

In [161]:
# Test-set metrics for the hand-configured random forest.
metrics_summary(y_test, y_pred_model3)

Maximum Error: 0.23947000059119622
MSE: 0.004414663116247703
MAE: 0.04689880787361319
R2-Score: 0.8290438106003509


In [162]:
# Show the forest's full parameter set, including defaults, before choosing what to tune.
model3.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 5,
 'max_features': 6,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [184]:
# Hyper-parameter grid for the random-forest search:
# 2 * 3 * 3 * 10 = 180 candidate combinations.
# NOTE(review): the base estimator `model3` is created with min_samples_leaf=5, so a
# node presumably needs at least 10 samples before any split is valid. If so, the
# min_samples_split values below (all < 10) yield identical trees -- confirm, and
# consider searching values >= 10 or tuning min_samples_leaf instead.
parameter_list = [
    dict(
        max_depth=[3, 5],
        max_features=[4, 5, 6],
        min_samples_split=[3, 5, 9],
        n_estimators=range(10, 101, 10),
    )
]

In [194]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over `parameter_list`, ranking candidates by R2.
# `cv` is not passed, so scikit-learn's default cross-validation splitting is used.
model_forest_cv = GridSearchCV(estimator=model3, param_grid=parameter_list, scoring='r2')

In [195]:
# Run the grid search on the training split only; the test split stays untouched.
model_forest_cv.fit(X_train_normal, y_train)

In [196]:
# Report the winning parameter combination, the corresponding estimator, and its
# cross-validated score (R2, because the search was configured with scoring='r2').
print(model_forest_cv.best_params_)
print(model_forest_cv.best_estimator_)
print(model_forest_cv.best_score_)

{'max_depth': 5, 'max_features': 4, 'min_samples_split': 3, 'n_estimators': 100}
RandomForestRegressor(max_depth=5, max_features=4, min_samples_leaf=5,
                      min_samples_split=3, random_state=42)
0.7703624296180791


In [197]:
# Predict on the test split through the fitted search object.
# NOTE(review): relies on GridSearchCV's default refit behaviour to use the best estimator -- confirm.
y_pred_cv = model_forest_cv.predict(X_test_normal)

In [198]:
# Test-set metrics for the tuned random forest, for comparison with the models above.
metrics_summary(y_test, y_pred_cv)

Maximum Error: 0.26859245798199954
MSE: 0.004639862510585589
MAE: 0.04736537502686723
R2-Score: 0.8203230477024026
