In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge,Lasso,RidgeCV,LassoCV,ElasticNet,ElasticNetCV,LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('Admission_Prediction.csv')
df.head(3)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337.0,118.0,4.0,4.5,4.5,9.65,1,0.92
1,2,324.0,107.0,4.0,4.0,4.5,8.87,1,0.76
2,3,,104.0,3.0,3.0,3.5,8.0,1,0.72


In [5]:
df.shape

(500, 9)

In [6]:
from pandas_profiling import ProfileReport

In [7]:
pf = ProfileReport(df)

In [8]:
pf.to_notebook_iframe()

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




# Handling Missing Values

In [9]:
df.isnull().sum()

Serial No.            0
GRE Score            15
TOEFL Score          10
University Rating    15
SOP                   0
LOR                   0
CGPA                  0
Research              0
Chance of Admit       0
dtype: int64

In [10]:
df['GRE Score'] = df['GRE Score'].fillna(df['GRE Score'].mean())

In [11]:
df['TOEFL Score'] = df['TOEFL Score'].fillna(df['TOEFL Score'].mean())

In [12]:
df['University Rating'] = df['University Rating'].fillna(df['University Rating'].mean())

In [13]:
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [14]:
df.drop(columns=['Serial No.'],axis=1,inplace=True)

In [15]:
y = df['Chance of Admit']
x = df.drop(columns=['Chance of Admit'])

In [16]:
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337.000000,118.0,4.0,4.5,4.5,9.65,1
1,324.000000,107.0,4.0,4.0,4.5,8.87,1
2,316.558763,104.0,3.0,3.0,3.5,8.00,1
3,322.000000,110.0,3.0,3.5,2.5,8.67,1
4,314.000000,103.0,2.0,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1
496,337.000000,117.0,5.0,5.0,5.0,9.87,1
497,330.000000,120.0,5.0,4.5,5.0,9.56,1
498,312.000000,103.0,4.0,4.0,5.0,8.43,0


In [17]:
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit, Length: 500, dtype: float64

# Standarization

In [18]:
scaler = StandardScaler()

In [19]:
arr = scaler.fit_transform(x)

In [20]:
df1 = pd.DataFrame(arr)

In [21]:
df1.profile_report()

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))






In [22]:
  df1.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,4.384049e-15,9.521273e-16,3.979039e-16,-8.526513e-17,4.2632560000000003e-17,3.119283e-15,-7.81597e-17
std,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002
min,-2.394225,-2.512331,-1.881441,-2.39795,-2.686789,-2.940115,-1.128152
25%,-0.681409,-0.692731,-0.9946589,-0.8828175,-0.5235128,-0.7430227,-1.128152
50%,5.124333e-15,-0.03105811,-0.1078766,0.1272712,0.01730621,-0.02720919,0.8864053
75%,0.6708143,0.796033,0.7789057,0.6323155,0.5581253,0.7672196,0.8864053
max,2.113186,2.119379,1.665688,1.642404,1.639763,2.223672,0.8864053


In [23]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [24]:
arr.shape

(500, 7)

In [25]:
[variance_inflation_factor( arr , i  )for i in range(arr.shape[1])]

[4.1532675722258245,
 3.792866110594648,
 2.508768242278763,
 2.7757495092534965,
 2.0373076624897517,
 4.651669561154733,
 1.4593106786827286]

In [26]:
vif_df = pd.DataFrame()

In [27]:
vif_df['features'] = x.columns

In [28]:
vif_df['vif'] = [variance_inflation_factor( arr , i  )for i in range(arr.shape[1])]

In [29]:
vif_df

Unnamed: 0,features,vif
0,GRE Score,4.153268
1,TOEFL Score,3.792866
2,University Rating,2.508768
3,SOP,2.77575
4,LOR,2.037308
5,CGPA,4.65167
6,Research,1.459311


In [30]:
 X_train,X_test,y_train,y_test = train_test_split(arr,y,test_size = 0.15,random_state = 100)

In [31]:
X_train

array([[ 0.85111073,  0.46519653, -0.1078766 , ...,  0.01730621,
         0.30380282,  0.88640526],
       [-1.58289124, -1.1889856 , -1.88144112, ..., -1.60515091,
        -1.13609942, -1.12815215],
       [ 0.67081429,  0.63061474, -0.1078766 , ..., -2.14596996,
         0.35345462,  0.88640526],
       ...,
       [-1.04200191, -0.85814918, -0.99465886, ..., -1.06433187,
        -0.65613201, -1.12815215],
       [-0.50111259, -0.85814918, -0.1078766 , ...,  0.55812525,
         0.10519562,  0.88640526],
       [-1.31244657, -0.85814918, -1.88144112, ..., -2.14596996,
        -0.95404281, -1.12815215]])

In [32]:
lr = LinearRegression()

In [33]:
lr.fit(X_train,y_train)

LinearRegression()

In [34]:
import pickle

In [35]:
pickle.dump(lr,open('admission_lr_prediction.pickle','wb'))

In [36]:
df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337.000000,118.0,4.0,4.5,4.5,9.65,1,0.92
1,324.000000,107.0,4.0,4.0,4.5,8.87,1,0.76
2,316.558763,104.0,3.0,3.0,3.5,8.00,1,0.72
3,322.000000,110.0,3.0,3.5,2.5,8.67,1,0.80
4,314.000000,103.0,2.0,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332.000000,108.0,5.0,4.5,4.0,9.02,1,0.87
496,337.000000,117.0,5.0,5.0,5.0,9.87,1,0.96
497,330.000000,120.0,5.0,4.5,5.0,9.56,1,0.93
498,312.000000,103.0,4.0,4.0,5.0,8.43,0,0.73


In [37]:
new = scaler.transform([[337.000000,118.0,4.0,4.5,4.5,9.65,1]])

In [38]:
lr.predict(new)

array([0.95117594])

In [39]:
new = scaler.transform([[327.000000,113.0,4.0,4.5,4.5,9.04,0]])

In [40]:
lr.predict(new)

array([0.82189098])

In [41]:
lr.score(X_test,y_test)

0.8420039560601401

# Regulization

In [42]:
lassocv = LassoCV(alphas=None,cv = 10,max_iter=20000000,normalize=True)
lassocv.fit(X_train,y_train)

LassoCV(cv=10, max_iter=20000000, normalize=True)

In [43]:
alpha = lassocv.alpha_
alpha

3.65639233366671e-05

In [44]:
lasso = Lasso(alpha=alpha)
lasso.fit(X_train,y_train)

Lasso(alpha=3.65639233366671e-05)

In [45]:
 lasso.score(X_test,y_test)

0.8421103469011295

In [47]:
ridge = RidgeCV(alphas= (0.1, 1.0, 10.0) ,cv=10,normalize=True)
ridge.fit(X_train,y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=10, normalize=True)

In [48]:
r_reg = Ridge(alpha= ridge.alpha_)
r_reg.fit(X_train,y_train)

Ridge(alpha=0.1)

In [49]:
r_reg.score(X_test,y_test)

0.842005621044582