In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
import pickle

#LassoCV ---> RidgeCV ---> ElasticNetCV ---> #Here CV means cross validation

###### pickle is a module in Python that is used to serialize and deserialize Python objects. Serialization is the process of converting a Python object into a byte stream, allowing you to save it to a file or send it over a network. Deserialization is the reverse process, where you convert the byte stream back into a Python object.

In [4]:
df = pd.read_csv("Admission_Predict.csv")
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [5]:
yf = ProfileReport(df)
yf.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
#usig Imputation handle missing values
df["GRE Score"] = df["GRE Score"].fillna(df["GRE Score"].mean())
df["TOEFL Score"] = df["TOEFL Score"].fillna(df["TOEFL Score"].mean())
df["University Rating"] = df["University Rating"].fillna(df["University Rating"].mean())

In [7]:
df.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,144.481833,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,125.75,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,250.5,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,375.25,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [8]:
df.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

In [9]:
#drop the Serial No. columns
df.drop(columns = ["Serial No."], inplace = True)

In [10]:
df

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.00,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.80
4,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1,0.87
496,337,117,5,5.0,5.0,9.87,1,0.96
497,330,120,5,4.5,5.0,9.56,1,0.93
498,312,103,4,4.0,5.0,8.43,0,0.73


In [11]:
#create label columns
y = df["Chance of Admit "]
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
495    0.87
496    0.96
497    0.93
498    0.73
499    0.84
Name: Chance of Admit , Length: 500, dtype: float64

In [12]:
#create features columns
x = df.drop(columns = ["Chance of Admit "])
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1
2,316,104,3,3.0,3.5,8.00,1
3,322,110,3,3.5,2.5,8.67,1
4,314,103,2,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1
496,337,117,5,5.0,5.0,9.87,1
497,330,120,5,4.5,5.0,9.56,1
498,312,103,4,4.0,5.0,8.43,0


In [13]:
#we will perfom some normalization and standardization 
scaler = StandardScaler()

In [14]:
arr = scaler.fit_transform(x)
arr

array([[ 1.81923762,  1.77886545,  0.77558214, ...,  1.09894429,
         1.77680627,  0.88640526],
       [ 0.66714832, -0.03160087,  0.77558214, ...,  1.09894429,
         0.48585943,  0.88640526],
       [-0.0418297 , -0.52536441, -0.09979274, ...,  0.01730621,
        -0.95404281,  0.88640526],
       ...,
       [ 1.19888185,  2.10804114,  1.65095702, ...,  1.63976333,
         1.62785086,  0.88640526],
       [-0.39631872, -0.68995225,  0.77558214, ...,  1.63976333,
        -0.24236699, -1.12815215],
       [ 0.93301508,  0.95592621,  0.77558214, ...,  1.09894429,
         0.76721964, -1.12815215]])

In [15]:
df1 = pd.DataFrame(arr)
df1

Unnamed: 0,0,1,2,3,4,5,6
0,1.819238,1.778865,0.775582,1.137360,1.098944,1.776806,0.886405
1,0.667148,-0.031601,0.775582,0.632315,1.098944,0.485859,0.886405
2,-0.041830,-0.525364,-0.099793,-0.377773,0.017306,-0.954043,0.886405
3,0.489904,0.462163,-0.099793,0.127271,-1.064332,0.154847,0.886405
4,-0.219074,-0.689952,-0.975168,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.376126,0.132987,1.650957,1.137360,0.558125,0.734118,0.886405
496,1.819238,1.614278,1.650957,1.642404,1.639763,2.140919,0.886405
497,1.198882,2.108041,1.650957,1.137360,1.639763,1.627851,0.886405
498,-0.396319,-0.689952,0.775582,0.632315,1.639763,-0.242367,-1.128152


In [16]:
yf2 = ProfileReport(df1)
yf2.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [17]:
df1.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,1.762146e-15,1.136868e-15,1.421085e-16,-8.526513e-17,4.2632560000000003e-17,3.119283e-15,-7.81597e-17
std,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002
min,-2.346008,-2.500419,-1.850542,-2.39795,-2.686789,-2.940115,-1.128152
25%,-0.7508077,-0.6899523,-0.9751676,-0.8828175,-0.5235128,-0.7430227,-1.128152
50%,0.04679255,-0.03160087,-0.09979274,0.1272712,0.01730621,-0.02720919,0.8864053
75%,0.7557706,0.7913384,0.7755821,0.6323155,0.5581253,0.7672196,0.8864053
max,2.085104,2.108041,1.650957,1.642404,1.639763,2.223672,0.8864053


In [18]:
#check multicollinearity using VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [19]:
arr.shape

(500, 7)

In [20]:
arr.shape[0] #it will give rows

500

In [21]:
arr.shape[1] #it will give columns

7

In [22]:
vif_df = pd.DataFrame()
vif_df["vif"] = [variance_inflation_factor(arr, i) for i in range(arr.shape[1])]
vif_df["feature"] = x.columns
vif_df
#so there is no multicollinearity

Unnamed: 0,vif,feature
0,4.464249,GRE Score
1,3.904213,TOEFL Score
2,2.621036,University Rating
3,2.83521,SOP
4,2.033555,LOR
5,4.777992,CGPA
6,1.494008,Research


In [23]:
arr

array([[ 1.81923762,  1.77886545,  0.77558214, ...,  1.09894429,
         1.77680627,  0.88640526],
       [ 0.66714832, -0.03160087,  0.77558214, ...,  1.09894429,
         0.48585943,  0.88640526],
       [-0.0418297 , -0.52536441, -0.09979274, ...,  0.01730621,
        -0.95404281,  0.88640526],
       ...,
       [ 1.19888185,  2.10804114,  1.65095702, ...,  1.63976333,
         1.62785086,  0.88640526],
       [-0.39631872, -0.68995225,  0.77558214, ...,  1.63976333,
        -0.24236699, -1.12815215],
       [ 0.93301508,  0.95592621,  0.77558214, ...,  1.09894429,
         0.76721964, -1.12815215]])

In [72]:
#split the data set
x_train,x_test, y_train, y_test = train_test_split(arr, y, test_size = 0.20, random_state = 100) #it will return 4 tuples
#using random_state we can fixed the randomness

In [73]:
x_train

array([[-1.90289703, -1.34830364, -0.97516761, ..., -0.52351283,
        -1.53331383, -1.12815215],
       [-0.48494097, -1.01912795, -0.09979274, ..., -1.06433187,
        -1.54986443,  0.88640526],
       [ 1.37612635,  1.77886545,  1.65095702, ...,  1.63976333,
         1.47889546,  0.88640526],
       ...,
       [-1.01667449, -0.8545401 , -0.97516761, ..., -1.06433187,
        -0.65613201, -1.12815215],
       [-0.48494097, -0.8545401 , -0.09979274, ...,  0.55812525,
         0.10519562,  0.88640526],
       [-1.28254125, -0.8545401 , -1.85054249, ..., -2.14596996,
        -0.95404281, -1.12815215]])

In [74]:
model = LinearRegression()

In [75]:
model.fit(x_train, y_train)

In [76]:
pickle.dump(model, open("admission_model.pickle", "wb"))

In [77]:
!dir

 Volume in drive G is programming
 Volume Serial Number is A6C7-630C

 Directory of G:\Data Science\iNeuron Full Stack Data Science Full Course\iNeuron Full Stack Data Science Full Course - Repo\05 Machine Learning\Lecture 05 - Build Model With Regularization

15/08/2024  17:56    <DIR>          .
13/08/2024  08:25    <DIR>          ..
13/08/2024  17:46    <DIR>          .ipynb_checkpoints
15/08/2024  17:56               515 admission_model.pickle
14/08/2024  09:44            15,677 Admission_Predict.csv
15/08/2024  17:56            59,056 Untitled.ipynb
               3 File(s)         75,248 bytes
               3 Dir(s)  63,771,607,040 bytes free


In [78]:
test1 = [[1.819238, 1.778865, 0.775582, 1.137360, 1.098944, 1.776806, 0.886405]]

In [79]:
model.predict(test1)

array([0.95090519])

In [80]:
model.score(x_test, y_test)

0.8305208734305329

In [81]:
#Lets create method to adjusted R-Squared
def adj_r2(x, y):
    r2 = model.score(x, y)
    n = x.shape[0]
    p = x.shape[1]
    adjust_r2 = 1 - (1 - r2) * (n - 1) / (n - p -1)
    return adjust_r2

In [82]:
adj_r2(x_test, y_test)

0.8176257224958996

In [83]:
model.intercept_

0.7187950523329237

In [84]:
model.coef_

array([ 0.01727237,  0.01888738,  0.00569533, -0.00100666,  0.0178151 ,
        0.07453718,  0.0133137 ])

In [99]:
#Apply regularization
lassocv = LassoCV(alphas = None, cv = 50, max_iter = 2000000000)
lassocv.fit(x_train, y_train)

In [100]:
lassocv.alpha_

0.0004916054994787067

In [101]:
lasso = Lasso(alpha = lassocv.alpha_)
lasso.fit(x_train, y_train)

In [102]:
lasso.score(x_test, y_test)

0.8313049849138283

In [114]:
#lets use Ridge
ridgecv = RidgeCV(alphas = np.random.uniform(0, 10, 50), cv = 10)
ridgecv.fit(x_train, y_train)

In [115]:
ridgecv.alpha_

7.2272923159389375

In [116]:
np.random.uniform(0, 10, 50)

array([0.2057216 , 6.91161111, 0.19816443, 6.09866642, 7.09665168,
       6.47054516, 5.33457892, 1.86022956, 9.26983058, 7.90314631,
       9.92679927, 9.27345102, 5.1223807 , 3.58220763, 7.44138249,
       4.48033116, 2.95378242, 9.95387961, 1.02614095, 7.8048515 ,
       2.1247792 , 3.57908691, 2.50762417, 6.67135561, 9.19134084,
       3.94284843, 0.08232187, 1.03383986, 3.5116808 , 5.15600618,
       9.5912849 , 3.40055603, 7.31851438, 9.78434479, 5.37883163,
       0.32040662, 7.33578808, 8.6785938 , 9.73431371, 9.03533535,
       2.55144581, 6.38748923, 9.01269461, 6.30071882, 2.09510755,
       8.63389775, 0.72467696, 0.48155768, 1.5663945 , 1.94233668])

In [117]:
ridge_lr = Ridge(alpha = ridgecv.alpha_)
ridge_lr.fit(x_train, y_train)

In [119]:
ridge_lr.score(x_test, y_test)

0.8326728347443222

In [124]:
#lest elasticnet
elastic = ElasticNetCV(alphas = None, cv = 10)
elastic.fit(x_train, y_train)

In [125]:
elastic.alpha_

0.0021182638837747543

In [126]:
elastic.l1_ratio_

0.5

In [128]:
elastic_lr = ElasticNet(alpha = elastic.alpha_, l1_ratio = elastic.l1_ratio)
elastic_lr.fit(x_train, y_train)

In [129]:
elastic_lr.score(x_test, y_test)

0.8317238086196315