In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
import pickle

#LassoCV ---> RidgeCV ---> ElasticNetCV ---> #Here CV means cross validation

###### pickle is a module in Python that is used to serialize and deserialize Python objects. Serialization is the process of converting a Python object into a byte stream, allowing you to save it to a file or send it over a network. Deserialization is the reverse process, where you convert the byte stream back into a Python object.

In [4]:
df = pd.read_csv("Admission_Predict.csv")
df

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [15]:
yf = ProfileReport(df)
yf.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [6]:
#usig Imputation handle missing values
df["GRE Score"] = df["GRE Score"].fillna(df["GRE Score"].mean())
df["TOEFL Score"] = df["TOEFL Score"].fillna(df["TOEFL Score"].mean())
df["University Rating"] = df["University Rating"].fillna(df["University Rating"].mean())

In [7]:
df.describe()

In [8]:
df.isnull().sum()

In [9]:
#drop the Serial No. columns
df.drop(columns = ["Serial No."], inplace = True)

In [10]:
df

In [11]:
#create label columns
y = df["Chance of Admit "]
y

In [50]:
#create features columns
x = df.drop(columns = ["Chance of Admit "])
x

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
0,337,118,4,4.5,4.5,9.65,1
1,324,107,4,4.0,4.5,8.87,1
2,316,104,3,3.0,3.5,8.00,1
3,322,110,3,3.5,2.5,8.67,1
4,314,103,2,2.0,3.0,8.21,0
...,...,...,...,...,...,...,...
495,332,108,5,4.5,4.0,9.02,1
496,337,117,5,5.0,5.0,9.87,1
497,330,120,5,4.5,5.0,9.56,1
498,312,103,4,4.0,5.0,8.43,0


In [13]:
#we will perfom some normalization and standardization 
scaler = StandardScaler()

In [19]:
arr = scaler.fit_transform(x)
arr

array([[ 1.81923762,  1.77886545,  0.77558214, ...,  1.09894429,
         1.77680627,  0.88640526],
       [ 0.66714832, -0.03160087,  0.77558214, ...,  1.09894429,
         0.48585943,  0.88640526],
       [-0.0418297 , -0.52536441, -0.09979274, ...,  0.01730621,
        -0.95404281,  0.88640526],
       ...,
       [ 1.19888185,  2.10804114,  1.65095702, ...,  1.63976333,
         1.62785086,  0.88640526],
       [-0.39631872, -0.68995225,  0.77558214, ...,  1.63976333,
        -0.24236699, -1.12815215],
       [ 0.93301508,  0.95592621,  0.77558214, ...,  1.09894429,
         0.76721964, -1.12815215]])

In [20]:
df1 = pd.DataFrame(arr)
df1

Unnamed: 0,0,1,2,3,4,5,6
0,1.819238,1.778865,0.775582,1.137360,1.098944,1.776806,0.886405
1,0.667148,-0.031601,0.775582,0.632315,1.098944,0.485859,0.886405
2,-0.041830,-0.525364,-0.099793,-0.377773,0.017306,-0.954043,0.886405
3,0.489904,0.462163,-0.099793,0.127271,-1.064332,0.154847,0.886405
4,-0.219074,-0.689952,-0.975168,-1.387862,-0.523513,-0.606480,-1.128152
...,...,...,...,...,...,...,...
495,1.376126,0.132987,1.650957,1.137360,0.558125,0.734118,0.886405
496,1.819238,1.614278,1.650957,1.642404,1.639763,2.140919,0.886405
497,1.198882,2.108041,1.650957,1.137360,1.639763,1.627851,0.886405
498,-0.396319,-0.689952,0.775582,0.632315,1.639763,-0.242367,-1.128152


In [26]:
yf2 = ProfileReport(df1)
yf2.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [27]:
df1.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,1.762146e-15,1.136868e-15,1.421085e-16,-8.526513e-17,4.2632560000000003e-17,3.119283e-15,-7.81597e-17
std,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002,1.001002
min,-2.346008,-2.500419,-1.850542,-2.39795,-2.686789,-2.940115,-1.128152
25%,-0.7508077,-0.6899523,-0.9751676,-0.8828175,-0.5235128,-0.7430227,-1.128152
50%,0.04679255,-0.03160087,-0.09979274,0.1272712,0.01730621,-0.02720919,0.8864053
75%,0.7557706,0.7913384,0.7755821,0.6323155,0.5581253,0.7672196,0.8864053
max,2.085104,2.108041,1.650957,1.642404,1.639763,2.223672,0.8864053


In [31]:
#check multicollinearity using VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [33]:
arr.shape

(500, 7)

In [34]:
arr.shape[0] #it will give rows

500

In [35]:
arr.shape[1] #it will give columns

7

In [52]:
vif_df = pd.DataFrame()
vif_df["vif"] = [variance_inflation_factor(arr, i) for i in range(arr.shape[1])]
vif_df["feature"] = x.columns
vif_df
#so there is no multicollinearity

Unnamed: 0,vif,feature
0,4.464249,GRE Score
1,3.904213,TOEFL Score
2,2.621036,University Rating
3,2.83521,SOP
4,2.033555,LOR
5,4.777992,CGPA
6,1.494008,Research


In [53]:
arr

array([[ 1.81923762,  1.77886545,  0.77558214, ...,  1.09894429,
         1.77680627,  0.88640526],
       [ 0.66714832, -0.03160087,  0.77558214, ...,  1.09894429,
         0.48585943,  0.88640526],
       [-0.0418297 , -0.52536441, -0.09979274, ...,  0.01730621,
        -0.95404281,  0.88640526],
       ...,
       [ 1.19888185,  2.10804114,  1.65095702, ...,  1.63976333,
         1.62785086,  0.88640526],
       [-0.39631872, -0.68995225,  0.77558214, ...,  1.63976333,
        -0.24236699, -1.12815215],
       [ 0.93301508,  0.95592621,  0.77558214, ...,  1.09894429,
         0.76721964, -1.12815215]])

In [67]:
#split the data set
x_train,x_test, y_train, y_test = train_test_split(arr, y, test_size = 0.25, random_state = 345) #it will return 4 tuples
#using random_state we can fixed the randomness

In [68]:
x_train

array([[-0.39631872, -0.68995225,  0.77558214, ...,  1.63976333,
        -0.24236699, -1.12815215],
       [-1.63703027, -1.01912795,  0.77558214, ...,  1.09894429,
        -1.46711143,  0.88640526],
       [-0.39631872,  0.13298698, -0.09979274, ..., -0.52351283,
        -0.07686099, -1.12815215],
       ...,
       [ 0.48990382,  1.28510191,  1.65095702, ...,  1.09894429,
         1.29683885,  0.88640526],
       [-1.63703027, -1.67747933, -0.97516761, ..., -0.52351283,
        -2.26154025, -1.12815215],
       [ 1.73061537,  1.94345329,  1.65095702, ...,  0.01730621,
         2.02506527,  0.88640526]])

In [70]:
model = LinearRegression()

In [71]:
model.fit(x_train, y_train)

In [73]:
pickle.dump(model, open("admission_model.pickle", "wb"))

In [74]:
!dir

 Volume in drive G is programming
 Volume Serial Number is A6C7-630C

 Directory of G:\Data Science\iNeuron Full Stack Data Science Full Course\iNeuron Full Stack Data Science Full Course - Repo\05 Machine Learning\Lecture 05 - Build Model With Regularization

15/08/2024  08:24    <DIR>          .
13/08/2024  08:25    <DIR>          ..
13/08/2024  17:46    <DIR>          .ipynb_checkpoints
15/08/2024  08:24               515 admission_model.pickle
14/08/2024  09:44            15,677 Admission_Predict.csv
15/08/2024  08:24            43,406 Untitled.ipynb
               3 File(s)         59,598 bytes
               3 Dir(s)  63,772,721,152 bytes free
