In [2]:
import joblib
import numpy as np
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
#Loading the admissions dataset from CSV into a pandas DataFrame
DATA_PATH="admission_data.csv"
df=pd.read_csv(DATA_PATH)

In [33]:
#Boolean mask with the same shape as df: True wherever a cell is missing (NaN/None)
null_mask=df.isna()
null_mask

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
495,False,False,False,False,False,False,False,False
496,False,False,False,False,False,False,False,False
497,False,False,False,False,False,False,False,False
498,False,False,False,False,False,False,False,False


In [34]:
#Summary statistics per numeric column: count, mean, std, min, quartiles and max
summary_stats=df.describe()
summary_stats

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [35]:
#Printing the index range, each column's non-null count and dtype, and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          500 non-null    int64  
 1   TOEFL Score        500 non-null    int64  
 2   University Rating  500 non-null    int64  
 3   SOP                500 non-null    float64
 4   LOR                500 non-null    float64
 5   CGPA               500 non-null    float64
 6   Research           500 non-null    int64  
 7   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 31.4 KB


In [36]:
#Inspecting the raw column names (the output shows trailing spaces in 'LOR ' and 'Chance of Admit ')
df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [4]:
#Stripping the trailing space from the two column names that carry one
column_fixes={
    'LOR ':'LOR',
    'Chance of Admit ':'Chance of Admit',
}
df=df.rename(columns=column_fixes)

In [5]:
#Rebuilding the row index as a fresh 0..n-1 range and discarding the old index.
#No column is removed by this call; reassigning (instead of inplace=True) keeps the
#cell safe to re-run and consistent with the rename cell's df=df.rename(...) style.
df=df.reset_index(drop=True)

In [6]:
#Separating the target column from the predictors (every remaining column)
TARGET_COLUMN='Chance of Admit'
target_y=df[TARGET_COLUMN]
features_X=df.drop(columns=TARGET_COLUMN)

In [7]:
#Holding out 20% of the rows for testing; random_state is fixed so the split is repeatable
X_train,X_test,y_train,y_test=train_test_split(
    features_X,
    target_y,
    test_size=0.2,
    random_state=55,
)

In [8]:
#Standardising the features: the scaler is fitted on the training split only,
#then the same fitted transform is applied to both the training and test splits
scaler=StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [9]:
#Fitting a linear regression on the scaled training features and their targets
lin_model=LinearRegression().fit(X_train,y_train)
#fit() returns the estimator itself, so displaying it gives the same cell output
lin_model

In [12]:
#Testing the model
#Predictions for the whole held-out test set (one value per row of y_test).
#y_pred must stay this length: metrics that compare it with y_test need matching sizes.
y_pred=lin_model.predict(X_test)

#Scoring one hand-written applicant. The values follow the column order of features_X;
#wrapping them in a DataFrame with those column names avoids the "X does not have valid
#feature names" warning from a scaler that was fitted on a DataFrame.
sample=pd.DataFrame([[300,114,3,2.5,2.5,9.65,1]],columns=features_X.columns)
scaled=scaler.transform(sample)
#Kept in its own name so the test-set predictions in y_pred are not overwritten
sample_pred=lin_model.predict(scaled)
scaled



array([[-1.44187397,  1.14260801, -0.09827306, -0.86887588, -1.07835385,
         1.7846559 ,  0.89997486]])

In [44]:
#MSE for the Linear Regression model
#The test-set predictions are computed here rather than read from y_pred, so this cell
#does not depend on whatever an earlier cell last stored under that name and both
#arguments are guaranteed to have one entry per test row.
test_predictions=lin_model.predict(X_test)
#NOTE(review): the x100 factor is kept from the original. MSE is in squared target
#units, so the scaled value is not a percentage - confirm this is the intended figure.
mse=mean_squared_error(y_test,test_predictions)*100
mse

In [45]:
#R^2 (coefficient of determination) on the test split, scaled to a percentage.
#For a regressor, score() returns R^2 rather than a classification accuracy.
r2_percent=lin_model.score(X_test,y_test)*100
r2_percent

83.83832685579102

In [46]:
#Persisting the fitted regression model to disk with joblib (pickle-based format;
#only load such files from sources you trust)
MODEL_FILE='GAP LinReg'
joblib.dump(lin_model,MODEL_FILE)

['GAP LinReg']

In [13]:
#Persisting the fitted StandardScaler to disk with joblib, next to the model file
SCALER_FILE='Scaler'
joblib.dump(scaler,SCALER_FILE)

['Scaler']