In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Hypothesis
def h(x, theta):
    """Linear hypothesis: return the dot product of ``x`` with ``theta``.

    Both arguments are handed straight to ``np.dot``, so anything it accepts
    (arrays, lists, DataFrames convertible to arrays) works here.
    """
    prediction = np.dot(x, theta)
    return prediction

In [57]:
# Cost function
def J(X,y,theta):
    """Half mean-squared-error cost of the hypothesis ``h`` on ``(X, y)``.

    Computes ``(1 / (2*m)) * sum((h(X, theta) - y) ** 2)`` with ``m = len(X)``.

    Parameters:
        X     -- feature matrix, one row per sample (anything ``h`` accepts).
        y     -- target values; any array-like, flattened to 1-D internally.
        theta -- parameter vector passed through to ``h``.

    Returns:
        The scalar cost.

    Raises:
        ZeroDivisionError if ``X`` has no rows.
    """
    #constant
    c = (1/(2*len(X)))
    # Flatten both sides so the subtraction is always element-wise. Without
    # this, a column-vector theta makes h() return shape (m, 1), which would
    # broadcast against a flat y into an (m, m) matrix. np.asarray also lets
    # y be a list or pandas Series, not just an ndarray.
    residual = np.asarray(h(X,theta)).reshape(-1) - np.asarray(y).reshape(-1)
    # Sum of squared residuals (dot product of the vector with itself).
    return c*np.dot(residual,residual)

In [58]:
# Gradient Descent
def gradient(X,y,theta,alpha):
    """Perform one batch gradient-descent step and return the updated theta.

    Computes ``theta - (alpha / m) * X^T (h(X, theta) - y)`` with ``m = len(X)``.

    Parameters:
        X     -- feature matrix exposing ``.transpose()`` (ndarray or DataFrame).
        y     -- target values; any array-like, flattened to 1-D internally.
        theta -- current parameters (list or array).
        alpha -- learning rate.

    Returns:
        The updated parameters as an ndarray with the same shape as ``theta``.

    Raises:
        ZeroDivisionError if ``X`` has no rows.
    """
    #constant
    c= (alpha/len(X))
    theta_arr = np.asarray(theta)
    # Flatten predictions and targets so the residual is element-wise even
    # when theta is a column vector or y is a list / pandas Series.
    residual = np.asarray(h(X,theta)).reshape(-1) - np.asarray(y).reshape(-1)
    grad = np.asarray(np.dot(X.transpose(),residual))
    # Reshape the step to theta's own shape so a column-vector theta is not
    # broadcast into a square matrix by the subtraction.
    return theta_arr - (c*grad.reshape(theta_arr.shape))

In [5]:
# Normalisation
def normalisation(X, columns):
    """Min-max scale the given columns of ``X`` into the range [0, 1].

    Parameters:
        X       -- pandas DataFrame holding the data.
        columns -- iterable of column labels to scale.

    Returns:
        A new DataFrame with the listed columns rescaled as
        ``(value - min) / (max - min)``; all other columns are copied as-is.
        The input frame is left untouched, so callers must use the return
        value.

    A column whose values are all equal has ``max - min == 0``; it is mapped
    to 0.0 rather than producing NaN from a 0/0 division.
    """
    scaled = X.copy()
    for column in columns:
        col_min = X[column].min()
        col_range = X[column].max() - col_min
        if col_range == 0:
            # Constant column: every value sits at the minimum.
            scaled[column] = 0.0
        else:
            scaled[column] = (X[column] - col_min) / col_range
    return scaled

### Load Data and train model

In [64]:
# Load the admissions data set; the path is relative to the notebook's working directory.
df = pd.read_csv("Admission_Predict_Ver1.1.csv")

In [65]:
# Show two randomly chosen rows as a quick sanity check of the load.
df.sample(2)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
225,226,296,99,2,2.5,2.5,8.03,0,0.61
297,298,320,120,3,4.0,4.5,9.11,0,0.86


In [66]:
# List the column labels. Note that 'LOR ' and 'Chance of Admit ' carry a
# trailing space, which must be included when selecting them.
df.columns

Index(['Serial No.', 'GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
       'LOR ', 'CGPA', 'Research', 'Chance of Admit '],
      dtype='object')

We don't need 'Serial No.' as a feature, and 'Chance of Admit ' is the target we want to predict, so both are dropped from X.


In [78]:
# Feature matrix: every column except the row identifier and the target.
X = df.drop(columns=['Serial No.', 'Chance of Admit '])
# Target vector as a NumPy array, multiplied by 100 (the sample rows show
# fractions such as 0.61, so this presumably yields percentages).
y = df['Chance of Admit '].to_numpy() * 100

In [79]:
# Peek at two random feature rows before any scaling.
X.sample(2)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
268,327,113,4,4.5,5.0,9.14,0
448,312,109,2,2.5,4.0,9.02,0


Let's min-max normalise the 'GRE Score', 'TOEFL Score' and 'CGPA' columns.

In [80]:
# Columns to min-max scale with normalisation().
# NOTE(review): the min/max used for scaling come from the full data set,
# before the train/test split further down, and 'University Rating', 'SOP'
# and 'LOR ' stay on their original scale — confirm both are intended.
columns = ['GRE Score', 'TOEFL Score','CGPA']
X = normalisation(X,columns)

In [81]:
# Peek at two random rows to confirm the selected columns are now scaled.
X.sample(2)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
408,0.14,0.321429,3,2.0,4.0,0.278846,1
282,0.44,0.5,3,4.0,3.5,0.637821,1


Let's add the bias (intercept) column "X_0", whose value is 1 for every row, to X.

In [82]:
# Bias/intercept term: a column of ones placed first in X.
ones_array = np.ones(len(X), dtype=int)
# Guard the insert so re-running this cell does not add a second "X_0" column
# (the previous allow_duplicates=True argument permitted exactly that).
if "X_0" not in X.columns:
    X.insert(0, "X_0", ones_array)

In [83]:
# Peek at two random rows to confirm the "X_0" column is in place.
X.sample(2)

Unnamed: 0,X_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
378,1,0.26,0.214286,1,2.0,2.5,0.272436,0
89,1,0.52,0.607143,4,4.5,3.5,0.628205,1


In [84]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [85]:
from time import process_time

In [87]:
# Train theta by batch gradient descent, stopping once an iteration lowers the
# cost by less than `threshold`.
start = process_time()
# Initialize theta
theta = [0]*len(X_train.columns)
alpha = 0.005      # learning rate
threshold=0.1      # minimum per-iteration cost decrease required to continue

# Cost at the starting point. It also seeds cost_dif so the loop is entered
# (as in the original logic, the first comparison uses the cost itself).
current_cost = J(X_train,y_train,theta)
cost_dif = current_cost
print(f"Intial Cost: {cost_dif}")
my_iter = 0
# cost_dict["I_0"] holds the initial cost; cost_dict["I_k"] (k >= 1) holds the
# cost decrease achieved by iteration k. Previously the first iteration reused
# the key "I_0" and overwrote the initial cost.
cost_dict={}
cost_dict["I_"+str(my_iter)] = current_cost

while cost_dif>=threshold:
    theta = gradient(X_train, y_train, theta, alpha)
    new_cost = J(X_train,y_train, theta)
    # current_cost is the cost before this update; it is carried over from the
    # previous iteration instead of being recomputed with a second J() call.
    cost_dif = -new_cost+current_cost
    current_cost = new_cost

    my_iter += 1
    cost_dict["I_"+str(my_iter)] = cost_dif

    print()
    print(f"Iteration: {my_iter}")
    print(f"New_cost: {new_cost}")
    print(f"cost difference: {cost_dif}")

end = process_time()
# process_time() reports CPU seconds; convert to milliseconds.
execution_time = (end-start)*1000

print(f"Execution time: {execution_time}")
print(f"theta: {theta}")

Intial Cost: 2741.2200000000003

Iteration: 1
New_cost: 1803.536465562745
cost difference: 937.6835344372553

Iteration: 2
New_cost: 1194.666514354635
cost difference: 608.86995120811

Iteration: 3
New_cost: 799.3013538757673
cost difference: 395.36516047886767

Iteration: 4
New_cost: 542.5688457291143
cost difference: 256.732508146653

Iteration: 5
New_cost: 375.8531212430477
cost difference: 166.71572448606656

Iteration: 6
New_cost: 267.58699480825493
cost difference: 108.2661264347928

Iteration: 7
New_cost: 197.27330703026547
cost difference: 70.31368777798946

Iteration: 8
New_cost: 151.60287191715722
cost difference: 45.670435113108255

Iteration: 9
New_cost: 121.93378796532261
cost difference: 29.669083951834608

Iteration: 10
New_cost: 102.6547080814477
cost difference: 19.279079883874914

Iteration: 11
New_cost: 90.12208037681452
cost difference: 12.532627704633171

Iteration: 12
New_cost: 81.97007889780986
cost difference: 8.152001479004667

Iteration: 13
New_cost: 76.662528

In [88]:
# Predict on the held-out test features with the trained theta.
y_pred = h(X_test, theta)

In [92]:
# Quick look: row 0 holds the actual targets, row 1 the predictions.
pd.DataFrame((y_test,y_pred))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,64.0,85.0,80.0,91.0,68.0,54.0,54.0,65.0,52.0,79.0,...,76.0,78.0,71.0,73.0,73.0,95.0,64.0,53.0,48.0,64.0
1,70.687762,89.177016,60.64183,99.983114,52.698085,61.945367,38.60329,78.900681,32.526111,69.281004,...,75.550546,67.245816,47.684557,60.085086,61.762543,87.644173,55.029727,42.048392,55.631782,55.808724


In [93]:
# Empty frame that reuses the test rows' index, so each result row can be
# traced back to its row in the source data.
result = pd.DataFrame(index=X_test.index)

In [95]:
# Place actual and predicted values side by side, one row per test sample.
result["Actual"]=y_test
result["Predict"] = y_pred

In [96]:
# Display the actual-vs-predicted table.
result

Unnamed: 0,Actual,Predict
90,64.0,70.687762
254,85.0,89.177016
283,80.0,60.641830
445,91.0,99.983114
461,68.0,52.698085
...,...,...
372,95.0,87.644173
56,64.0,55.029727
440,53.0,42.048392
60,48.0,55.631782


In [97]:
from sklearn.metrics import r2_score
# Coefficient of determination of the predictions on the test set.
print(f"r2_score: {r2_score(y_test,y_pred)}")

r2_score: 0.12416154847456573


In [98]:
from sklearn.metrics import mean_squared_error, mean_absolute_error,explained_variance_score
# Summary of test-set metrics (the r2 score is repeated here so all four
# numbers appear together).
print(f"r2 score: {r2_score(y_test,y_pred)}")
print(f"MSE: {mean_squared_error(y_test,y_pred)}")
print(f"MAE: {mean_absolute_error(y_test,y_pred)}")
print(f"EVS: {explained_variance_score(y_test,y_pred)}")

r2 score: 0.12416154847456573
MSE: 152.96299596278828
MAE: 10.54456288108087
EVS: 0.23938076039107625


In [99]:
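# EVS equals r2_score only when the mean of the residuals is zero; here they
# differ (EVS 0.239 vs r2 0.124), so the two are not the same metric.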