# CODING TASK #1: UNDERSTAND THE PROBLEM STATEMENT AND BUSINESS CASE

- The objective of this project is to build, train, test and deploy a machine learning model that predicts a student's chance of admission into a particular university, given the student's profile.
- This project can be effectively used by university admission departments to determine top qualifying students. 
- INPUTS (FEATURES):
    - GRE Scores (out of 340)
    - TOEFL Scores (out of 120)
    - University Rating (out of 5)
    - Statement of Purpose (SOP) Strength (out of 5)
    - Letter of Recommendation (LOR) Strength (out of 5)
    - Undergraduate GPA (out of 10)
    - Research Experience (either 0 or 1)

- OUTPUTS:
    - Chance of admission (ranging from 0 to 1)

In [15]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px # Interactive Data Visualization

In [16]:
# Read the admissions dataset into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
university_df = pd.read_csv("university_admission.csv")

In [17]:
# Display the first 6 rows to sanity-check the loaded data
university_df.head(6)

Unnamed: 0,GRE_Score,TOEFL_Score,University_Rating,SOP,LOR,CGPA,Research,Chance_of_Admission
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65
5,330,115,5,4.5,3.0,9.34,1,0.9


In [18]:
# List the column names (seven feature columns plus the 'Chance_of_Admission' target)
university_df.columns

Index(['GRE_Score', 'TOEFL_Score', 'University_Rating', 'SOP', 'LOR', 'CGPA',
       'Research', 'Chance_of_Admission'],
      dtype='object')

In [78]:
# Feature matrix: every column except the prediction target
X = university_df.drop('Chance_of_Admission', axis='columns')

In [79]:
# Target vector: the admission probability column
y = university_df.loc[:, 'Chance_of_Admission']

In [80]:
# Check the feature matrix dimensions (rows, feature columns)
X.shape

(1000, 7)

In [81]:
# Check the target dimensions; it should have one entry per row of X
y.shape

(1000,)

In [82]:
# Convert the pandas features and target into NumPy arrays
X, y = np.array(X), np.array(y)

In [83]:
# Turn the 1-D target of shape (n,) into a column vector of shape (n, 1)
y = np.reshape(y, (-1, 1))
y.shape

(1000, 1)

In [84]:
# Split the data into training (70%), test (15%) and validation (15%) sets.
# A fixed random_state makes both splits reproducible: without it, every run of
# this cell yields a different partition and therefore different CSV files.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
# Divide the 30% hold-out evenly between the test and validation sets.
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)


In [85]:
# Check the size of the training split
X_train.shape

(700, 7)

In [86]:
# Check the size of the test split
X_test.shape

(150, 7)

In [87]:
# Check the size of the validation split
X_validation.shape

(150, 7)

In [88]:
# Start the training frame with the target values as its only column, 'Target'
train_data = pd.DataFrame(y_train[:, 0], columns=['Target'])
train_data

Unnamed: 0,Target
0,0.76
1,0.57
2,0.71
3,0.79
4,0.86
...,...
695,0.96
696,0.68
697,0.67
698,0.66


In [89]:
# The SageMaker built-in algorithm expects the target in the first column with the
# feature columns after it, so build the frame from the features (columns 0..n-1)
# and then prepend the target as 'Target'.
train_data = pd.DataFrame(X_train)
train_data.insert(0, 'Target', y_train[:, 0])

train_data

Unnamed: 0,Target,0,1,2,3,4,5,6
0,0.76,322.0,112.0,3.0,3.0,4.0,8.62,1.0
1,0.57,304.0,107.0,3.0,3.5,3.0,7.86,0.0
2,0.71,310.0,105.0,2.0,3.0,3.5,8.01,0.0
3,0.79,332.0,112.0,1.0,1.5,3.0,8.66,1.0
4,0.86,322.0,110.0,4.0,4.0,5.0,9.13,1.0
...,...,...,...,...,...,...,...,...
695,0.96,335.0,117.0,5.0,5.0,5.0,9.82,1.0
696,0.68,308.0,101.0,2.0,3.0,4.0,7.90,0.0
697,0.67,308.0,105.0,4.0,3.0,2.5,7.95,1.0
698,0.66,317.0,107.0,3.0,4.0,3.0,8.70,0.0


In [91]:
# Start the validation frame with the target values as its only column, 'Target'
val_data = pd.DataFrame(y_validation[:, 0], columns=['Target'])
val_data

Unnamed: 0,Target
0,0.86
1,0.91
2,0.72
3,0.65
4,0.86
...,...
145,0.88
146,0.57
147,0.89
148,0.71


In [94]:
# Validation set laid out as target first ('Target'), then feature columns 0..n-1
val_data = pd.DataFrame(X_validation)
val_data.insert(0, 'Target', y_validation[:, 0])

val_data

Unnamed: 0,Target,0,1,2,3,4,5,6
0,0.86,329.0,114.0,5.0,4.5,5.0,9.19,1.0
1,0.91,326.0,113.0,5.0,4.5,4.0,9.40,1.0
2,0.72,310.0,103.0,2.0,2.5,2.5,8.24,0.0
3,0.65,322.0,105.0,2.0,3.0,3.0,8.45,1.0
4,0.86,329.0,114.0,5.0,4.0,5.0,9.30,1.0
...,...,...,...,...,...,...,...,...
145,0.88,324.0,112.0,5.0,5.0,5.0,9.08,1.0
146,0.57,295.0,99.0,2.0,2.5,3.0,7.65,0.0
147,0.89,329.0,113.0,4.0,4.0,3.5,9.36,1.0
148,0.71,318.0,106.0,3.0,2.0,3.0,8.65,0.0


In [95]:
# Test set laid out as target first ('Target'), then feature columns 0..n-1
test_data = pd.DataFrame(X_test)
test_data.insert(0, 'Target', y_test[:, 0])

test_data

Unnamed: 0,Target,0,1,2,3,4,5,6
0,0.71,300.0,104.0,3.0,3.5,3.0,8.16,0.0
1,0.52,325.0,111.0,3.0,3.0,3.5,8.70,0.0
2,0.56,303.0,98.0,1.0,2.0,2.5,7.65,0.0
3,0.65,317.0,103.0,2.0,2.5,2.0,8.15,0.0
4,0.71,310.0,105.0,2.0,3.0,3.5,8.01,0.0
...,...,...,...,...,...,...,...,...
145,0.71,314.0,99.0,4.0,3.5,4.5,8.73,1.0
146,0.86,331.0,120.0,3.0,4.0,4.0,8.96,1.0
147,0.74,320.0,104.0,3.0,3.5,4.5,8.34,1.0
148,0.79,307.0,110.0,4.0,4.0,4.5,8.37,0.0


In [97]:
# Write the training, validation and test frames to CSV files,
# omitting both the header row and the index column.
for csv_name, frame in (
    ('train.csv', train_data),
    ('validation.csv', val_data),
    ('test.csv', test_data),
):
    frame.to_csv(csv_name, header=False, index=False)
