# CODING TASK #1: UNDERSTAND THE PROBLEM STATEMENT AND BUSINESS CASE

- The objective of this project is to build, train, test, and deploy a machine learning model that predicts a student's chance of admission to a particular university, given the student's profile.
- University admission departments can use this model to identify the top-qualifying applicants.
- INPUTS (FEATURES):
    - GRE Scores (out of 340)
    - TOEFL Scores (out of 120)
    - University Rating (out of 5)
    - Statement of Purpose (SOP) Strength (out of 5)
    - Letter of Recommendation (LOR) Strength (out of 5)
    - Undergraduate GPA (out of 10)
    - Research Experience (either 0 or 1)

- OUTPUTS:
    - Chance of admission (ranging from 0 to 1)

In [4]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px # Interactive Data Visualization



In [5]:
# Load the university admission dataset from its CSV file into a DataFrame.
university_df = pd.read_csv(filepath_or_buffer="university_admission.csv")

In [6]:
# Preview the first 6 rows of the dataset.
university_df.head(n=6)

Unnamed: 0,GRE_Score,TOEFL_Score,University_Rating,SOP,LOR,CGPA,Research,Chance_of_Admission
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65
5,330,115,5,4.5,3.0,9.34,1,0.9


In [7]:
# List the column labels to confirm the feature and target names.
university_df.columns

Index(['GRE_Score', 'TOEFL_Score', 'University_Rating', 'SOP', 'LOR', 'CGPA',
       'Research', 'Chance_of_Admission'],
      dtype='object')

In [8]:
# Feature matrix: every column except the prediction target.
X = university_df.drop(labels=['Chance_of_Admission'], axis='columns')

In [9]:
# Target vector: the admission probability column.
y = university_df.loc[:, 'Chance_of_Admission']

In [10]:
# Check the dimensions of the feature matrix as (rows, columns).
X.shape

(1000, 7)

In [11]:
# Check the length of the target vector; it should match the row count of X.
y.shape

(1000,)

In [12]:
# Convert the pandas objects into NumPy arrays for the splitting and
# column-wise export steps that follow.
X, y = np.array(X), np.array(y)

In [13]:
# Reshape the target from a 1-D array of shape (n_samples,) into a column
# vector of shape (n_samples, 1).
y = y.reshape((-1, 1))
y.shape

(1000, 1)

In [14]:
# Split the data into training (60%), testing (20%) and validation (20%) sets.
from sklearn.model_selection import train_test_split

# Fixed seed so the split -- and therefore the exported CSV files -- is
# reproducible across notebook runs.
SPLIT_SEED = 42

# First hold out 40% of the rows, then halve that hold-out set into equally
# sized test and validation sets.
# NOTE(review): the recorded validation output contains identical rows, so the
# dataset may hold duplicates that leak across splits -- TODO confirm and
# de-duplicate before splitting if so.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=SPLIT_SEED
)
X_test, X_validation, y_test, y_validation = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED
)


In [15]:
# Check the size of the training split as (rows, features).
X_train.shape

(600, 7)

In [16]:
# Check the size of the test split as (rows, features).
X_test.shape

(200, 7)

In [17]:
# Check the size of the validation split as (rows, features).
X_validation.shape

(200, 7)

In [18]:
# Start the training frame with the target values as its only column.
train_data = pd.DataFrame(data={'Target': y_train[:, 0]})
train_data

Unnamed: 0,Target
0,0.89
1,0.80
2,0.42
3,0.68
4,0.63
...,...
595,0.82
596,0.76
597,0.71
598,0.63


In [19]:
# The SageMaker built-in algorithm expects the target in the first column with
# the features after it, so build the frame as Target first and then append one
# integer-labelled column per feature.
train_data = pd.DataFrame({'Target': y_train[:, 0]})
for col_idx, feature_values in enumerate(X_train.T):
    train_data[col_idx] = feature_values

train_data

Unnamed: 0,Target,0,1,2,3,4,5,6
0,0.89,334.0,117.0,5.0,4.0,4.5,9.07,1.0
1,0.80,324.0,112.0,4.0,4.0,3.5,8.77,1.0
2,0.42,299.0,94.0,1.0,1.0,1.0,7.34,0.0
3,0.68,319.0,102.0,3.0,2.5,2.5,8.37,0.0
4,0.63,300.0,102.0,3.0,3.5,2.5,8.17,0.0
...,...,...,...,...,...,...,...,...
595,0.82,324.0,111.0,4.0,3.0,3.0,9.01,1.0
596,0.76,309.0,99.0,3.0,4.0,4.0,8.56,0.0
597,0.71,318.0,103.0,3.0,4.0,4.5,8.49,1.0
598,0.63,318.0,111.0,3.0,4.0,3.0,8.80,0.0


In [20]:
# Start the validation frame with the target values as its only column.
val_data = pd.DataFrame(data={'Target': y_validation[:, 0]})
val_data

Unnamed: 0,Target
0,0.79
1,0.86
2,0.96
3,0.70
4,0.96
...,...
195,0.54
196,0.85
197,0.78
198,0.74


In [21]:
# Build the validation frame with the target first, followed by one
# integer-labelled column per feature.
val_data = pd.DataFrame({'Target': y_validation[:, 0]})
for col_idx, feature_values in enumerate(X_validation.T):
    val_data[col_idx] = feature_values

val_data

Unnamed: 0,Target,0,1,2,3,4,5,6
0,0.79,325.0,110.0,4.0,4.5,4.0,8.96,1.0
1,0.86,322.0,110.0,4.0,4.0,5.0,9.13,1.0
2,0.96,333.0,119.0,5.0,5.0,4.5,9.78,1.0
3,0.70,308.0,110.0,4.0,3.5,3.0,8.60,0.0
4,0.96,333.0,119.0,5.0,5.0,4.5,9.78,1.0
...,...,...,...,...,...,...,...,...
195,0.54,299.0,96.0,2.0,1.5,2.0,7.86,0.0
196,0.85,322.0,110.0,3.0,4.0,5.0,8.64,1.0
197,0.78,322.0,104.0,3.0,3.5,4.0,8.84,1.0
198,0.74,319.0,106.0,3.0,3.5,2.5,8.33,1.0


In [22]:
# Build the test frame with the target first, followed by one
# integer-labelled column per feature.
test_data = pd.DataFrame({'Target': y_test[:, 0]})
for col_idx, feature_values in enumerate(X_test.T):
    test_data[col_idx] = feature_values

test_data

Unnamed: 0,Target,0,1,2,3,4,5,6
0,0.42,304.0,100.0,4.0,1.5,2.5,7.84,0.0
1,0.67,304.0,102.0,2.0,3.0,4.0,8.73,0.0
2,0.82,324.0,110.0,4.0,4.5,4.0,9.15,1.0
3,0.65,300.0,97.0,2.0,3.0,3.0,8.10,1.0
4,0.64,314.0,102.0,2.0,2.0,2.5,8.24,0.0
...,...,...,...,...,...,...,...,...
195,0.71,310.0,105.0,2.0,3.0,3.5,8.01,0.0
196,0.68,301.0,104.0,3.0,3.5,4.0,8.12,1.0
197,0.45,323.0,108.0,3.0,3.5,3.0,8.60,0.0
198,0.69,295.0,101.0,2.0,2.5,2.0,7.86,0.0


In [23]:
# Write the training, validation and test sets to CSV files, omitting both the
# header row and the index column.
for csv_name, frame in (
    ('train.csv', train_data),
    ('validation.csv', val_data),
    ('test.csv', test_data),
):
    frame.to_csv(csv_name, header=False, index=False)
