# Data cleanup

Implementing gradient descent to train a network on graduate school admissions data

### Loading the data

In [9]:
import numpy as np
import pandas as pd

In [10]:
# Read the raw admissions table from the CSV that sits next to the notebook,
# then preview every row from position 10 onward.
admissions = pd.read_csv(filepath_or_buffer='binary.csv')
admissions.iloc[10:]

Unnamed: 0,admit,gre,gpa,rank
10,0,800,4.00,4
11,0,440,3.22,1
12,1,760,4.00,1
13,0,700,3.08,2
14,1,700,4.00,1
...,...,...,...,...
395,0,620,4.00,2
396,0,560,3.04,3
397,0,460,2.63,2
398,0,700,3.65,2


### One-Hot Encoding

We need to use dummy variables to encode rank, splitting the data into four new columns encoded with ones or zeros.

In [15]:
# One-hot encode the categorical `rank` column: one `rank_<value>` indicator
# column per distinct rank, appended to the right of the original columns.
# dtype=int is requested explicitly because pandas >= 2.0 returns boolean
# indicator columns by default; integer 0/1 keeps the encoding numeric on
# every pandas version.
rank_dummies = pd.get_dummies(admissions['rank'], prefix='rank', dtype=int)
data = pd.concat([admissions, rank_dummies], axis=1)
data[10:]

Unnamed: 0,admit,gre,gpa,rank,rank_1,rank_2,rank_3,rank_4
10,0,800,4.00,4,0,0,0,1
11,0,440,3.22,1,1,0,0,0
12,1,760,4.00,1,1,0,0,0
13,0,700,3.08,2,0,1,0,0
14,1,700,4.00,1,1,0,0,0
...,...,...,...,...,...,...,...,...
395,0,620,4.00,2,0,1,0,0
396,0,560,3.04,3,0,0,1,0
397,0,460,2.63,2,0,1,0,0
398,0,700,3.65,2,0,1,0,0


In [16]:
# The original categorical column is redundant once the rank_* indicator
# columns exist, so remove it and preview the rows from position 10 onward.
data = data.drop(columns=['rank'])
data.iloc[10:]

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
10,0,800,4.00,0,0,0,1
11,0,440,3.22,1,0,0,0
12,1,760,4.00,1,0,0,0
13,0,700,3.08,0,1,0,0
14,1,700,4.00,1,0,0,0
...,...,...,...,...,...,...,...
395,0,620,4.00,0,1,0,0
396,0,560,3.04,0,0,1,0
397,0,460,2.63,0,1,0,0
398,0,700,3.65,0,1,0,0


### Scaling the data: Z-score

In [18]:
# Standardize the continuous features to z-scores: (value - mean) / std.
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    # Replace the whole column instead of writing through `.loc[:, field]`.
    # An in-place write of floats into an integer column (`gre` is displayed
    # as integers in the raw table) is deprecated silent upcasting on
    # pandas >= 2.1; plain column assignment swaps in a new float column.
    data[field] = (data[field] - mean) / std          # to get Z-score
data[:10]

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,-1.798011,0.578348,0,0,1,0
1,1,0.625884,0.736008,0,0,1,0
2,1,1.837832,1.603135,1,0,0,0
3,1,0.452749,-0.525269,0,0,0,1
4,0,-0.586063,-1.208461,0,0,0,1
5,1,1.491561,-1.024525,0,1,0,0
6,1,-0.239793,-1.077078,1,0,0,0
7,0,-1.624876,-0.814312,0,1,0,0
8,1,-0.412928,0.000263,0,0,1,0
9,0,0.972155,1.392922,0,1,0,0


### Splitting the data into Training and Testing sets

In [25]:
# Split off a random 10% of the data for testing.
np.random.seed(42)  # fixed seed so the same rows are drawn on every clean run

# Draw 90% of the index labels without replacement; these become the training rows.
sample = np.random.choice(data.index,
                          size=int(len(data)*0.9),
                          replace=False)

# `sample` holds index labels, so select with label-based `.loc`
# (`DataFrame.ix` was removed in pandas 1.0). The rows not drawn form the test set.
# NOTE: `data` is rebound to the training subset here, so re-running this cell
# splits the already-reduced frame again. Restart the kernel and run all cells
# to get a clean 90/10 split of the full table.
data, test_data = data.loc[sample], data.drop(sample)     # train data, test data

In [26]:
# Report how many rows ended up in each partition.
n_train, n_test = len(data), len(test_data)
print("Number of training samples is", n_train)
print("Number of testing samples is", n_test)

Number of training samples is 291
Number of testing samples is 33


In [36]:
# Show the first ten rows of each partition as plain text.
train_preview = data.head(10)
test_preview = test_data.head(10)
print("train_data: ", "\n", train_preview, "\n")
print("test_data: ", "\n", test_preview)

train_data:  
      admit       gre       gpa  rank_1  rank_2  rank_3  rank_4
329      0 -0.759199 -1.208461       0       0       0       1
326      0  0.799020 -0.209950       0       1       0       0
66       0  1.318426  0.604625       0       0       0       1
24       1  1.491561 -0.104844       0       1       0       0
205      1  1.664697  1.077603       0       0       1       0
105      1  1.318426 -1.103354       0       1       0       0
221      0  0.799020  1.576859       0       0       1       0
60       1  0.279614 -0.551546       0       1       0       0
183      1 -0.412928  1.182710       0       1       0       0
340      0 -0.759199 -0.420163       0       0       0       1 

test_data:  
      admit       gre       gpa  rank_1  rank_2  rank_3  rank_4
36       0 -0.066657 -0.367610       1       0       0       0
253      1 -0.412928  0.420688       0       0       0       1
382      0 -0.412928  0.315582       0       1       0       0
213      0  0.452749 -0.

### Split into features and targets

In [37]:
# Separate the label column (`admit`) from the input columns, for both the
# training partition and the held-out test partition.
features = data.drop(columns='admit')
targets = data['admit']

features_test = test_data.drop(columns='admit')
targets_test = test_data['admit']

In [40]:
# Print the first ten training inputs followed by their matching labels.
for preview in (features.head(10), targets.head(10)):
    print(preview)

          gre       gpa  rank_1  rank_2  rank_3  rank_4
329 -0.759199 -1.208461       0       0       0       1
326  0.799020 -0.209950       0       1       0       0
66   1.318426  0.604625       0       0       0       1
24   1.491561 -0.104844       0       1       0       0
205  1.664697  1.077603       0       0       1       0
105  1.318426 -1.103354       0       1       0       0
221  0.799020  1.576859       0       0       1       0
60   0.279614 -0.551546       0       1       0       0
183 -0.412928  1.182710       0       1       0       0
340 -0.759199 -0.420163       0       0       0       1
329    0
326    0
66     0
24     1
205    1
105    1
221    0
60     1
183    1
340    0
Name: admit, dtype: int64
