# Data cleanup

##### Steps
1. Loading the data
2. One-Hot Encoding 
3. Scaling the data 
4. Splitting the data into Training and Testing
5. Split into Features and Targets

Implementing gradient descent to train a network on graduate school admissions data

### Loading the data

In [10]:
import numpy as np
import pandas as pd

In [11]:
# Read the admissions records from the CSV file.
admissions = pd.read_csv('binary.csv')
# Preview the first ten rows.
admissions.head(10)

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4
5,1,760,3.0,2
6,1,560,2.98,1
7,0,400,3.08,2
8,1,540,3.39,3
9,0,700,3.92,2


### One-Hot Encoding

We need to use dummy variables to encode rank, splitting the data into four new columns encoded with ones or zeros.

In [12]:
# Make dummy variables for rank: one rank_<value> indicator column per rank.
# pandas >= 2.0 returns boolean dummies by default; casting to int keeps the
# 0/1 encoding on every pandas version so the columns stay numeric.
rank_dummies = pd.get_dummies(admissions['rank'], prefix='rank').astype(int)

# Append the indicator columns to the original frame, side by side.
data = pd.concat([admissions, rank_dummies], axis=1)
data[:10]

Unnamed: 0,admit,gre,gpa,rank,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,3,0,0,1,0
1,1,660,3.67,3,0,0,1,0
2,1,800,4.0,1,1,0,0,0
3,1,640,3.19,4,0,0,0,1
4,0,520,2.93,4,0,0,0,1
5,1,760,3.0,2,0,1,0,0
6,1,560,2.98,1,1,0,0,0
7,0,400,3.08,2,0,1,0,0
8,1,540,3.39,3,0,0,1,0
9,0,700,3.92,2,0,1,0,0


In [13]:
# The rank_* indicator columns now encode rank, so remove the original column.
data = data.drop(['rank'], axis='columns')
data.head(10)

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,380,3.61,0,0,1,0
1,1,660,3.67,0,0,1,0
2,1,800,4.0,1,0,0,0
3,1,640,3.19,0,0,0,1
4,0,520,2.93,0,0,0,1
5,1,760,3.0,0,1,0,0
6,1,560,2.98,1,0,0,0
7,0,400,3.08,0,1,0,0
8,1,540,3.39,0,0,1,0
9,0,700,3.92,0,1,0,0


### Scaling the data: Z-score

In [14]:
# Standardize features: rescale each continuous column to zero mean and unit
# standard deviation (Series.std() uses the sample standard deviation).
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    # Replace the whole column instead of writing through .loc[:, field]:
    # recent pandas sets .loc[:, col] in place and warns (later raises) when
    # float z-scores are written into an integer column such as gre.
    data[field] = (data[field] - mean) / std          # to get Z-score
data[:10]

Unnamed: 0,admit,gre,gpa,rank_1,rank_2,rank_3,rank_4
0,0,-1.798011,0.578348,0,0,1,0
1,1,0.625884,0.736008,0,0,1,0
2,1,1.837832,1.603135,1,0,0,0
3,1,0.452749,-0.525269,0,0,0,1
4,0,-0.586063,-1.208461,0,0,0,1
5,1,1.491561,-1.024525,0,1,0,0
6,1,-0.239793,-1.077078,1,0,0,0
7,0,-1.624876,-0.814312,0,1,0,0
8,1,-0.412928,0.000263,0,0,1,0
9,0,0.972155,1.392922,0,1,0,0


### Splitting the data into Training and Testing

In [15]:
# Split off random 10% of the data for testing
np.random.seed(42)  # fixed seed so the same rows are drawn on every run

# Draw 90% of the index labels, without replacement, for the training set.
sample = np.random.choice(data.index,
                          size=int(len(data)*0.9),
                          replace=False)

# `sample` holds index labels, so select rows with label-based .loc
# (DataFrame.ix was removed in pandas 1.0). The rows not drawn form the test set.
# NOTE: `data` is rebound to the training subset here, so re-running this cell
# without re-running the cells above shrinks the training set again.
data, test_data = data.loc[sample], data.drop(sample)     # train data, test data

In [16]:
# Report how many rows ended up in each split.
n_train, n_test = len(data), len(test_data)
print("Number of training samples is", n_train)
print("Number of testing samples is", n_test)

Number of training samples is 360
Number of testing samples is 40


In [17]:
# Show the first ten rows of each split as plain text.
train_preview = data.head(10)
test_preview = test_data.head(10)
print("train_data: ", "\n", train_preview, "\n")
print("test_data: ", "\n", test_preview)

train_data:  
      admit       gre       gpa  rank_1  rank_2  rank_3  rank_4
209      0 -0.066657  0.289305       0       1       0       0
280      0  0.625884  1.445476       0       1       0       0
33       1  1.837832  1.603135       0       0       1       0
210      0  1.318426 -0.131120       0       0       0       1
93       0 -0.066657 -1.208461       0       1       0       0
84       1 -0.759199  0.552071       0       0       1       0
329      0 -0.759199 -1.208461       0       0       0       1
94       1  0.625884  0.131646       0       1       0       0
266      0 -0.239793 -0.393886       0       0       0       1
126      1  0.106478  0.394412       1       0       0       0 

test_data:  
      admit       gre       gpa  rank_1  rank_2  rank_3  rank_4
20       0 -0.759199 -0.577822       0       0       1       0
21       1  0.625884  0.630901       0       1       0       0
48       0 -1.278605 -2.390908       0       0       0       1
50       0  0.452749  1.

### Split into features and targets

In [18]:
# Training split: every column except `admit` is an input, `admit` is the label.
features = data.drop(['admit'], axis='columns')
targets = data['admit']

# Test split: same separation of inputs and label.
features_test = test_data.drop(['admit'], axis='columns')
targets_test = test_data['admit']

In [19]:
# Print the first ten training inputs followed by their labels.
print(features.head(10))
print(targets.head(10))

          gre       gpa  rank_1  rank_2  rank_3  rank_4
209 -0.066657  0.289305       0       1       0       0
280  0.625884  1.445476       0       1       0       0
33   1.837832  1.603135       0       0       1       0
210  1.318426 -0.131120       0       0       0       1
93  -0.066657 -1.208461       0       1       0       0
84  -0.759199  0.552071       0       0       1       0
329 -0.759199 -1.208461       0       0       0       1
94   0.625884  0.131646       0       1       0       0
266 -0.239793 -0.393886       0       0       0       1
126  0.106478  0.394412       1       0       0       0
209    0
280    0
33     1
210    0
93     0
84     1
329    0
94     1
266    0
126    1
Name: admit, dtype: int64
