# Normalize

This script does very simple normalization of the CollegeData. It ultimately saves two CSV files:

* `collegedata_unnormalized.csv` This is the merge of the three tables without any normalization
* `collegedata_normalized.csv` The merge of the three tables with some normalization

In [1]:
import TIdatabase as ti
%matplotlib inline 
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from matplotlib import rcParams
pd.options.display.max_columns = 100

In [2]:
colleges = ti.College()

In [3]:
students = ti.Student()
students.read("scrape/collegedata/collegedata_students.csv")

In [4]:
applForm = ti.ApplForm()
applForm.read("scrape/collegedata/collegedata_applications.csv")

In [5]:
# Merge all three tables into a single dataframe
applications = pd.merge(students.df, applForm.df)
applications = pd.merge(applications, colleges.df)
applications.head(10)

Unnamed: 0,studentID,classrank,admissionstest,AP,averageAP,SATsubject,GPA,GPA_w,program,schooltype,intendedgradyear,addInfo,canAfford,female,MinorityGender,MinorityRace,international,firstinfamily,sports,artist,workexp,collegeID,earlyAppl,visited,alumni,outofstate,acceptStatus,acceptProb,name,acceptrate,size,public,finAidPct,instatePct
0,PWY05BUB4I,,2290,7,5.0,3,3.8,4.34,Biomedical engineering,-1,2017,Basketball outside of school violin cancer awa...,0,1,-1,-1,-1,,-1,,,Rice,-1,,-1,-1,1,,Rice,0.151,6621,-1,0,0
1,3UVDFVI9Z0,,2180,7,4.714286,2,3.94,4.47,Classics,-1,2013,Brain Bowl President and Captain 3 years Mu Al...,0,-1,-1,-1,-1,,-1,,,Rice,-1,,-1,1,1,,Rice,0.151,6621,-1,0,0
2,BCCBHJUP0M,,2180,0,,3,3.92,3.92,Biological Science,1,2016,Ballet 4 Years Bhangra 6 Years Volunteer WildC...,0,1,-1,1,-1,,-1,,,Rice,-1,,-1,1,-1,,Rice,0.151,6621,-1,0,0
3,WZFPWHSQMS,,2370,7,4.857143,4,3.86,4.17,Physics,1,2015,,0,-1,-1,-1,-1,,-1,,,Rice,-1,,-1,1,-1,,Rice,0.151,6621,-1,0,0
4,5W1JNQA7G0,,2200,1,4.0,2,3.95,,,1,2011,4 yrs Varsity Wrestling 2x Team Captain Academ...,0,-1,-1,-1,-1,,-1,,,Rice,1,,-1,1,1,,Rice,0.151,6621,-1,0,0
5,TWUKL79B6V,,2200,0,,2,4.0,,Political Science,1,2016,,0,1,-1,-1,1,,-1,,,Rice,-1,,-1,1,0,,Rice,0.151,6621,-1,0,0
6,1OJUGUL4LL,,2270,0,,2,3.9,4.4,Computer Science,-1,2017,,0,-1,-1,-1,-1,,-1,,,Rice,-1,,-1,1,-1,,Rice,0.151,6621,-1,0,0
7,NX2TARIB0P,,1700,3,3.666667,2,3.7,4.2,,-1,2016,,0,-1,-1,1,-1,,-1,,,Rice,-1,,-1,1,-1,,Rice,0.151,6621,-1,0,0
8,N4Y1IOID8K,,2140,7,4.0,0,3.94,3.94,Business,-1,2011,,0,-1,-1,-1,-1,,-1,,,Rice,-1,,-1,-1,-1,,Rice,0.151,6621,-1,0,0
9,911MU875UY,,2120,4,4.25,2,4.0,5.0,Computer Science,-1,2016,,0,-1,-1,-1,-1,,-1,,,Rice,1,,-1,1,1,,Rice,0.151,6621,-1,0,0


In [6]:
applications.to_csv("collegedata_unnormalized.csv")

In [7]:
# Now normalize specific columns
# Ref: http://stackoverflow.com/questions/28576540/how-can-i-normalize-the-data-in-a-range-of-columns-in-my-pandas-dataframe
print applications.columns

cols_to_norm = ['admissionstest', 'averageAP', 'SATsubject', 'GPA', 'GPA_w']
indicator_cols = ["schooltype","canAfford","female","MinorityGender","MinorityRace","international","firstinfamily","sports","artist","workexp","earlyAppl","visited","alumni","outofstate","acceptStatus","public"]
def clean_indicator(col):
    yvec=[]
    for i in range(len(col)):
        if col[i]==0:
            y=float("NaN")
        if col[i]==-1:
            y=0
        else:
            y=col[i]
        yvec.append(y)
    return yvec
def normalise(col):
    return (col-np.mean(col)) / np.std(col)
applications[cols_to_norm] = applications[cols_to_norm].apply(normalise)
applications[indicator_cols] = applications[indicator_cols].apply(clean_indicator)
print applications.shape
applications.head(10)

Index([u'studentID', u'classrank', u'admissionstest', u'AP', u'averageAP',
       u'SATsubject', u'GPA', u'GPA_w', u'program', u'schooltype',
       u'intendedgradyear', u'addInfo', u'canAfford', u'female',
       u'MinorityGender', u'MinorityRace', u'international', u'firstinfamily',
       u'sports', u'artist', u'workexp', u'collegeID', u'earlyAppl',
       u'visited', u'alumni', u'outofstate', u'acceptStatus', u'acceptProb',
       u'name', u'acceptrate', u'size', u'public', u'finAidPct',
       u'instatePct'],
      dtype='object')
(16062, 34)


Unnamed: 0,studentID,classrank,admissionstest,AP,averageAP,SATsubject,GPA,GPA_w,program,schooltype,intendedgradyear,addInfo,canAfford,female,MinorityGender,MinorityRace,international,firstinfamily,sports,artist,workexp,collegeID,earlyAppl,visited,alumni,outofstate,acceptStatus,acceptProb,name,acceptrate,size,public,finAidPct,instatePct
0,PWY05BUB4I,,0.926899,7,1.067339,0.324272,-0.18711,0.059947,Biomedical engineering,0,2017,Basketball outside of school violin cancer awa...,0,1,0,0,0,,0,,,Rice,0,,0,0,1,,Rice,0.151,6621,0,0,0
1,3UVDFVI9Z0,,0.293054,7,0.660575,-0.440777,0.493474,0.398944,Classics,0,2013,Brain Bowl President and Captain 3 years Mu Al...,0,0,0,0,0,,0,,,Rice,0,,0,1,1,,Rice,0.151,6621,0,0,0
2,BCCBHJUP0M,,0.293054,0,,0.324272,0.396247,-1.035273,Biological Science,1,2016,Ballet 4 Years Bhangra 6 Years Volunteer WildC...,0,1,0,1,0,,0,,,Rice,0,,0,1,0,,Rice,0.151,6621,0,0,0
3,WZFPWHSQMS,,1.387878,7,0.863957,1.08932,0.104569,-0.383356,Physics,1,2015,,0,0,0,0,0,,0,,,Rice,0,,0,1,0,,Rice,0.151,6621,0,0,0
4,5W1JNQA7G0,,0.408299,1,-0.356334,-0.440777,0.542087,,,1,2011,4 yrs Varsity Wrestling 2x Team Captain Academ...,0,0,0,0,0,,0,,,Rice,1,,0,1,1,,Rice,0.151,6621,0,0,0
5,TWUKL79B6V,,0.408299,0,,-0.440777,0.785152,,Political Science,1,2016,,0,1,0,0,1,,0,,,Rice,0,,0,1,0,,Rice,0.151,6621,0,0,0
6,1OJUGUL4LL,,0.811655,0,,-0.440777,0.299021,0.216407,Computer Science,0,2017,,0,0,0,0,0,,0,,,Rice,0,,0,1,0,,Rice,0.151,6621,0,0,0
7,NX2TARIB0P,,-2.472816,3,-0.830892,-0.440777,-0.673241,-0.305126,,0,2016,,0,0,0,1,0,,0,,,Rice,0,,0,1,0,,Rice,0.151,6621,0,0,0
8,N4Y1IOID8K,,0.062565,7,-0.356334,-1.970874,0.493474,-0.98312,Business,0,2011,,0,0,0,0,0,,0,,,Rice,0,,0,0,0,,Rice,0.151,6621,0,0,0
9,911MU875UY,,-0.05268,4,-0.000416,-0.440777,0.785152,1.781007,Computer Science,0,2016,,0,0,0,0,0,,0,,,Rice,1,,0,1,1,,Rice,0.151,6621,0,0,0


In [8]:
applications.to_csv("collegedata_normalized.csv")