# Normalize

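This script performs a very simple normalization of the data from CollegeData. Along the way it also writes `normalize_means.csv` and `normalize_stds.csv`, which record the per-column means and standard deviations of the columns that get normalized. It ultimately saves two CSV data files: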

* `collegedata_unnormalized.csv`: the merge of the three tables without any normalization.
* `collegedata_normalized.csv`: the merge of the three tables with the continuous columns standardized and the indicator columns recoded.

In [1]:
import TIdatabase as ti
%matplotlib inline 
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm
import seaborn as sns
# Seaborn plot styling: "whitegrid" theme with the "poster" sizing context.
sns.set_style("whitegrid")
sns.set_context("poster")
from matplotlib import rcParams
# Allow pandas to display up to 100 columns so the wide merged frame is not truncated.
pd.options.display.max_columns = 100

In [2]:
#see database.ipynb for more information
# College table wrapper from TIdatabase; its `.df` attribute is merged into the
# applications frame further down.
# NOTE(review): unlike students/applForm there is no read() call here --
# presumably College() loads its own data; confirm in TIdatabase.
colleges = ti.College()

In [3]:
# Student table wrapper from TIdatabase, pointed at the scraped CollegeData students CSV.
# NOTE(review): read() presumably populates `students.df`, which is merged later -- confirm.
students = ti.Student()
students.read("scrape/collegedata/collegedata_students.csv")

In [4]:
# Application-form table wrapper from TIdatabase, pointed at the scraped applications CSV.
# NOTE(review): read() presumably populates `applForm.df`, which is merged later -- confirm.
applForm = ti.ApplForm()
applForm.read("scrape/collegedata/collegedata_applications.csv")

In [5]:
# Build one wide dataframe: students joined to their applications, then to
# the colleges applied to. No join keys are given, so pd.merge joins on the
# columns the two frames have in common.
student_applications = pd.merge(students.df, applForm.df)
applications = pd.merge(student_applications, colleges.df)
applications.head(10)

Unnamed: 0,studentID,classrank,admissionstest,AP,averageAP,SATsubject,GPA,GPA_w,program,schooltype,intendedgradyear,addInfo,canAfford,female,MinorityGender,MinorityRace,international,firstinfamily,sports,artist,workexp,collegeID,earlyAppl,visited,alumni,outofstate,acceptStatus,acceptProb,name,acceptrate,size,public,finAidPct,instatePct
0,S50C3UECT8,,2290,7,5.0,3,3.8,4.34,Biomedical engineering,-1,2017,Basketball outside of school violin cancer awa...,0,1,-1,-1,-1,,-1,,,Rice,-1,,-1,-1,1,,Rice,0.151,6621,-1,0,0
1,GBWZQQRBEV,,2260,7,4.714286,2,3.94,4.47,Classics,-1,2013,Brain Bowl President and Captain 3 years Mu Al...,0,-1,-1,-1,-1,,-1,,,Rice,-1,,-1,1,1,,Rice,0.151,6621,-1,0,0
2,MXXLWO1HQ2,,2180,0,,3,3.92,3.92,Biological Science,1,2016,Ballet 4 Years Bhangra 6 Years Volunteer WildC...,0,1,-1,1,-1,,-1,,,Rice,-1,,-1,1,-1,,Rice,0.151,6621,-1,0,0
3,5KSL7C8SLZ,,2370,7,4.857143,4,3.86,4.17,Physics,1,2015,,0,-1,-1,-1,-1,,-1,,,Rice,-1,,-1,1,-1,,Rice,0.151,6621,-1,0,0
4,RQWLNGGZ49,,2200,1,4.0,2,3.95,,,1,2011,4 yrs Varsity Wrestling 2x Team Captain Academ...,0,-1,-1,-1,-1,,-1,,,Rice,1,,-1,1,1,,Rice,0.151,6621,-1,0,0
5,A7WDLWR2VM,,2200,0,,2,4.0,,Political Science,1,2016,,0,1,-1,-1,1,,-1,,,Rice,-1,,-1,1,0,,Rice,0.151,6621,-1,0,0
6,CECT9K8GDY,,2270,0,,2,3.9,4.4,Computer Science,-1,2017,,0,-1,-1,-1,-1,,-1,,,Rice,-1,,-1,1,-1,,Rice,0.151,6621,-1,0,0
7,9PM1B51CXG,,1700,3,3.666667,2,3.7,4.2,,-1,2016,,0,-1,-1,1,-1,,-1,,,Rice,-1,,-1,1,-1,,Rice,0.151,6621,-1,0,0
8,G14LB2SV7O,,2140,7,4.0,0,3.94,3.94,Business,-1,2011,,0,-1,-1,-1,-1,,-1,,,Rice,-1,,-1,-1,-1,,Rice,0.151,6621,-1,0,0
9,PSBRN09QGH,,2120,4,4.25,2,4.0,5.0,Computer Science,-1,2016,,0,-1,-1,-1,-1,,-1,,,Rice,1,,-1,1,1,,Rice,0.151,6621,-1,0,0


In [6]:
# Persist the raw merged table before any normalization is applied.
# to_csv is called with its defaults, so the row index is written as an
# unnamed first column.
applications.to_csv("collegedata_unnormalized.csv")

In [7]:
# Show which columns the merged frame exposes.
print(applications.columns)

# Continuous columns that are standardized later on.
cols_to_norm = ['admissionstest', 'averageAP', 'GPA', 'GPA_w']
# Flag-style columns that are recoded by clean_indicator later on.
indicator_cols = [
    "schooltype", "canAfford", "female", "MinorityGender", "MinorityRace",
    "international", "firstinfamily", "sports", "artist", "workexp",
    "earlyAppl", "visited", "alumni", "outofstate", "acceptStatus", "public",
]

# Record each continuous column's mean and standard deviation, keyed by
# column name, so they can be saved alongside the normalized data.
meandict, stddict = {}, {}
for col in cols_to_norm:
    column = applications[col]
    meandict[col] = np.mean(column)
    stddict[col] = np.std(column)

print(meandict)
print(stddict)

Index([u'studentID', u'classrank', u'admissionstest', u'AP', u'averageAP',
       u'SATsubject', u'GPA', u'GPA_w', u'program', u'schooltype',
       u'intendedgradyear', u'addInfo', u'canAfford', u'female',
       u'MinorityGender', u'MinorityRace', u'international', u'firstinfamily',
       u'sports', u'artist', u'workexp', u'collegeID', u'earlyAppl',
       u'visited', u'alumni', u'outofstate', u'acceptStatus', u'acceptProb',
       u'name', u'acceptrate', u'size', u'public', u'finAidPct',
       u'instatePct'],
      dtype='object')
{'averageAP': 4.249003971472805, 'admissionstest': 2143.606872126258, 'GPA': 3.838660991674003, 'GPA_w': 4.317342819580864}
{'averageAP': 0.7032275578051195, 'admissionstest': 174.5042600523976, 'GPA': 0.20553017343332664, 'GPA_w': 0.38354906227331065}


In [8]:
#http://stackoverflow.com/questions/10373247/how-do-i-write-a-python-dictionary-to-a-csv-file
import csv

# Each statistics dict becomes a two-line CSV: a header of column names
# followed by a single row holding the corresponding values.
for path, stats_by_col in (("normalize_means.csv", meandict),
                           ("normalize_stds.csv", stddict)):
    with open(path, "wb") as f:
        writer = csv.DictWriter(f, stats_by_col.keys())
        writer.writeheader()
        writer.writerow(stats_by_col)

In [9]:
# Now normalize specific columns
# Ref: http://stackoverflow.com/questions/28576540/how-can-i-normalize-the-data-in-a-range-of-columns-in-my-pandas-dataframe

def clean_indicator(col):
    """Recode one indicator column and return the result as a list.

    Mapping applied to every value, in order:

    * ``0``  -> ``NaN``
    * ``-1`` -> ``0``
    * anything else (including an existing ``NaN``) is kept unchanged

    NOTE(review): presumably 0 encodes "unknown" and -1 encodes "no" in the
    scraped data -- confirm against the scraper.

    The values are iterated directly rather than looked up as ``col[i]``.
    On a pandas Series ``col[i]`` is a label lookup, which raises KeyError
    (or reads the wrong row) whenever the index is not exactly 0..n-1,
    e.g. after filtering rows.

    Parameters
    ----------
    col : iterable of numbers (list, array, pandas Series, ...)

    Returns
    -------
    list
        The recoded values, in the same order as ``col``.
    """
    cleaned = []
    for value in col:
        if value == 0:
            cleaned.append(np.nan)
        elif value == -1:
            cleaned.append(0)
        else:
            cleaned.append(value)
    return cleaned
def normalise(col):
    """Return ``col`` standardized: its mean subtracted, then divided by its standard deviation.

    Both statistics are computed with numpy (``np.mean`` / ``np.std``) over
    the column itself.
    """
    centre = np.mean(col)
    spread = np.std(col)
    centred = col - centre
    return centred / spread
# Standardize the continuous columns first, then recode the indicator
# columns; both results are written back into the same dataframe.
normalised_values = applications[cols_to_norm].apply(normalise)
applications[cols_to_norm] = normalised_values
recoded_flags = applications[indicator_cols].apply(clean_indicator)
applications[indicator_cols] = recoded_flags
print(applications.shape)
applications.head(10)

(16094, 34)


Unnamed: 0,studentID,classrank,admissionstest,AP,averageAP,SATsubject,GPA,GPA_w,program,schooltype,intendedgradyear,addInfo,canAfford,female,MinorityGender,MinorityRace,international,firstinfamily,sports,artist,workexp,collegeID,earlyAppl,visited,alumni,outofstate,acceptStatus,acceptProb,name,acceptrate,size,public,finAidPct,instatePct
0,S50C3UECT8,,0.838909,7,1.067927,3,-0.188104,0.059072,Biomedical engineering,0,2017,Basketball outside of school violin cancer awa...,,1,0,0,0,,0,,,Rice,0,,0,0,1.0,,Rice,0.151,6621,0,0,0
1,GBWZQQRBEV,,0.666993,7,0.661638,2,0.493061,0.398012,Classics,0,2013,Brain Bowl President and Captain 3 years Mu Al...,,0,0,0,0,,0,,,Rice,0,,0,1,1.0,,Rice,0.151,6621,0,0,0
2,MXXLWO1HQ2,,0.208552,0,,3,0.395752,-1.035963,Biological Science,1,2016,Ballet 4 Years Bhangra 6 Years Volunteer WildC...,,1,0,1,0,,0,,,Rice,0,,0,1,0.0,,Rice,0.151,6621,0,0,0
3,5KSL7C8SLZ,,1.29735,7,0.864783,4,0.103824,-0.384156,Physics,1,2015,,,0,0,0,0,,0,,,Rice,0,,0,1,0.0,,Rice,0.151,6621,0,0,0
4,RQWLNGGZ49,,0.323162,1,-0.354087,2,0.541716,,,1,2011,4 yrs Varsity Wrestling 2x Team Captain Academ...,,0,0,0,0,,0,,,Rice,1,,0,1,1.0,,Rice,0.151,6621,0,0,0
5,A7WDLWR2VM,,0.323162,0,,2,0.784989,,Political Science,1,2016,,,1,0,0,1,,0,,,Rice,0,,0,1,,,Rice,0.151,6621,0,0,0
6,CECT9K8GDY,,0.724298,0,,2,0.298443,0.215506,Computer Science,0,2017,,,0,0,0,0,,0,,,Rice,0,,0,1,0.0,,Rice,0.151,6621,0,0,0
7,9PM1B51CXG,,-2.542098,3,-0.828092,2,-0.67465,-0.30594,,0,2016,,,0,0,1,0,,0,,,Rice,0,,0,1,0.0,,Rice,0.151,6621,0,0,0
8,G14LB2SV7O,,-0.020669,7,-0.354087,0,0.493061,-0.983819,Business,0,2011,,,0,0,0,0,,0,,,Rice,0,,0,0,0.0,,Rice,0.151,6621,0,0,0
9,PSBRN09QGH,,-0.13528,4,0.001416,2,0.784989,1.779843,Computer Science,0,2016,,,0,0,0,0,,0,,,Rice,1,,0,1,1.0,,Rice,0.151,6621,0,0,0


In [10]:
# Persist the merged table after the continuous columns were standardized and
# the indicator columns recoded. to_csv is called with its defaults, so the
# row index is written as an unnamed first column.
applications.to_csv("collegedata_normalized.csv")