# This file: Construct predicted flows instrument from GIS data
# Dependencies: GIS-mapped CA county-to-school district stocks; raw OECD flows; raw natl decennial stocks
# Outputs: predicted_flows80, predicted_flows90
# Last updated: 2/21/2019

In [1]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

  from pandas.core import datetools


In [3]:
# Absolute path to the intermediate-data folder on the author's machine; edit it to run elsewhere.
directory = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/intermediate'
# Make that folder the working directory so the read_csv calls below can use bare filenames.
os.chdir(directory)

Check the GIS data

In [4]:
# List the files present in the intermediate-data folder.
os.listdir(directory)

['elem_scsd_map.csv',
 '1980_counties_small_tract.csv',
 'CA_1990_tabsum.csv',
 'CA_unsd_elsd_districts.csv',
 'CA_2010_tabsum.csv',
 '1980_counties_w_tract.csv',
 '1980_counties_wo_tract.csv',
 'CA_2000_tabsum.csv',
 'school_districts_clean.csv',
 'CA_1980_tab_c.csv',
 'CA_1980_tab_t.csv',
 'CA_1980_tabsum_t.csv',
 'CA_1980_tabsum_c.csv']

In [11]:
# Load the six 1980 inputs in one pass: county/tract intersection tables ("tab"),
# their per-district summaries ("tabsum"), and the lists of tracted / untracted counties.
(df_1980c_tab, df_1980t_tab,
 df_1980c_tabsum, df_1980t_tabsum,
 counties_tracted, counties_untracted) = map(pd.read_csv, [
    'CA_1980_tab_c.csv',
    'CA_1980_tab_t.csv',
    'CA_1980_tabsum_c.csv',
    'CA_1980_tabsum_t.csv',
    '1980_counties_w_tract.csv',
    '1980_counties_wo_tract.csv',
])

In [12]:
# The 1990, 2000 and 2010 district summaries share one filename pattern.
df_1990, df_2000, df_2010 = [
    pd.read_csv('CA_{}_tabsum.csv'.format(census_year))
    for census_year in (1990, 2000, 2010)
]

In [13]:
# Report the row count of each loaded table. The frames are referenced directly
# rather than looked up by name through eval().
print("There are {} districts in 1980 county file".format(len(df_1980c_tab)))
print("There are {} districts in 1980 tract file".format(len(df_1980t_tab)))
for year, frame in [(1990, df_1990), (2000, df_2000), (2010, df_2010)]:
    print("There are {} districts in {} file".format(len(frame), year))

There are 1545 districts in 1980 county file
There are 9764 districts in 1980 tract file
There are 870 districts in 1990 file
There are 870 districts in 2000 file
There are 870 districts in 2010 file


Each of the 1990, 2000 and 2010 files has 870 rows, so there are 870 elementary + unified school districts in 2015. Cross-check this count against the public school districts file (pubdistricts.txt).

In [14]:
# Tab-delimited public school districts file, used to cross-check the district count.
pubdistricts_path = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/input/CA_school_districts/pubdistricts.txt'
dist = pd.read_csv(pubdistricts_path, sep='\t')

In [15]:
# Lookup from the numeric DOC code in the districts file to its description.
# Codes 52 and 54 (elementary / unified) are the ones counted in the next cell.
doc_dict = {
    0: 'County Office of Education',
    2: 'State Board of Education',
    3: 'Statewide Benefit Charter',
    31: 'State Special Schools',
    34: 'Non-school Location',
    52: 'Elementary School District',
    54: 'Unified School District',
    56: 'High School District',
    98: 'Regional Occupation Center/Program (ROC/P)'
}

In [16]:
# Count the districts whose DOC code is 52 (elementary) or 54 (unified)
int(dist['DOC'].isin([52, 54]).sum())

868

## Clean 1980

In [18]:
# 1980 county-level intersection table: give the raw columns readable names,
# then drop the columns that were not given a name. The drop list previously
# named 'CA_Asian_3' twice; each label now appears once. The frame is rebound
# rather than mutated in place.
county_tab_names = {'CA_Asianpo':'GISJOIN',
                    'CA_Asian_7':'Total',
                    'CA_Asian_8':'White',
                    'CA_Asian_9':'Black',
                    'CA_Asia_10':'Total_AIEA',
                    'CA_Asia_11':'American Indian',
                    'CA_Asia_12':'Eskimo',
                    'CA_Asia_13':'Aleut',
                    'CA_Asia_14':'Total_API',
                    'CA_Asia_15':'Japanese',
                    'CA_Asia_16':'Chinese',
                    'CA_Asia_17':'Filipino',
                    'CA_Asia_18':'Korean',
                    'CA_Asia_19':'Asian Indian',
                    'CA_Asia_20':'Vietnamese',
                    'CA_Asia_21':'Hawaiian',
                    'CA_Asia_22':'Guamanian',
                    'CA_Asia_23':'Samoan',
                    'CA_Asia_24':'Other'}
county_tab_unused = ['OBJECTID', 'CA_Asian_1', 'CA_Asian_2', 'CA_Asian_3',
                     'CA_Asian_4', 'CA_Asian_5', 'CA_Asian_6']
df_1980c_tab = df_1980c_tab.rename(columns=county_tab_names).drop(county_tab_unused, axis=1)

In [19]:
# 1980 county-based district summary: readable column names, then discard the
# count column and the summed columns that were not given a name.
county_sum_names = {
    'Sum_CA_A_6': 'Total',
    'Sum_CA_A_7': 'White',
    'Sum_CA_A_8': 'Black',
    'Sum_CA_A_9': 'Total_AIEA',
    'Sum_CA__10': 'American Indian',
    'Sum_CA__11': 'Eskimo',
    'Sum_CA__12': 'Aleut',
    'Sum_CA__13': 'Total_API',
    'Sum_CA__14': 'Japanese',
    'Sum_CA__15': 'Chinese',
    'Sum_CA__16': 'Filipino',
    'Sum_CA__17': 'Korean',
    'Sum_CA__18': 'Asian Indian',
    'Sum_CA__19': 'Vietnamese',
    'Sum_CA__20': 'Hawaiian',
    'Sum_CA__21': 'Guamanian',
    'Sum_CA__22': 'Samoan',
    'Sum_CA__23': 'Other',
}
county_sum_unused = ['Cnt_SCHOOL', 'Sum_CA_Asi', 'Sum_CA_A_1', 'Sum_CA_A_2',
                     'Sum_CA_A_3', 'Sum_CA_A_4', 'Sum_CA_A_5']
df_1980c_tabsum = df_1980c_tabsum.rename(columns=county_sum_names)
df_1980c_tabsum = df_1980c_tabsum.drop(county_sum_unused, axis=1)

In [20]:
# 1980 tract-level intersection table: readable column names; only OBJECTID is discarded.
tract_tab_names = {
    'CA_Asianpo': 'GISJOIN',
    'CA_Asian_1': 'White',
    'CA_Asian_2': 'Black',
    'CA_Asian_3': 'American Indian',
    'CA_Asian_4': 'Eskimo',
    'CA_Asian_5': 'Aleut',
    'CA_Asian_6': 'Japanese',
    'CA_Asian_7': 'Chinese',
    'CA_Asian_8': 'Filipino',
    'CA_Asian_9': 'Korean',
    'CA_Asia_10': 'Asian Indian',
    'CA_Asia_11': 'Vietnamese',
    'CA_Asia_12': 'Hawaiian',
    'CA_Asia_13': 'Guamanian',
    'CA_Asia_14': 'Samoan',
    'CA_Asia_15': 'Other',
}
df_1980t_tab = df_1980t_tab.rename(columns=tract_tab_names).drop('OBJECTID', axis=1)

In [21]:
# 1980 tract-based district summary: readable column names; only the count column is discarded.
tract_sum_names = {
    'Sum_CA_Asi': 'White',
    'Sum_CA_A_1': 'Black',
    'Sum_CA_A_2': 'American Indian',
    'Sum_CA_A_3': 'Eskimo',
    'Sum_CA_A_4': 'Aleut',
    'Sum_CA_A_5': 'Japanese',
    'Sum_CA_A_6': 'Chinese',
    'Sum_CA_A_7': 'Filipino',
    'Sum_CA_A_8': 'Korean',
    'Sum_CA_A_9': 'Asian Indian',
    'Sum_CA__10': 'Vietnamese',
    'Sum_CA__11': 'Hawaiian',
    'Sum_CA__12': 'Guamanian',
    'Sum_CA__13': 'Samoan',
    'Sum_CA__14': 'Other',
}
df_1980t_tabsum = df_1980t_tabsum.rename(columns=tract_sum_names).drop('Cnt_SCHOOL', axis=1)

In [22]:
# Preview the first 10 rows of the renamed tract-based summary.
df_1980t_tabsum.head(10)

Unnamed: 0,SCHOOLID,White,Black,American Indian,Eskimo,Aleut,Japanese,Chinese,Filipino,Korean,Asian Indian,Vietnamese,Hawaiian,Guamanian,Samoan,Other
0,600001,5784,122,121,0,0,15,12,11,6,3,2,1,1,0,242
1,600006,21407,165,73,1,2,102,109,38,38,31,8,14,2,3,270
2,600009,910,4,15,0,0,2,1,1,0,0,0,0,0,0,42
3,600012,1920,13,34,0,0,7,0,5,0,1,0,2,0,0,17
4,600013,8476,26,74,1,1,90,21,15,9,2,10,6,0,1,283
5,600014,21235,237,262,1,1,41,19,33,10,5,11,26,0,2,637
6,600015,10639,140,113,0,0,16,13,13,6,8,6,4,0,0,177
7,600016,44649,877,269,0,1,186,362,374,154,147,95,28,3,9,2478
8,600017,20925,312,264,1,0,61,40,127,29,7,6,25,3,1,554
9,600018,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
# The tract files carry no subtotal columns, so build the same Total_AIEA / Total_API
# fields that the county files already have. Both frames are modified in place.
aiea_groups = ['American Indian', 'Eskimo', 'Aleut']
api_groups = ['Japanese', 'Chinese', 'Filipino', 'Korean', 'Asian Indian',
              'Vietnamese', 'Hawaiian', 'Guamanian', 'Samoan']
for df in (df_1980t_tab, df_1980t_tabsum):
    df['Total_AIEA'] = df[aiea_groups].sum(axis=1)
    df['Total_API'] = df[api_groups].sum(axis=1)

In [24]:
# Tract to county mapping: the first 8 characters of a tract's GISJOIN are used as its
# county code (it is matched against the county table's GISJOIN further down).
# The vectorized .str slice replaces a per-row Python lambda and leaves missing values as NaN.
df_1980t_tab['GISJOIN_C'] = df_1980t_tab['GISJOIN'].str[:8]

In [25]:
# One row per county GISJOIN with a 0/1 flag saying whether the county was tracted in 1980.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; the original
# row labels of both inputs are kept, as append did.
counties = pd.concat([
    counties_tracted[['GISJOIN']].assign(tracted=1),
    counties_untracted[['GISJOIN']].assign(tracted=0),
])

In [26]:
# Decide, per school district, which 1980 source to use:
#   'tract'  - every county piece of the district lies in a tracted county
#              -> take the district straight from the tract-based summary
#   'county' - no county piece lies in a tracted county
#              -> take the district straight from the county-based summary
#   'combo'  - a mix -> county numbers for the untracted pieces and tract numbers
#              for the pieces inside tracted counties

df_1980c_M = df_1980c_tab.merge(counties, how='left', on='GISJOIN')
# GISJOIN count = number of county pieces; tracted sum = how many of them are tracted.
key = df_1980c_M.groupby('SCHOOLID').agg({'GISJOIN':'count', 'tracted':'sum'}).reset_index()
all_tracted = key['GISJOIN'] == key['tracted']
none_tracted = key['tracted'] == 0
key['action'] = np.select([all_tracted, none_tracted], ['tract', 'county'], default='combo')

In [27]:
# Districts flagged 'combo' have their 1980 counts pieced together from the
# tract-level and county-level intersection tables.
combo_t = df_1980t_tab[df_1980t_tab['SCHOOLID'].isin(key[key['action']=='combo']['SCHOOLID'])]
combo_c = df_1980c_tab[df_1980c_tab['SCHOOLID'].isin(key[key['action']=='combo']['SCHOOLID'])]
# The right merge keeps every county row; tract rows attach through their county code, and
# column names present in both tables get the suffixes _t (tract) / _c (county).
# NOTE(review): the join key is the county code only, not SCHOOLID, so tract rows of one
# combo district may pair with county rows of another district in the same county - verify.
combo_m = combo_t.merge(combo_c, how='right', left_on='GISJOIN_C', right_on='GISJOIN', suffixes=('_t','_c'))

# Where a tract-side value is missing, fall back to the value in the matching '_c' column.
# NOTE(review): positions 2:17 are presumably the '_t' columns - confirm against the merged column order.
for col in combo_m.columns.tolist()[2:17]:
    combo_m[col] = combo_m[col].fillna(combo_m[col[:-2] + '_c'])
    
# clean up: keep columns 2:23 by position, drop the helper columns, strip the last two
# characters from every remaining column name, then total per school district.
# NOTE(review): the blanket x[:-2] rename assumes every kept column ends in a two-character suffix - confirm.
combo_m = combo_m[combo_m.columns.tolist()[2:23]]
combo_m.drop(['AREA_t', 'PERCENTAGE_t', 'GISJOIN_C'], axis=1, inplace=True)
combo_m.rename(columns=lambda x: x[:-2], inplace=True)
combo_m = combo_m.groupby('SCHOOLID').sum().reset_index()

In [28]:
# Stack the three sources into one 1980 district table: tract-based rows, county-based
# rows, and the pieced-together 'combo' rows. pd.concat replaces the chained
# DataFrame.append calls (append was removed in pandas 2.0).
# Note: the tract-based summary has no 'Total' column, so 'Total' is NaN for those rows.
tract_ids = key.loc[key['action'] == 'tract', 'SCHOOLID']
county_ids = key.loc[key['action'] == 'county', 'SCHOOLID']
df_1980 = pd.concat([
    df_1980t_tabsum[df_1980t_tabsum['SCHOOLID'].isin(tract_ids)],
    df_1980c_tabsum[df_1980c_tabsum['SCHOOLID'].isin(county_ids)],
    combo_m,
])

In [29]:
# Preview the first 10 rows of the assembled 1980 district table.
df_1980.head(10)

Unnamed: 0,Aleut,American Indian,Asian Indian,Black,Chinese,Eskimo,Filipino,Guamanian,Hawaiian,Japanese,Korean,Other,SCHOOLID,Samoan,Total,Total_AIEA,Total_API,Vietnamese,White
0,0.0,121.0,3.0,122.0,12.0,0.0,11.0,1.0,1.0,15.0,6.0,242.0,600001,0.0,,121.0,51.0,2.0,5784.0
1,2.0,73.0,31.0,165.0,109.0,1.0,38.0,2.0,14.0,102.0,38.0,270.0,600006,3.0,,76.0,345.0,8.0,21407.0
2,0.0,15.0,0.0,4.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,42.0,600009,0.0,,15.0,4.0,0.0,910.0
4,1.0,74.0,2.0,26.0,21.0,1.0,15.0,0.0,6.0,90.0,9.0,283.0,600013,1.0,,76.0,154.0,10.0,8476.0
5,1.0,262.0,5.0,237.0,19.0,1.0,33.0,0.0,26.0,41.0,10.0,637.0,600014,2.0,,264.0,147.0,11.0,21235.0
6,0.0,113.0,8.0,140.0,13.0,0.0,13.0,0.0,4.0,16.0,6.0,177.0,600015,0.0,,113.0,66.0,6.0,10639.0
7,1.0,269.0,147.0,877.0,362.0,0.0,374.0,3.0,28.0,186.0,154.0,2478.0,600016,9.0,,270.0,1358.0,95.0,44649.0
8,0.0,264.0,7.0,312.0,40.0,1.0,127.0,3.0,25.0,61.0,29.0,554.0,600017,1.0,,265.0,299.0,6.0,20925.0
10,1.0,135.0,31.0,1060.0,84.0,0.0,124.0,62.0,29.0,64.0,18.0,675.0,600019,0.0,,136.0,436.0,24.0,12445.0
11,0.0,114.0,46.0,261.0,195.0,0.0,230.0,12.0,27.0,159.0,14.0,578.0,600020,4.0,,114.0,692.0,5.0,29901.0


## Clean 1990 - 2010 variable names

In [30]:
# 1990 district summary: map the raw summed columns to readable group names.
names_1990 = {
    'Sum_CA_Asi': 'White',
    'Sum_CA_A_1': 'Black',
    'Sum_CA_A_2': 'American Indian',
    'Sum_CA_A_3': 'Eskimo',
    'Sum_CA_A_4': 'Aleut',
    'Sum_CA_A_5': 'Chinese',
    'Sum_CA_A_6': 'Filipino',
    'Sum_CA_A_7': 'Japanese',
    'Sum_CA_A_8': 'Asian Indian',
    'Sum_CA_A_9': 'Korean',
    'Sum_CA__10': 'Vietnamese',
    'Sum_CA__11': 'Cambodian',
    'Sum_CA__12': 'Hmong',
    'Sum_CA__13': 'Laotian',
    'Sum_CA__14': 'Thai',
    'Sum_CA__15': 'Other Asian',
    'Sum_CA__16': 'Hawaiian',
    'Sum_CA__17': 'Samoan',
    'Sum_CA__18': 'Tongan',
    'Sum_CA__19': 'Other Polynesian',
    'Sum_CA__20': 'Guamanian',
    'Sum_CA__21': 'Other Micronesian',
    'Sum_CA__22': 'Melanesian',
    'Sum_CA__23': 'Pacific Islander, not specified',
    'Sum_CA__24': 'Other',
}
df_1990 = df_1990.rename(columns=names_1990)

In [31]:
# Subtotals for 1990, built bottom-up from the detailed groups.
aiea_1990 = ['American Indian', 'Eskimo', 'Aleut']
# Note: 'Total_Micronesian' also folds in Melanesian and unspecified Pacific Islanders.
micronesian_1990 = ['Guamanian', 'Other Micronesian', 'Melanesian',
                    'Pacific Islander, not specified']
polynesian_1990 = ['Hawaiian', 'Samoan', 'Tongan', 'Other Polynesian']
asian_1990 = ['Chinese', 'Filipino', 'Japanese', 'Asian Indian', 'Korean', 'Vietnamese',
              'Cambodian', 'Hmong', 'Laotian', 'Thai', 'Other Asian']

df_1990['Total_AIEA'] = df_1990[aiea_1990].sum(axis=1)
df_1990['Total_Micronesian'] = df_1990[micronesian_1990].sum(axis=1)
df_1990['Total_Polynesian'] = df_1990[polynesian_1990].sum(axis=1)

# Pacific Islander = Polynesian + Micronesian; API = Pacific Islander + Asian.
df_1990['Total_PacificIslander'] = df_1990[['Total_Polynesian', 'Total_Micronesian']].sum(axis=1)
df_1990['Total_Asian'] = df_1990[asian_1990].sum(axis=1)
df_1990['Total_API'] = df_1990[['Total_PacificIslander', 'Total_Asian']].sum(axis=1)

In [32]:
# Preview the first 10 rows of the 1990 table with its new subtotal columns.
df_1990.head(10)

Unnamed: 0,SCHOOLID,Cnt_SCHOOL,White,Black,American Indian,Eskimo,Aleut,Chinese,Filipino,Japanese,...,Other Micronesian,Melanesian,"Pacific Islander, not specified",Other,Total_AIEA,Total_Micronesian,Total_Polynesian,Total_PacificIslander,Total_Asian,Total_API
0,600001,6,12080,411,117,0,0,33,131,43,...,0,0,0,826,117,4,15,19,315,334
1,600006,13,20365,196,104,3,0,140,74,127,...,0,0,0,184,107,2,23,25,500,525
2,600009,4,824,5,15,0,0,1,3,2,...,0,0,0,113,15,0,0,0,6,6
3,600011,3,1110,85,35,0,0,4,4,1,...,0,0,0,36,35,1,3,4,14,18
4,600012,8,2659,11,53,0,0,3,8,13,...,0,0,0,26,53,3,4,7,29,36
5,600013,7,15940,130,167,0,0,84,67,150,...,0,0,0,346,167,5,23,28,415,443
6,600014,12,48364,1302,494,1,2,38,244,137,...,8,0,0,4976,497,39,112,151,634,785
7,600015,9,10215,186,140,1,0,15,25,16,...,0,0,0,206,141,0,10,10,75,85
8,600016,19,50633,3215,284,5,1,1467,580,356,...,0,6,0,4909,290,13,81,94,4397,4491
9,600017,12,46276,1833,550,3,7,101,461,139,...,0,0,0,2879,560,13,64,77,1082,1159


In [33]:
# 2000 district summary: map the raw summed columns to readable group names.
# 2000: only used table FT2 (people with one asian category only)
# CHECK IF THE OTHER TABLES ARE SPARSE OR NOT!
names_2000 = {
    'Sum_CA_Asi': 'Asian Indian',
    'Sum_CA_A_1': 'Bangladeshi',
    'Sum_CA_A_2': 'Cambodian',
    'Sum_CA_A_3': 'Chinese, except Taiwanese',
    'Sum_CA_A_4': 'Filipino',
    'Sum_CA_A_5': 'Hmong',
    'Sum_CA_A_6': 'Indonesian',
    'Sum_CA_A_7': 'Japanese',
    'Sum_CA_A_8': 'Korean',
    'Sum_CA_A_9': 'Laotian',
    'Sum_CA__10': 'Malaysian',
    'Sum_CA__11': 'Pakistani',
    'Sum_CA__12': 'Sri Lankan',
    'Sum_CA__13': 'Taiwanese',
    'Sum_CA__14': 'Thai',
    'Sum_CA__15': 'Vietnamese',
    'Sum_CA__16': 'Other Asian',
    'Sum_CA__17': 'Other Asian, not specified',
}
df_2000 = df_2000.rename(columns=names_2000)

In [34]:
# Total Asian population = row sum of every column except the id and count columns.
# 'Total_Asian' itself is excluded too, so re-running this cell does not add the
# previous total into the new one (errors='ignore' covers the first run, when it is absent).
non_group_cols = ['SCHOOLID', 'Cnt_SCHOOL', 'Total_Asian']
df_2000['Total_Asian'] = df_2000.drop(non_group_cols, axis=1, errors='ignore').sum(axis=1)

In [35]:
# Preview the 2000 table with its new Total_Asian column.
df_2000.head()

Unnamed: 0,SCHOOLID,Cnt_SCHOOL,Asian Indian,Bangladeshi,Cambodian,"Chinese, except Taiwanese",Filipino,Hmong,Indonesian,Japanese,...,Laotian,Malaysian,Pakistani,Sri Lankan,Taiwanese,Thai,Vietnamese,Other Asian,"Other Asian, not specified",Total_Asian
0,600001,12,30,0,0,28,89,0,1,42,...,0,0,0,0,3,10,6,1,4,258
1,600006,14,47,0,2,179,62,0,5,122,...,2,1,0,0,4,10,9,1,3,567
2,600009,4,0,0,0,0,3,0,0,1,...,0,0,0,0,0,0,0,0,0,4
3,600011,3,0,0,0,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,600012,6,0,0,0,1,2,0,0,4,...,0,0,0,0,0,2,1,0,1,13


In [36]:
# 2010 district summary: the raw columns are Sum_ICW001 ... Sum_ICW022, in this order.
# 2010: used nhgis (not American Community Survey). only table ICW
icw_labels = [
    'Total_Asian',
    'Asian Indian',
    'Bangladeshi',
    'Bhutanese',
    'Burmese',
    'Cambodian',
    'Chinese, except Taiwanese',
    'Filipino',
    'Hmong',
    'Indonesian',
    'Japanese',
    'Korean',
    'Laotian',
    'Malaysian',
    'Nepalese',
    'Pakistani',
    'Sri Lankan',
    'Taiwanese',
    'Thai',
    'Vietnamese',
    'Other Asian',
    'Other Asian, not specified',
]
names_2010 = {'Sum_ICW{:03d}'.format(pos): label
              for pos, label in enumerate(icw_labels, start=1)}
df_2010 = df_2010.rename(columns=names_2010)

In [37]:
# Preview the first 10 rows of the renamed 2010 table.
df_2010.head(10)

Unnamed: 0,SCHOOLID,Cnt_SCHOOL,Total_Asian,Asian Indian,Bangladeshi,Bhutanese,Burmese,Cambodian,"Chinese, except Taiwanese",Filipino,...,Laotian,Malaysian,Nepalese,Pakistani,Sri Lankan,Taiwanese,Thai,Vietnamese,Other Asian,"Other Asian, not specified"
0,600001,12,407,49,1,0,0,7,26,182,...,0,0,1,1,4,3,8,12,0,6
1,600006,14,760,76,0,4,0,2,218,81,...,0,2,5,0,0,6,18,33,2,13
2,600009,4,13,0,0,0,0,0,3,5,...,0,0,0,0,0,0,0,0,0,0
3,600011,3,25,0,0,0,0,1,3,7,...,4,0,0,0,0,0,0,3,0,0
4,600012,6,26,3,0,0,0,0,10,2,...,0,0,0,0,0,0,1,2,0,1
5,600013,19,3556,747,6,0,14,37,549,954,...,28,5,6,41,24,38,17,312,1,62
6,600014,30,1982,231,17,0,0,83,140,636,...,5,6,0,18,6,2,24,310,2,25
7,600015,11,107,4,0,0,0,0,11,21,...,0,0,0,0,0,9,2,9,0,4
8,600016,30,6019,699,19,0,72,22,1171,1384,...,16,13,13,150,24,351,111,516,3,42
9,600017,24,2070,271,6,0,0,55,196,770,...,3,2,0,25,12,18,35,127,3,15


# Add in School District Names, save

In [42]:
# District lookup table; its SCHOOLID and NAME columns are merged onto each census year below.
names = pd.read_csv('CA_unsd_elsd_districts.csv')

In [43]:
# Attach the district NAME to every census-year table (left join keeps all districts).
name_lookup = names[['SCHOOLID', 'NAME']]
df_1980, df_1990, df_2000, df_2010 = [
    frame.merge(name_lookup, on='SCHOOLID', how='left')
    for frame in (df_1980, df_1990, df_2000, df_2010)
]

In [53]:
# Checkpoint: write each cleaned census-year table to the output folder.
output_dir = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/output'
for year, frame in [(1980, df_1980), (1990, df_1990), (2000, df_2000), (2010, df_2010)]:
    frame.to_csv('{}/df_{}.csv'.format(output_dir, year), index=False)

# ************
# Checkpoint: to resume work, start here by reloading the cleaned files saved above.

In [5]:
# Reload the cleaned census-year tables written at the checkpoint above.
output_dir = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/output'
df_1980, df_1990, df_2000, df_2010 = [
    pd.read_csv('{}/df_{}.csv'.format(output_dir, year))
    for year in (1980, 1990, 2000, 2010)
]

# National immigration flows: OECD data

In [7]:
# Keep only the five groups used for the instrument, plus the district id and name.
flow_cols = ['SCHOOLID', 'Asian Indian', 'Chinese', 'Japanese', 'Korean', 'Vietnamese', 'NAME']
df_1980_flows = df_1980.loc[:, flow_cols].copy()
df_1990_flows = df_1990.loc[:, flow_cols].copy()

In [8]:
# Preview the 1980 district counts for the five groups.
df_1980_flows.head()

Unnamed: 0,SCHOOLID,Asian Indian,Chinese,Japanese,Korean,Vietnamese,NAME
0,600001,3.0,12.0,15.0,6.0,2.0,Acton-Agua Dulce Unified School District
1,600006,31.0,109.0,102.0,38.0,8.0,Ross Valley Elementary School District
2,600009,0.0,1.0,2.0,0.0,0.0,Cuyama Joint Unified School District
3,600013,2.0,21.0,90.0,9.0,10.0,Rocklin Unified School District
4,600014,5.0,19.0,41.0,10.0,11.0,Hesperia Unified School District


In [9]:
# Preview the 1990 district counts for the five groups.
df_1990_flows.head()

Unnamed: 0,SCHOOLID,Asian Indian,Chinese,Japanese,Korean,Vietnamese,NAME
0,600001,28,33,43,35,10,Acton-Agua Dulce Unified School District
1,600006,50,140,127,55,18,Ross Valley Elementary School District
2,600009,0,1,2,0,0,Cuyama Joint Unified School District
3,600011,0,4,1,4,1,Fort Sage Unified School District
4,600012,1,3,13,2,0,Twin Ridges Elementary School District


In [10]:
# Raw OECD migration data in long format (the cells below use its Variable, country, Year and Value columns).
oecd = pd.read_csv('/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/input/oecd_migration.csv')

In [11]:
# Number of rows available for each OECD variable, by country of origin.
pd.crosstab(oecd['Variable'], oecd['Country of birth/nationality'])

Country of birth/nationality,Bangladesh,China,Chinese Taipei,Democratic People's Republic of Korea,"Hong Kong, China",India,Japan,Korea,Pakistan,Viet Nam
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Acquisition of nationality by country of former nationality,20,26,26,8,17,26,20,26,26,26
Inflows of asylum seekers by nationality,21,37,0,21,19,37,21,21,37,21
Inflows of foreign population by nationality,37,37,37,8,37,37,31,37,37,37
Stock of foreign population by nationality,13,13,13,0,13,13,13,13,13,13
Stock of foreign-born labour by country of birth,15,16,16,1,16,16,16,16,15,16
Stock of foreign-born population by country of birth,22,23,23,11,23,23,23,23,22,23


In [12]:
# Keep only the inflow series, with the three columns needed for the pivot.
is_inflow = oecd['Variable'] == 'Inflows of foreign population by nationality'
oecd_inflows = oecd.loc[is_inflow, ['Country of birth/nationality', 'Year', 'Value']].copy()

In [13]:
# Number of inflow rows per country of origin.
oecd_inflows['Country of birth/nationality'].value_counts()

Viet Nam                                 37
India                                    37
Hong Kong, China                         37
Pakistan                                 37
China                                    37
Chinese Taipei                           37
Korea                                    37
Bangladesh                               37
Japan                                    31
Democratic People's Republic of Korea     8
Name: Country of birth/nationality, dtype: int64

In [14]:
# Reshape from long to wide: one row per Year, one column per country of origin.
inflows_by_country = oecd_inflows.pivot(index='Year',
                                        columns='Country of birth/nationality',
                                        values='Value')
oecd_inflows = inflows_by_country.reset_index()
oecd_inflows.head()

Country of birth/nationality,Year,Bangladesh,China,Chinese Taipei,Democratic People's Republic of Korea,"Hong Kong, China",India,Japan,Korea,Pakistan,Viet Nam
0,1980,532.0,27651.0,1000.0,,3860.0,22607.0,,32320.0,4265.0,43483.0
1,1981,756.0,25803.0,1000.0,,4055.0,21522.0,,32663.0,5288.0,55631.0
2,1982,639.0,27100.0,9884.0,,4971.0,21738.0,,31724.0,4536.0,72553.0
3,1983,787.0,25777.0,16698.0,,5948.0,25451.0,,33339.0,4807.0,37560.0
4,1984,823.0,23363.0,12478.0,,5465.0,24964.0,,33042.0,5509.0,37236.0


In [15]:
# Manually input Japan (from the Paper Immigration Yearbook)
# The values are keyed by Year instead of by row labels 0:5, so they land on the
# right rows even if the row order or the first year of the table changes.
japan_manual = {1980: 4225, 1981: 3896, 1982: 3903, 1983: 4092, 1984: 4043, 1985: 4086}
for year, inflow in japan_manual.items():
    oecd_inflows.loc[oecd_inflows['Year'] == year, 'Japan'] = inflow

In [16]:
# Collapse origin countries into the ethnic categories used in the enrollment data.
# Multi-country groups are row sums (missing values are skipped); single-country
# groups are copied as they are.
south_asia = ['Bangladesh', 'India', 'Pakistan']
greater_china = ['China', 'Chinese Taipei', 'Hong Kong, China']
koreas = ['Korea', "Democratic People's Republic of Korea"]

oecd_inflows['Asian Indian_inflow'] = oecd_inflows[south_asia].sum(axis=1)
oecd_inflows['Chinese_inflow'] = oecd_inflows[greater_china].sum(axis=1)
oecd_inflows['Japanese_inflow'] = oecd_inflows['Japan']
oecd_inflows['Korean_inflow'] = oecd_inflows[koreas].sum(axis=1)
oecd_inflows['Vietnamese_inflow'] = oecd_inflows['Viet Nam']

In [17]:
# Preview the wide inflow table with the new per-group inflow columns.
oecd_inflows.head()

Country of birth/nationality,Year,Bangladesh,China,Chinese Taipei,Democratic People's Republic of Korea,"Hong Kong, China",India,Japan,Korea,Pakistan,Viet Nam,Asian Indian_inflow,Chinese_inflow,Japanese_inflow,Korean_inflow,Vietnamese_inflow
0,1980,532.0,27651.0,1000.0,,3860.0,22607.0,4225.0,32320.0,4265.0,43483.0,27404.0,32511.0,4225.0,32320.0,43483.0
1,1981,756.0,25803.0,1000.0,,4055.0,21522.0,3896.0,32663.0,5288.0,55631.0,27566.0,30858.0,3896.0,32663.0,55631.0
2,1982,639.0,27100.0,9884.0,,4971.0,21738.0,3903.0,31724.0,4536.0,72553.0,26913.0,41955.0,3903.0,31724.0,72553.0
3,1983,787.0,25777.0,16698.0,,5948.0,25451.0,4092.0,33339.0,4807.0,37560.0,31045.0,48423.0,4092.0,33339.0,37560.0
4,1984,823.0,23363.0,12478.0,,5465.0,24964.0,4043.0,33042.0,5509.0,37236.0,31296.0,41306.0,4043.0,33042.0,37236.0


# Input the 1980 and 1990 census stocks (from the SDA interactive online tables)

In [18]:
# National population stocks by group, entered by hand.
# 1980 has the five instrument groups only.
stock80 = {
    'Asian Indian_stock': 395620,
    'Chinese_stock': 813620,
    'Japanese_stock': 720480,
    'Korean_stock': 362760,
    'Vietnamese_stock': 253260,
}

# 1990 lists Bangladeshi, Pakistani and Taiwanese separately; the next cell folds
# them into the Asian Indian and Chinese stocks.
stock90 = {
    'Asian Indian_stock': 781376,
    'Bangladeshi_stock': 10214,
    'Pakistani_stock': 78185,
    'Chinese_stock': 1573701,
    'Taiwanese_stock': 74587,
    'Japanese_stock': 865319,
    'Korean_stock': 795913,
    'Vietnamese_stock': 587011,
}

In [19]:
# One row per census year. Groups missing in 1980 come in as NaN and are skipped
# by the row sums, so the 1980 totals are unchanged by the folding below.
stocks = pd.DataFrame([stock80, stock90])
stocks['Year'] = [1980, 1990]

south_asian_parts = ['Asian Indian_stock', 'Bangladeshi_stock', 'Pakistani_stock']
chinese_parts = ['Chinese_stock', 'Taiwanese_stock']
stocks['Asian Indian_stock'] = stocks[south_asian_parts].sum(axis=1)
stocks['Chinese_stock'] = stocks[chinese_parts].sum(axis=1)

# The folded-in components are no longer needed as separate columns.
stocks = stocks.drop(['Bangladeshi_stock', 'Pakistani_stock', 'Taiwanese_stock'], axis=1)
stocks

Unnamed: 0,Asian Indian_stock,Chinese_stock,Japanese_stock,Korean_stock,Vietnamese_stock,Year
0,395620.0,813620.0,720480,362760,253260,1980
1,869775.0,1648288.0,865319,795913,587011,1990


# Constructing 1980 predicted flows

In [47]:
# Attach the 1980 national stock of each group to every district row.
# Each stock is broadcast as a scalar, so the result does not depend on
# df_1980_flows having a default 0..n-1 row index.
stock_cols = ['Asian Indian_stock', 'Chinese_stock', 'Japanese_stock', 'Korean_stock', 'Vietnamese_stock']
stocks_1980 = stocks.loc[stocks['Year'] == 1980, stock_cols]
if len(stocks_1980) != 1:
    raise ValueError('expected exactly one 1980 row in stocks, found {}'.format(len(stocks_1980)))
for col in stock_cols:
    # float() keeps the columns float64, as they were when built from the array of values
    df_1980_flows[col] = float(stocks_1980[col].iloc[0])
df_1980_flows.head()

Unnamed: 0,SCHOOLID,Asian Indian,Chinese,Japanese,Korean,Vietnamese,NAME,Asian Indian_stock,Chinese_stock,Japanese_stock,Korean_stock,Vietnamese_stock,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share
0,600001,3.0,12.0,15.0,6.0,2.0,Acton-Agua Dulce Unified School District,395620.0,813620.0,720480.0,362760.0,253260.0,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06
1,600006,31.0,109.0,102.0,38.0,8.0,Ross Valley Elementary School District,395620.0,813620.0,720480.0,362760.0,253260.0,7.8e-05,0.000134,0.000142,0.000105,3.2e-05
2,600009,0.0,1.0,2.0,0.0,0.0,Cuyama Joint Unified School District,395620.0,813620.0,720480.0,362760.0,253260.0,0.0,1e-06,3e-06,0.0,0.0
3,600013,2.0,21.0,90.0,9.0,10.0,Rocklin Unified School District,395620.0,813620.0,720480.0,362760.0,253260.0,5e-06,2.6e-05,0.000125,2.5e-05,3.9e-05
4,600014,5.0,19.0,41.0,10.0,11.0,Hesperia Unified School District,395620.0,813620.0,720480.0,362760.0,253260.0,1.3e-05,2.3e-05,5.7e-05,2.8e-05,4.3e-05


In [48]:
# Share = district count of a group divided by that group's national stock in 1980.
for col in ['Asian Indian', 'Chinese', 'Japanese', 'Korean', 'Vietnamese']:
    national_stock = df_1980_flows[col + '_stock']
    df_1980_flows[col + '_share'] = df_1980_flows[col].div(national_stock)

In [49]:
# Preview the 1980 table with the stock and share columns.
df_1980_flows.head()

Unnamed: 0,SCHOOLID,Asian Indian,Chinese,Japanese,Korean,Vietnamese,NAME,Asian Indian_stock,Chinese_stock,Japanese_stock,Korean_stock,Vietnamese_stock,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share
0,600001,3.0,12.0,15.0,6.0,2.0,Acton-Agua Dulce Unified School District,395620.0,813620.0,720480.0,362760.0,253260.0,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06
1,600006,31.0,109.0,102.0,38.0,8.0,Ross Valley Elementary School District,395620.0,813620.0,720480.0,362760.0,253260.0,7.8e-05,0.000134,0.000142,0.000105,3.2e-05
2,600009,0.0,1.0,2.0,0.0,0.0,Cuyama Joint Unified School District,395620.0,813620.0,720480.0,362760.0,253260.0,0.0,1e-06,3e-06,0.0,0.0
3,600013,2.0,21.0,90.0,9.0,10.0,Rocklin Unified School District,395620.0,813620.0,720480.0,362760.0,253260.0,5e-06,2.6e-05,0.000125,2.5e-05,3.9e-05
4,600014,5.0,19.0,41.0,10.0,11.0,Hesperia Unified School District,395620.0,813620.0,720480.0,362760.0,253260.0,1.3e-05,2.3e-05,5.7e-05,2.8e-05,4.3e-05


In [50]:
# Next: build a school district x year dataset of predicted flows.

# For each school district, multiply its 1980 share of a group by the national inflow of that group in each year.

# The resulting dataframe has len(df_1980_flows) x len(oecd_inflows) rows.

In [51]:
# Build the district x year panel of 1980 shares: one row per district per OECD year.
inflow_cols = ['Asian Indian_inflow', 'Chinese_inflow', 'Japanese_inflow', 'Korean_inflow', 'Vietnamese_inflow']
share_cols = ['Asian Indian_share', 'Chinese_share', 'Japanese_share', 'Korean_share', 'Vietnamese_share']
flow1 = oecd_inflows[['Year'] + inflow_cols]
flow2 = df_1980_flows[['SCHOOLID', 'NAME'] + share_cols]

# Take the years from the OECD table itself instead of a hard-coded 1980-2016 range,
# so the panel always matches the rows that are merged on 'Year' afterwards.
years = np.sort(oecd_inflows['Year'].unique())
# A stable sort keeps the original relative order of rows that share a SCHOOLID.
districts = flow2.sort_values('SCHOOLID', kind='mergesort').reset_index(drop=True)
# Repeat each district row once per year, then cycle the years alongside it.
# (Positional repetition replaces DataFrame.append, which was removed in pandas 2.0.)
shares = districts.iloc[np.repeat(np.arange(len(districts)), len(years))].reset_index(drop=True)
shares['Year'] = np.tile(years, len(districts))
shares.head()

Unnamed: 0,SCHOOLID,NAME,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share,Year
0,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1980
1,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1981
2,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1982
3,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1983
4,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1984


In [52]:
# Bring the national inflows of each year onto the district x year panel.
predicted_flows80 = pd.merge(shares, flow1, how='left', on='Year')
predicted_flows80.head()

Unnamed: 0,SCHOOLID,NAME,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share,Year,Asian Indian_inflow,Chinese_inflow,Japanese_inflow,Korean_inflow,Vietnamese_inflow
0,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1980,27404.0,32511.0,4225.0,32320.0,43483.0
1,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1981,27566.0,30858.0,3896.0,32663.0,55631.0
2,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1982,26913.0,41955.0,3903.0,31724.0,72553.0
3,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1983,31045.0,48423.0,4092.0,33339.0,37560.0
4,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1984,31296.0,41306.0,4043.0,33042.0,37236.0


In [53]:
# Predicted flow of a group into a district = district's 1980 share x national inflow that year.
for col in ['Asian Indian', 'Chinese', 'Japanese', 'Korean', 'Vietnamese']:
    district_share = predicted_flows80[col + '_share']
    national_inflow = predicted_flows80[col + '_inflow']
    predicted_flows80['hat_' + col + '_t-(t-1)'] = district_share * national_inflow

In [54]:
# Check that the five per-group predicted-flow columns were added.
predicted_flows80.columns

Index(['SCHOOLID', 'NAME', 'Asian Indian_share', 'Chinese_share',
       'Japanese_share', 'Korean_share', 'Vietnamese_share', 'Year',
       'Asian Indian_inflow', 'Chinese_inflow', 'Japanese_inflow',
       'Korean_inflow', 'Vietnamese_inflow', 'hat_Asian Indian_t-(t-1)',
       'hat_Chinese_t-(t-1)', 'hat_Japanese_t-(t-1)', 'hat_Korean_t-(t-1)',
       'hat_Vietnamese_t-(t-1)'],
      dtype='object')

In [55]:
# Total predicted Asian flow = sum of the five per-group predicted flows.
group_flow_cols = ['hat_{}_t-(t-1)'.format(group)
                   for group in ['Asian Indian', 'Chinese', 'Japanese', 'Korean', 'Vietnamese']]
predicted_flows80['hat_Asian_t-(t-1)'] = predicted_flows80[group_flow_cols].sum(axis=1)

In [56]:
# Preview the finished 1980-based predicted flows.
predicted_flows80.head()

Unnamed: 0,SCHOOLID,NAME,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share,Year,Asian Indian_inflow,Chinese_inflow,Japanese_inflow,Korean_inflow,Vietnamese_inflow,hat_Asian Indian_t-(t-1),hat_Chinese_t-(t-1),hat_Japanese_t-(t-1),hat_Korean_t-(t-1),hat_Vietnamese_t-(t-1),hat_Asian_t-(t-1)
0,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1980,27404.0,32511.0,4225.0,32320.0,43483.0,0.207805,0.479501,0.087962,0.534568,0.343386,1.653224
1,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1981,27566.0,30858.0,3896.0,32663.0,55631.0,0.209034,0.455122,0.081113,0.540241,0.439319,1.724829
2,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1982,26913.0,41955.0,3903.0,31724.0,72553.0,0.204082,0.61879,0.081258,0.524711,0.572953,2.001794
3,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1983,31045.0,48423.0,4092.0,33339.0,37560.0,0.235415,0.714186,0.085193,0.551422,0.296612,1.882829
4,600001,Acton-Agua Dulce Unified School District,8e-06,1.5e-05,2.1e-05,1.7e-05,8e-06,1984,31296.0,41306.0,4043.0,33042.0,37236.0,0.237319,0.609218,0.084173,0.54651,0.294054,1.771273


In [57]:
# Write predicted_flows80 to the build output folder; index=False keeps the
# positional row index out of the CSV.
out_path_80 = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/output/predicted_flows80.csv'
predicted_flows80.to_csv(out_path_80, index=False)

# Constructing 1990 predicted flows

In [58]:
# Attach the 1990 stock of each origin group to every district row so the
# district shares can be computed column-wise afterwards.
stock_cols_90 = ['Asian Indian_stock', 'Chinese_stock', 'Japanese_stock', 'Korean_stock', 'Vietnamese_stock']
n = len(df_1990_flows)  # row count; retained in case a later cell reads it
subset = stocks[stocks['Year']==1990][stock_cols_90]
if subset.empty:
    # Fail loudly rather than propagate missing stocks into the shares.
    raise ValueError("`stocks` has no row for Year == 1990")

# Broadcast the (first) 1990 row as scalars. Assigning a freshly built
# DataFrame instead would align on the row index and leave NaN wherever
# df_1990_flows does not carry a default 0..n-1 index.
for stock_col, stock_value in zip(stock_cols_90, subset.values[0]):
    df_1990_flows[stock_col] = stock_value
df_1990_flows.head()

Unnamed: 0,SCHOOLID,Asian Indian,Chinese,Japanese,Korean,Vietnamese,NAME,Asian Indian_stock,Chinese_stock,Japanese_stock,Korean_stock,Vietnamese_stock,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share
0,600001,28,33,43,35,10,Acton-Agua Dulce Unified School District,869775.0,1648288.0,865319.0,795913.0,587011.0,3.2e-05,2.002077e-05,5e-05,4.4e-05,1.7e-05
1,600006,50,140,127,55,18,Ross Valley Elementary School District,869775.0,1648288.0,865319.0,795913.0,587011.0,5.7e-05,8.493661e-05,0.000147,6.9e-05,3.1e-05
2,600009,0,1,2,0,0,Cuyama Joint Unified School District,869775.0,1648288.0,865319.0,795913.0,587011.0,0.0,6.066901e-07,2e-06,0.0,0.0
3,600011,0,4,1,4,1,Fort Sage Unified School District,869775.0,1648288.0,865319.0,795913.0,587011.0,0.0,2.42676e-06,1e-06,5e-06,2e-06
4,600012,1,3,13,2,0,Twin Ridges Elementary School District,869775.0,1648288.0,865319.0,795913.0,587011.0,1e-06,1.82007e-06,1.5e-05,3e-06,0.0


In [59]:
# District share of each origin group = district count / stock of that group.
for col in ('Asian Indian', 'Chinese', 'Japanese', 'Korean', 'Vietnamese'):
    district_count_90 = df_1990_flows[col]
    group_stock_90 = df_1990_flows[col + '_stock']
    df_1990_flows[col + '_share'] = district_count_90.div(group_stock_90)

In [60]:
# Preview the first rows with the count, *_stock and *_share columns side by side.
df_1990_flows.head()

Unnamed: 0,SCHOOLID,Asian Indian,Chinese,Japanese,Korean,Vietnamese,NAME,Asian Indian_stock,Chinese_stock,Japanese_stock,Korean_stock,Vietnamese_stock,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share
0,600001,28,33,43,35,10,Acton-Agua Dulce Unified School District,869775.0,1648288.0,865319.0,795913.0,587011.0,3.2e-05,2.002077e-05,5e-05,4.4e-05,1.7e-05
1,600006,50,140,127,55,18,Ross Valley Elementary School District,869775.0,1648288.0,865319.0,795913.0,587011.0,5.7e-05,8.493661e-05,0.000147,6.9e-05,3.1e-05
2,600009,0,1,2,0,0,Cuyama Joint Unified School District,869775.0,1648288.0,865319.0,795913.0,587011.0,0.0,6.066901e-07,2e-06,0.0,0.0
3,600011,0,4,1,4,1,Fort Sage Unified School District,869775.0,1648288.0,865319.0,795913.0,587011.0,0.0,2.42676e-06,1e-06,5e-06,2e-06
4,600012,1,3,13,2,0,Twin Ridges Elementary School District,869775.0,1648288.0,865319.0,795913.0,587011.0,1e-06,1.82007e-06,1.5e-05,3e-06,0.0


In [61]:
# Build the district-by-year panel of 1990 shares that the yearly inflows
# are merged onto in the next step.
panel_years_90 = np.arange(1990, 2017)  # 1990..2016 inclusive

# Yearly inflows by origin group, restricted to 1990 onward.
flow1 = oecd_inflows[['Year', 'Asian Indian_inflow', 'Chinese_inflow', 'Japanese_inflow', 'Korean_inflow', 'Vietnamese_inflow']]
flow1 = flow1[flow1['Year']>=1990].copy()
# District identifiers plus their (time-invariant) 1990 shares.
flow2 = df_1990_flows[['SCHOOLID', 'NAME', 'Asian Indian_share', 'Chinese_share', 'Japanese_share', 'Korean_share', 'Vietnamese_share']]

# Stack one copy of the share table per year, then sort so that each
# district's copies sit next to each other. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
shares = (
    pd.concat([flow2] * len(panel_years_90), ignore_index=True)
    .sort_values('SCHOOLID')
    .reset_index(drop=True)
)
# Every district now occupies len(panel_years_90) consecutive rows, so tiling
# the year range once per district labels each copy with a distinct year.
shares['Year'] = np.tile(panel_years_90, len(flow2))
shares.head()

Unnamed: 0,SCHOOLID,NAME,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share,Year
0,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1990
1,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1991
2,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1992
3,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1993
4,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1994


In [62]:
# Attach each year's inflows to every district-year row of shares.
# A left join keeps all share rows even when a year has no inflow record.
predicted_flows90 = pd.merge(shares, flow1, how='left', on='Year')
predicted_flows90.head()

Unnamed: 0,SCHOOLID,NAME,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share,Year,Asian Indian_inflow,Chinese_inflow,Japanese_inflow,Korean_inflow,Vietnamese_inflow
0,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1990,44648.0,56359.0,5734.0,32301.0,48792.0
1,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1991,76095.0,56726.0,5049.0,26518.0,55307.0
2,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1992,50709.0,65703.0,11028.0,19359.0,77735.0
3,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1993,52339.0,89068.0,6908.0,18026.0,59614.0
4,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1994,47053.0,71748.0,6093.0,16011.0,41345.0


In [63]:
# Predicted flow per origin group: district share times that year's inflow.
for col in ('Asian Indian', 'Chinese', 'Japanese', 'Korean', 'Vietnamese'):
    share_col_90, inflow_col_90 = col + '_share', col + '_inflow'
    hat_col_90 = 'hat_' + col + '_t-(t-1)'
    predicted_flows90[hat_col_90] = predicted_flows90[share_col_90].mul(predicted_flows90[inflow_col_90])

In [64]:
# Sum the five group-specific predicted flows row-wise into a single Asian total.
hat_cols_90 = [
    'hat_Asian Indian_t-(t-1)',
    'hat_Chinese_t-(t-1)',
    'hat_Japanese_t-(t-1)',
    'hat_Korean_t-(t-1)',
    'hat_Vietnamese_t-(t-1)',
]
predicted_flows90['hat_Asian_t-(t-1)'] = predicted_flows90[hat_cols_90].sum(axis=1)

In [65]:
# Preview the first rows, now including the aggregate hat_Asian_t-(t-1) column.
predicted_flows90.head()

Unnamed: 0,SCHOOLID,NAME,Asian Indian_share,Chinese_share,Japanese_share,Korean_share,Vietnamese_share,Year,Asian Indian_inflow,Chinese_inflow,Japanese_inflow,Korean_inflow,Vietnamese_inflow,hat_Asian Indian_t-(t-1),hat_Chinese_t-(t-1),hat_Japanese_t-(t-1),hat_Korean_t-(t-1),hat_Vietnamese_t-(t-1),hat_Asian_t-(t-1)
0,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1990,44648.0,56359.0,5734.0,32301.0,48792.0,1.437319,1.128351,0.284938,1.420425,0.831194,5.102227
1,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1991,76095.0,56726.0,5049.0,26518.0,55307.0,2.449668,1.135698,0.250898,1.16612,0.94218,5.944564
2,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1992,50709.0,65703.0,11028.0,19359.0,77735.0,1.632436,1.315425,0.548011,0.851305,1.324251,5.671428
3,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1993,52339.0,89068.0,6908.0,18026.0,59614.0,1.684909,1.78321,0.343277,0.792687,1.015552,5.619635
4,600001,Acton-Agua Dulce Unified School District,3.2e-05,2e-05,5e-05,4.4e-05,1.7e-05,1994,47053.0,71748.0,6093.0,16011.0,41345.0,1.514741,1.43645,0.302777,0.704078,0.704331,4.662378


In [66]:
# Write predicted_flows90 to the build output folder; index=False keeps the
# positional row index out of the CSV.
out_path_90 = '/Users/tsengtammy/Dropbox/3 TT/4. Thesis/build/data/output/predicted_flows90.csv'
predicted_flows90.to_csv(out_path_90, index=False)