In [1]:
import pandas as pd
import numpy as np
# Raise pandas' display limits to 150 columns and 150 rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 150)

In [2]:
# One CSV path per two-digit year suffix, 13 through 18.
filepaths = [f'./data/{year_suffix}zpallagi.csv' for year_suffix in range(13, 19)]

In [12]:
# Read each CSV into a dict keyed by 2013 + its position in `filepaths`.
tax_data = {2013 + offset: pd.read_csv(filepaths[offset]) for offset in range(6)}

In [47]:
# Print the (rows, columns) shape of every raw yearly frame.
for year in range(2013, 2019):
    raw_shape = tax_data[year].shape
    print(raw_shape)

(166740, 114)
(166722, 127)
(166698, 131)
(179796, 147)
(166537, 153)
(165935, 153)


In [61]:
# Columns kept from every yearly file, in output order.
TAX_COLUMNS = [
    'statefips','state','zipcode','agi_stub','n1','mars1','mars2','mars4','prep',
    'n2','numdep','a00100','n02650','a02650','n00200','a00200','n00300','a00300','n00600',
    'a00600','n00650','a00650','n00700','a00700','n00900','a00900','n01000','a01000',
    'n01700','a01700','schf','n02300','a02300','n02500','a02500','n26270','a26270',
    'n02900','a02900','n03220','a03220','n03300','a03300','n03270','a03270','n03150',
    'a03150','n03210','a03210','n04470','a04470','a00101','n18425','a18425','n18450',
    'a18450','n18500','a18500','n18300','a18300','n19300','a19300','n19700','a19700',
    'n04800','a04800','n05800','a05800','n09600','a09600','n07100','a07100','n07300',
    'a07300','n07180','a07180','n07230','a07230','n07240','a07240','n07220','a07220',
    'n07260','a07260','n09400','a09400','n10600','a10600','n59660','a59660','n59720',
    'a59720','n11070','a11070','n10960','a10960','n06500','a06500','n10300','a10300',
    'n85300','a85300','n11901','a11901','n11902','a11902',
]

# Names used instead of the ones above when a file does not carry the full
# TAX_COLUMNS layout. The substitution is all-or-nothing, as in the original
# fallback list.
# NOTE(review): the substituted columns keep their alternate names in the
# output, so frames from different years can differ in these four labels --
# confirm whether they should be renamed to a common label downstream.
TAX_COLUMN_ALTERNATES = {
    'n01700': 'n01750',
    'a01700': 'a01750',
    'n07220': 'n07225',
    'a07220': 'a07225',
}


def tax_data_clean(df, verbose = True):
    """Return a cleaned copy of one year's tax frame.

    Steps, in order:
      1. lower-case all column names;
      2. turn ``zipcode`` into a 5-character, zero-padded string;
      3. cast ``statefips`` to ``object`` dtype;
      4. keep only the columns in ``TAX_COLUMNS`` (or, if any of those are
         missing, the same list with ``TAX_COLUMN_ALTERNATES`` substituted);
      5. drop rows whose zipcode is ``'00000'`` or ``'99999'``.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``zipcode`` and a ``statefips`` column (any letter case)
        plus the columns listed in ``TAX_COLUMNS`` or their alternates.
    verbose : bool, default True
        Print a one-line message after each major step.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns and filtered rows.

    Raises
    ------
    KeyError
        If neither the primary nor the alternate column layout is fully
        present in ``df``.
    """
    # Work on a copy so the caller's frame keeps its original labels and dtypes.
    cleaned = df.copy()

    #convert all columns to lower cases:
    cleaned.columns = cleaned.columns.str.lower()

    #convert zipcode to str type and fill in leading '0's
    cleaned['zipcode'] = cleaned['zipcode'].astype('str').str.zfill(5)
    if verbose:
        print("convert zipcode to str type and fill in leading '0's")

    #convert statefips column to categorical variable
    cleaned['statefips'] = cleaned['statefips'].astype('object')

    #only keep the columns are common to all 6 years' data
    present = set(cleaned.columns)
    keep = TAX_COLUMNS
    if not set(TAX_COLUMNS) <= present:
        # Primary layout incomplete: try the alternate layout instead.
        keep = [TAX_COLUMN_ALTERNATES.get(name, name) for name in TAX_COLUMNS]
    missing = [name for name in keep if name not in present]
    if missing:
        raise KeyError(
            f"tax data matches neither expected column layout; missing columns: {missing}"
        )
    cleaned = cleaned[keep]
    if verbose:
        print("only keep the columns are common to all 6 years' data")

    #remove state level obs (00000) and zipcode were categorized as "other" (99999).
    cleaned = cleaned.loc[~cleaned['zipcode'].isin(['00000', '99999'])]
    if verbose:
        print("remove state level obs and zipcodes that were categorized as 'other'.")

    return cleaned

In [59]:
# Unique (state, zipcode) pairs in the 2013 data. Column names are lower-cased
# and zip codes zero-padded here, so this cell gives the same result whether or
# not tax_data[2013] was previously modified by a cleaning call.
state_zip_2013 = tax_data[2013].rename(columns=str.lower)[['state', 'zipcode']]
state_zip_2013 = state_zip_2013.assign(
    zipcode=state_zip_2013['zipcode'].astype(str).str.zfill(5)
)
state_zip_2013.drop_duplicates()

Unnamed: 0,state,zipcode
0,AL,00000
6,AL,35004
12,AL,35005
18,AL,35006
24,AL,35007
...,...,...
166710,WY,83126
166716,WY,83127
166722,WY,83128
166728,WY,83414


In [60]:
def agi_conversion(df, verbose = True):
    """Collapse per-``agi_stub`` rows into one row per zipcode.

    The result holds, for every zipcode, the sum of each numeric column over
    all of its rows, plus three extra columns:

    * ``high_income`` -- ``n1`` of the 5th plus the 6th ``agi_stub`` column of
      the zipcode-by-stub pivot;
    * ``low_income``  -- ``n1`` of the 1st ``agi_stub`` column of that pivot;
    * ``state``       -- taken from the distinct (state, zipcode) pairs of ``df``.

    The stub columns are picked by position, so the pivot must have at least
    six ``agi_stub`` columns.
    # NOTE(review): presumably agi_stub takes the values 1..6 -- confirm.

    Non-numeric columns (for example ``state`` and an object-dtype
    ``statefips``) are explicitly excluded from the sum. Without
    ``numeric_only=True`` recent pandas versions would concatenate the state
    strings and the later merge would produce ``state_x``/``state_y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``zipcode``, ``agi_stub``, ``n1`` and ``state``, with at
        most one row per (zipcode, agi_stub) pair (``pivot`` raises otherwise).
    verbose : bool, default True
        Print a progress message when done.

    Returns
    -------
    pandas.DataFrame
        One row per zipcode; ``agi_stub`` itself is not included.
    """
    #retrive # of high income and low income returns
    returns_by_stub = df.pivot(index='zipcode', columns='agi_stub', values='n1')
    income_groups = pd.DataFrame({
        'high_income': returns_by_stub.iloc[:, 4] + returns_by_stub.iloc[:, 5],
        'low_income': returns_by_stub.iloc[:, 0],
    }).reset_index()

    #combine the high income/ low income info back to main dataframe
    df_byzip = df.groupby('zipcode').sum(numeric_only=True).reset_index()
    df_byzip = df_byzip.merge(income_groups, on='zipcode', how='inner')
    # the summed stub ids carry no meaning once rows are aggregated
    df_byzip = df_byzip.drop(columns='agi_stub', errors='ignore')

    #combine the state info back to the dataset
    df_state = df[['state', 'zipcode']].drop_duplicates()
    df_byzip = df_byzip.merge(df_state, on='zipcode', how='left')

    if verbose:
        print('retrive and combine the high income/ low income info back to main dataframe')

    return df_byzip
    

In [62]:
def normalization(df, verbose = True):
    """Rescale count and amount columns of ``df`` in place and return ``df``.

    Count columns -- the fixed list ``mars1, mars2, mars4, prep, schf,
    high_income, low_income`` plus every column whose name starts with ``n``
    -- are divided by ``n1``. Amount columns -- every column whose name starts
    with ``a`` -- are divided by ``a02650``. The columns ``n1``, ``a00100``
    and ``a02650`` themselves are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named above; it is modified in place.
    verbose : bool, default True
        Print a message after each of the two normalization passes.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # n1 and a02650 serve as denominators below, so they must stay as they are.
    skipped = ('n1', 'a00100', 'a02650')
    eligible = [name for name in df.columns if name not in skipped]

    count_col = ['mars1','mars2','mars4','prep','schf','high_income','low_income']
    count_col += [name for name in eligible if name.startswith('n')]
    amount_col = [name for name in eligible if name.startswith('a')]

    # share of returns
    for name in count_col:
        df[name] = df[name].div(df['n1'])
    if verbose:
        print('normalize the number of retruns columns by dividing total number of returns')

    # share of the total amount
    for name in amount_col:
        df[name] = df[name].div(df['a02650'])
    if verbose:
        print('normalize the retrun amount columns by dividing total return amount')

    return df

In [63]:
def standardization(df, verbose = True):
    """Add ``avg_income`` and z-score four columns of ``df`` in place.

    ``avg_income`` is first computed as ``a02650 / n1`` from the values as
    passed in. Then ``n1``, ``a00100``, ``a02650`` and ``avg_income`` are each
    replaced by ``(x - mean) / std`` of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``n1``, ``a00100`` and ``a02650``; it is modified in place.
    verbose : bool, default True
        Print a message once the columns have been converted.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # average amount per return, taken before the inputs are rescaled
    df['avg_income'] = df['a02650'].div(df['n1'])

    for name in ('n1', 'a00100', 'a02650', 'avg_income'):
        values = df[name]
        center = values.mean()
        spread = values.std()
        df[name] = (values - center) / spread
    if verbose:
        print("convert 'n1','a00100','a02650','avg_income' into standardized features")

    return df

In [64]:
# Per-year modelling frames: cleaned, aggregated by zipcode, normalized and
# standardized, then tagged with an object-dtype `year` column.
tax_model_data = {}

for i in range(2013, 2019):
    print(f'year {i}')
    df = (
        tax_data[i]
        .pipe(tax_data_clean)
        .pipe(agi_conversion)
        .pipe(normalization)
        .pipe(standardization)
    )
    df['year'] = i
    df['year'] = df['year'].astype('object')
    tax_model_data[i] = df

year 2013
convert zipcode to str type and fill in leading '0's
only keep the columns are common to all 6 years' data
remove state level obs and zipcodes that were categorized as 'other'.
retrive and combine the high income/ low income info back to main dataframe
normalize the number of retruns columns by dividing total number of returns
normalize the retrun amount columns by dividing total return amount
convert 'n1','a00100','a02650','avg_income' into standardized features
year 2014
convert zipcode to str type and fill in leading '0's
only keep the columns are common to all 6 years' data
remove state level obs and zipcodes that were categorized as 'other'.
retrive and combine the high income/ low income info back to main dataframe
normalize the number of retruns columns by dividing total number of returns
normalize the retrun amount columns by dividing total return amount
convert 'n1','a00100','a02650','avg_income' into standardized features
year 2015
convert zipcode to str type and fi

In [65]:
# Print the (rows, columns) shape of every modelling frame.
for year in range(2013, 2019):
    model_shape = tax_model_data[year].shape
    print(model_shape)

(27688, 108)
(27685, 108)
(27681, 108)
(29872, 108)
(27658, 108)
(27556, 108)


In [66]:
# Per-year analysis frames: same pipeline as the modelling frames but without
# standardization; avg_income is a02650 / n1 on the un-standardized values.
tax_analysis_data = {}

for i in range(2013, 2019):
    print(f'year {i}')
    df = (
        tax_data[i]
        .pipe(tax_data_clean)
        .pipe(agi_conversion)
        .pipe(normalization)
    )
    df['avg_income'] = df['a02650'].div(df['n1'])
    df['year'] = i
    df['year'] = df['year'].astype('object')
    tax_analysis_data[i] = df

year 2013
convert zipcode to str type and fill in leading '0's
only keep the columns are common to all 6 years' data
remove state level obs and zipcodes that were categorized as 'other'.
retrive and combine the high income/ low income info back to main dataframe
normalize the number of retruns columns by dividing total number of returns
normalize the retrun amount columns by dividing total return amount
year 2014
convert zipcode to str type and fill in leading '0's
only keep the columns are common to all 6 years' data
remove state level obs and zipcodes that were categorized as 'other'.
retrive and combine the high income/ low income info back to main dataframe
normalize the number of retruns columns by dividing total number of returns
normalize the retrun amount columns by dividing total return amount
year 2015
convert zipcode to str type and fill in leading '0's
only keep the columns are common to all 6 years' data
remove state level obs and zipcodes that were categorized as 'other'.

In [67]:
# Print the (rows, columns) shape of every analysis frame.
for year in range(2013, 2019):
    analysis_shape = tax_analysis_data[year].shape
    print(analysis_shape)

(27688, 108)
(27685, 108)
(27681, 108)
(29872, 108)
(27658, 108)
(27556, 108)


In [68]:
# Preview the first five rows of the 2013 analysis frame.
tax_analysis_data[2013].head(5)

Unnamed: 0,zipcode,n1,mars1,mars2,mars4,prep,n2,numdep,a00100,n02650,a02650,n00200,a00200,n00300,a00300,n00600,a00600,n00650,a00650,n00700,a00700,n00900,a00900,n01000,a01000,n01700,a01700,schf,n02300,a02300,n02500,a02500,n26270,a26270,n02900,a02900,n03220,a03220,n03300,a03300,n03270,a03270,n03150,a03150,n03210,a03210,n04470,a04470,a00101,n18425,a18425,n18450,a18450,n18500,a18500,n18300,a18300,n19300,a19300,n19700,a19700,n04800,a04800,n05800,a05800,n09600,a09600,n07100,a07100,n07300,a07300,n07180,a07180,n07230,a07230,n07240,a07240,n07220,a07220,n07260,a07260,n09400,a09400,n10600,a10600,n59660,a59660,n59720,a59720,n11070,a11070,n10960,a10960,n06500,a06500,n10300,a10300,n85300,a85300,n11901,a11901,n11902,a11902,high_income,low_income,state,avg_income,year
0,1001,8780.0,0.541002,0.345103,0.099089,0.566059,1.724374,0.455581,471936.0,1.0,477601.0,0.832574,0.751554,0.384966,0.00679,0.218679,0.010852,0.205011,0.007538,0.243736,0.002642,0.115034,0.027305,0.153759,0.011193,0.238041,0.089769,0.0,0.085421,0.011516,0.157175,0.033402,0.038724,0.018484,0.248292,0.011859,0.036446,0.000165,0.003417,0.000984,0.020501,0.002182,0.017084,0.001397,0.095672,0.001803,0.363326,0.116522,0.58274,0.335991,0.028237,0.02164,0.000258,0.335991,0.02084,0.363326,0.051043,0.290433,0.034133,0.287016,0.010528,0.817768,0.672078,0.812073,0.109688,0.010251,0.00072,0.300683,0.005825,0.047836,6.9e-05,0.038724,0.000354,0.075171,0.001621,0.045558,0.000151,0.140091,0.003264,0.030752,0.000134,0.096811,0.004786,0.953303,0.135561,0.113895,0.003989,0.093394,0.003319,0.061503,0.00147,0.06492,0.001114,0.764237,0.103865,0.799544,0.109824,0.005695,0.000165,0.169704,0.009104,0.787016,0.033622,0.142369,0.331435,MA,54.396469,2013
1,1002,9570.0,0.537095,0.368861,0.08046,0.496343,1.733542,0.46395,744429.0,1.0,762298.0,0.801463,0.568528,0.494253,0.011135,0.352142,0.042777,0.329154,0.031602,0.207941,0.002174,0.222571,0.06081,0.3093,0.059815,0.219436,0.104601,0.00418,0.042842,0.003677,0.159875,0.028001,0.073145,0.05193,0.338558,0.023442,0.047022,0.000146,0.024033,0.006467,0.051202,0.004957,0.029258,0.001837,0.098224,0.00121,0.415883,0.14253,0.752525,0.375131,0.0392,0.034483,0.000298,0.384535,0.034294,0.412748,0.075125,0.276907,0.025173,0.356322,0.020489,0.803553,0.712334,0.787879,0.140545,0.060606,0.005107,0.37722,0.004854,0.134796,0.00075,0.047022,0.000273,0.111808,0.001275,0.038662,9.7e-05,0.101358,0.001493,0.036573,0.000531,0.181818,0.008989,0.941484,0.161226,0.112853,0.002384,0.092999,0.002082,0.059561,0.000918,0.0721,0.000861,0.728318,0.135162,0.76907,0.146751,0.040752,0.001772,0.236155,0.014338,0.685475,0.021026,0.242424,0.370951,MA,79.654963,2013
2,1005,2230.0,0.470852,0.439462,0.085202,0.55157,1.914798,0.556054,127991.0,1.0,129645.0,0.878924,0.761657,0.403587,0.0055,0.188341,0.011686,0.170404,0.006063,0.246637,0.002252,0.147982,0.041537,0.147982,0.010961,0.206278,0.072452,0.008969,0.112108,0.013907,0.130045,0.024906,0.035874,0.006695,0.29148,0.012735,0.053812,0.000247,0.0,0.0,0.017937,0.002291,0.013453,0.001365,0.107623,0.00189,0.381166,0.124679,0.609696,0.345291,0.028856,0.017937,0.000216,0.358744,0.022693,0.376682,0.053677,0.32287,0.04203,0.286996,0.008739,0.843049,0.657935,0.838565,0.103236,0.008969,0.000802,0.327354,0.007096,0.035874,4.6e-05,0.035874,0.000393,0.076233,0.001597,0.044843,0.000116,0.165919,0.004065,0.040359,0.000239,0.125561,0.006317,0.964126,0.125095,0.103139,0.003378,0.089686,0.002854,0.058296,0.00128,0.06278,0.001049,0.793722,0.096093,0.820628,0.103891,0.0,0.0,0.192825,0.012434,0.775785,0.032304,0.156951,0.309417,MA,58.136771,2013
3,1007,7300.0,0.458904,0.449315,0.078082,0.521918,1.926027,0.576712,481692.0,1.0,489020.0,0.867123,0.769952,0.49726,0.005513,0.228767,0.009427,0.212329,0.006497,0.293151,0.002509,0.157534,0.034156,0.172603,0.020778,0.210959,0.076015,0.006849,0.073973,0.008049,0.116438,0.019609,0.050685,0.030093,0.30411,0.014991,0.053425,0.0002,0.008219,0.002182,0.031507,0.003114,0.026027,0.001669,0.121918,0.001894,0.438356,0.131068,0.707366,0.415068,0.035099,0.015068,0.000166,0.421918,0.029508,0.435616,0.066772,0.382192,0.041976,0.350685,0.00873,0.836986,0.687974,0.828767,0.117394,0.027397,0.001348,0.357534,0.006427,0.057534,6.7e-05,0.061644,0.000485,0.093151,0.001691,0.039726,0.000104,0.165753,0.003303,0.046575,0.000325,0.128767,0.006057,0.963014,0.139649,0.094521,0.002528,0.076712,0.002104,0.052055,0.000883,0.073973,0.00108,0.784932,0.110963,0.817808,0.118277,0.015068,0.000352,0.176712,0.010169,0.783562,0.029416,0.227397,0.309589,MA,66.989041,2013
4,1008,640.0,0.4375,0.5,0.0625,0.578125,1.875,0.453125,38311.0,1.0,38867.0,0.859375,0.74706,0.390625,0.00301,0.234375,0.00831,0.234375,0.005738,0.28125,0.002753,0.1875,0.027864,0.15625,0.010626,0.265625,0.08897,0.03125,0.109375,0.01217,0.171875,0.029948,0.03125,0.008671,0.25,0.012453,0.0,0.0,0.0,0.0,0.03125,0.004091,0.0,0.0,0.0625,0.001081,0.390625,0.120488,0.616127,0.375,0.030669,0.0,0.0,0.375,0.024725,0.390625,0.057427,0.34375,0.038953,0.296875,0.006509,0.84375,0.674248,0.84375,0.108807,0.0,0.0,0.3125,0.006381,0.0,0.0,0.046875,0.000257,0.09375,0.001518,0.03125,0.000129,0.15625,0.003319,0.046875,0.000154,0.125,0.004914,0.953125,0.129287,0.0625,0.002341,0.0625,0.001801,0.046875,0.000823,0.0625,0.000875,0.796875,0.102375,0.84375,0.108884,0.0,0.0,0.1875,0.008465,0.75,0.030154,0.15625,0.28125,MA,60.729687,2013


## For data analysis, we select a few features to focus on.

There are many taxable-income subcategories. To focus our effort on the important features, we will only look at the subcategories that appear in more than 20% of all tax returns. We will also include the proportion of high-income households, since we know this is a defining feature of the local population, and we will exclude 'prep' since it doesn't provide much information.

In [70]:
# Split the 2013 analysis columns by prefix: names starting with 'n' are counts,
# names starting with 'a' are amounts; n1, a00100 and a02650 belong to neither list.
skipped = ('n1', 'a00100', 'a02650')
eligible = [name for name in tax_analysis_data[2013].columns if name not in skipped]
count_col = [name for name in eligible if name.startswith('n')]
amount_col = [name for name in eligible if name.startswith('a')]

# Identifier and summary columns first, then every count column.
cols = [
    'state', 'zipcode', 'mars1', 'mars2', 'mars4', 'schf', 'high_income',
    'low_income', 'avg_income', 'year',
] + count_col

In [71]:
# Restrict the 2013 analysis frame to the selected columns.
sample = tax_analysis_data[2013][cols]

In [45]:
# Column means above the 20% cut-off. `numeric_only=True` keeps the string
# columns (state, zipcode) out of the mean, which recent pandas versions would
# otherwise reject with a TypeError; the means are computed once and reused.
MIN_MEAN_SHARE = 0.2
feature_means = sample.mean(numeric_only=True)
feature_means[feature_means > MIN_MEAN_SHARE]

n2        1.996783
numdep    0.642989
n02650    0.999993
n00200    0.822665
n00300    0.313218
n01700    0.203440
n02900    0.252343
n04470    0.254983
n18500    0.215952
n18300    0.253080
n04800    0.755663
n05800    0.747848
n07100    0.302497
n10600    0.940110
n06500    0.671802
n10300    0.734289
n11902    0.761767
dtype: float64

In [42]:
# Readable labels for the selected feature columns; labels absent from `sample`
# are ignored by rename().
# NOTE(review): 'a04800' is an amount column -- if `sample` only carries the
# count columns, this entry renames nothing; confirm the intent.
FEATURE_LABELS = {
    'mars1': 'num_single',
    'mars2': 'num_joint',
    'n2': 'num_exemption',
    'numdep': 'num_dependent',
    'n00200': 'num_salary',
    'n00300': 'num_w_interests',
    'n01700': 'num_pensions',
    'n02900': 'num_statutory',
    'n04470': 'num_item_dedct',
    'n18500': 'num_realestate',
    'n18300': 'num_taxpaid',
    'a04800': 'taxable_income_amt',
}
sample = sample.rename(columns=FEATURE_LABELS)
sample.head()

Unnamed: 0,num_single,num_joint,mars4,schf,high_income,low_income,avg_income,year,num_exemption,num_dependent,n02650,num_salary,num_w_interests,n00600,n00650,n00700,n00900,n01000,num_pensions,n02300,n02500,n26270,num_statutory,n03220,n03300,n03270,n03150,n03210,num_item_dedct,n18425,n18450,num_realestate,num_taxpaid,n19300,n19700,n04800,n05800,n09600,n07100,n07300,n07180,n07230,n07240,n07220,n07260,n09400,n10600,n59660,n59720,n11070,n10960,n06500,n10300,n85300,n11901,n11902
0,0.541002,0.345103,0.099089,0.0,0.142369,0.331435,54.396469,2013,1.724374,0.455581,1.0,0.832574,0.384966,0.218679,0.205011,0.243736,0.115034,0.153759,0.238041,0.085421,0.157175,0.038724,0.248292,0.036446,0.003417,0.020501,0.017084,0.095672,0.363326,0.335991,0.02164,0.335991,0.363326,0.290433,0.287016,0.817768,0.812073,0.010251,0.300683,0.047836,0.038724,0.075171,0.045558,0.140091,0.030752,0.096811,0.953303,0.113895,0.093394,0.061503,0.06492,0.764237,0.799544,0.005695,0.169704,0.787016
1,0.537095,0.368861,0.08046,0.00418,0.242424,0.370951,79.654963,2013,1.733542,0.46395,1.0,0.801463,0.494253,0.352142,0.329154,0.207941,0.222571,0.3093,0.219436,0.042842,0.159875,0.073145,0.338558,0.047022,0.024033,0.051202,0.029258,0.098224,0.415883,0.375131,0.034483,0.384535,0.412748,0.276907,0.356322,0.803553,0.787879,0.060606,0.37722,0.134796,0.047022,0.111808,0.038662,0.101358,0.036573,0.181818,0.941484,0.112853,0.092999,0.059561,0.0721,0.728318,0.76907,0.040752,0.236155,0.685475
2,0.470852,0.439462,0.085202,0.008969,0.156951,0.309417,58.136771,2013,1.914798,0.556054,1.0,0.878924,0.403587,0.188341,0.170404,0.246637,0.147982,0.147982,0.206278,0.112108,0.130045,0.035874,0.29148,0.053812,0.0,0.017937,0.013453,0.107623,0.381166,0.345291,0.017937,0.358744,0.376682,0.32287,0.286996,0.843049,0.838565,0.008969,0.327354,0.035874,0.035874,0.076233,0.044843,0.165919,0.040359,0.125561,0.964126,0.103139,0.089686,0.058296,0.06278,0.793722,0.820628,0.0,0.192825,0.775785
3,0.458904,0.449315,0.078082,0.006849,0.227397,0.309589,66.989041,2013,1.926027,0.576712,1.0,0.867123,0.49726,0.228767,0.212329,0.293151,0.157534,0.172603,0.210959,0.073973,0.116438,0.050685,0.30411,0.053425,0.008219,0.031507,0.026027,0.121918,0.438356,0.415068,0.015068,0.421918,0.435616,0.382192,0.350685,0.836986,0.828767,0.027397,0.357534,0.057534,0.061644,0.093151,0.039726,0.165753,0.046575,0.128767,0.963014,0.094521,0.076712,0.052055,0.073973,0.784932,0.817808,0.015068,0.176712,0.783562
4,0.4375,0.5,0.0625,0.03125,0.15625,0.28125,60.729687,2013,1.875,0.453125,1.0,0.859375,0.390625,0.234375,0.234375,0.28125,0.1875,0.15625,0.265625,0.109375,0.171875,0.03125,0.25,0.0,0.0,0.03125,0.0,0.0625,0.390625,0.375,0.0,0.375,0.390625,0.34375,0.296875,0.84375,0.84375,0.0,0.3125,0.0,0.046875,0.09375,0.03125,0.15625,0.046875,0.125,0.953125,0.0625,0.0625,0.046875,0.0625,0.796875,0.84375,0.0,0.1875,0.75
