# North Carolina IRS Individual Income Tax Statistics by Zip Code 2016
* ZIP Code data shows selected income and tax items classified by State, ZIP Code, and size of adjusted gross income. 
* Data are based on individual income tax returns filed with the IRS and are available for Tax Years 1998, 2001, and 2004 through 2016.
* This data is aviable at: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-zip-code-data-soi
* We suggest mapping IRS Income Tax data for the tax year that covers a majority of a particular school year. 
* For example, you will find the tax data for 2016 in the 2016-2017 school year folder. This is why the files for NCDPI and IRS data may appear one year off.  
* However, the data is keyed by zip code, so users may merge years however they see fit!  

In [1]:
#import required Libraries
import pandas as pd
import numpy as np
import urllib

#**********************************************************************************
# Set the following variables before running this code!!!
#**********************************************************************************

#Location where copies of the raw data files will be downloaded and saved as csv files.
dataDir = 'C:/Users/Jake/Documents/GitHub/EducationDataNC/2017/Raw Datasets/'

#All raw data files are filtered for the year below
taxYear = 2016

### Save Original Copy of the Data

In [2]:
#Download and save an original copy of the raw data 
#North Carolina - SOI Tax Stats - Individual Income Tax Statistics - 2016 ZIP Code Data (SOI)
urlFile = urllib.URLopener()
url="https://www.irs.gov/pub/irs-soi/16zp34nc.xls"
urlFile.retrieve(url, dataDir + '16zp34nc.xls')

AttributeError: module 'urllib' has no attribute 'URLopener'

### Clean up the Column Names

In [3]:
#Read in the locally saved file for all futher processing 
path= dataDir + '16zp34nc.xls'
incomeTaxData = pd.read_excel(path, header=[0,1], skiprows=3, skipfooter=17, index_col=None) #, dtype={'unit_code': object})

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Jake/Documents/GitHub/EducationDataNC/2017/Raw Datasets/16zp34nc.xls'

In [102]:
#Combine multiple index column into single index column
incomeTaxData.columns = [' '.join(col).strip() for col in incomeTaxData.columns.values]
incomeTaxData.reset_index(inplace=True)
incomeTaxData.rename(columns={'index': 'Zip Code'}, inplace=True)

#Get rid of unnamed values levels from second column index 
incomeTaxData.columns = [ col[ : np.where(col.find("Unnamed:") >= 0, col.find("Unnamed:") , len(col)  )] for col in incomeTaxData.columns.values]

#Clean up and shorten remaining column names 
incomeTaxData.columns = [ col.replace('[2]','')
                             .replace('[3]','')
                             .replace('[4]','')
                             .replace('[5]','')
                             .replace('[6]','')
                             .replace('[7]','')
                             .replace('[8]','')
                             .replace('[9]','')
                             .replace('[10]','')
                             .replace('[11]','')
                             .replace('[12]','')
                             .replace('\r','')
                             .replace('\n','')
                             .replace(' Number of returns',' Ct')
                             .replace('Amount','Amt')
                             .replace('Total','Tot')
                             .replace('Additional','Add')
                             .replace('additional','Add')
                             .replace('miscellaneous','misc')
                             .replace('education','edu')
                             .replace('  ',' ')
                             .replace('Number of volunteer prepared returns Number of volunteer income tax assistance (VITA) prepared returns','(VITA) prepared returns Ct')
                             .replace('Number of volunteer prepared returns Number of tax counseling for the elderly (TCE) prepared returns','(TCE) prepared returns Ct')
                             .replace('Number of volunteer prepared returns Number of volunteer prepared returns withEarned Income Credit','volunteer prepared w Earned Income Credit')
                         for col in incomeTaxData.columns.values]


incomeTaxData.columns = [col.strip() for col in incomeTaxData.columns.values]

### Remove Blank Rows, Non-Zip Code Summary Data, and Masking

In [103]:
#Delete rows with no zip code
incomeTaxData = incomeTaxData[pd.notnull(incomeTaxData['Zip Code'])]
#Delete state-wide totals 
incomeTaxData = incomeTaxData[incomeTaxData['Zip Code'] != 0]
#Delete any zip codes that the IRS obfuscates by placing into the "other" category 99999
incomeTaxData = incomeTaxData[incomeTaxData['Zip Code'] != 99999]
#Convert all masked zip code data ("**") to 0
incomeTaxData = incomeTaxData.replace({"**":0})

### Clean up Adjusted Gross Income Category Names Before Table Pivot

In [104]:
#Turn off the copy waring for the multiple updates we are about to perform
pd.options.mode.chained_assignment = None
#Shorten all "Size of adjusted gross income" field values, since  
#these will become part of the column names after the table pivot
incomeTaxData.rename(columns={'Size of adjusted gross income' : 'AGI'}, inplace=True)
incomeTaxData['AGI'].fillna('All', inplace=True)
incomeTaxData['AGI'][incomeTaxData['AGI'] == '$1 under $25,000'] = 'LT25K'
incomeTaxData['AGI'][incomeTaxData['AGI'] == '$25,000 under $50,000'] = '25KLT50K'
incomeTaxData['AGI'][incomeTaxData['AGI'] == '$50,000 under $75,000'] = '50KLT75K'
incomeTaxData['AGI'][incomeTaxData['AGI'] == '$75,000 under $100,000'] = '75KLT100K'
incomeTaxData['AGI'][incomeTaxData['AGI'] == '$100,000 under $200,000'] = '100KLT200K'
incomeTaxData['AGI'][incomeTaxData['AGI'] == '$200,000 or more'] = 'GE200K'

In [105]:
#Look at column names before pivot 
incomeTaxData.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5068 entries, 9 to 5799
Data columns (total 145 columns):
Zip Code                                                      float64
AGI                                                           object
Number of returns                                             float64
Number of single returns                                      float64
Number of joint returns                                       float64
Number of head of household returns                           float64
Number with paid preparer's signature                         float64
Number of exemptions                                          float64
Number of dependents                                          float64
Number of volunteer prepared returns Tot                      float64
(VITA) prepared returns Ct                                    float64
(TCE) prepared returns Ct                                     float64
volunteer prepared w Earned Income Credit            

In [106]:
#Look at the data before pivot
incomeTaxData 

Unnamed: 0,Zip Code,AGI,Number of returns,Number of single returns,Number of joint returns,Number of head of household returns,Number with paid preparer's signature,Number of exemptions,Number of dependents,Number of volunteer prepared returns Tot,...,Tot tax liability Ct,Tot tax liability Amt,Tot Add Medicare tax Ct,Tot Add Medicare tax Amt,Net investment income tax Ct,Net investment income tax Amt,Tax due at time of filing Ct,Tax due at time of filing Amt,Overpayments refunded Ct,Overpayments refunded Amt
9,27006.0,All,6780.0,2570.0,3620.0,460.0,3830.0,13510.0,3710.0,120.0,...,5560.0,89620.0,280.0,471.0,320.0,630.0,1720.0,10259.0,4560.0,12920.0
10,27006.0,LT25K,1850.0,1380.0,290.0,150.0,880.0,1980.0,380.0,60.0,...,850.0,681.0,0.0,0.0,0.0,0.0,200.0,136.0,1460.0,1977.0
11,27006.0,25KLT50K,1230.0,580.0,440.0,160.0,660.0,2190.0,540.0,60.0,...,1070.0,2819.0,0.0,0.0,0.0,0.0,220.0,327.0,950.0,2262.0
12,27006.0,50KLT75K,960.0,300.0,560.0,70.0,570.0,2060.0,550.0,0.0,...,920.0,5045.0,0.0,0.0,0.0,0.0,280.0,658.0,640.0,1735.0
13,27006.0,75KLT100K,800.0,140.0,610.0,40.0,490.0,1940.0,540.0,0.0,...,790.0,6509.0,0.0,0.0,0.0,0.0,240.0,687.0,520.0,1738.0
14,27006.0,100KLT200K,1360.0,120.0,1200.0,40.0,800.0,3740.0,1190.0,0.0,...,1350.0,23682.0,0.0,0.0,0.0,0.0,510.0,2197.0,770.0,3117.0
15,27006.0,GE200K,580.0,50.0,520.0,0.0,430.0,1600.0,510.0,0.0,...,580.0,50884.0,280.0,471.0,320.0,630.0,270.0,6254.0,220.0,2091.0
17,27007.0,All,890.0,300.0,490.0,80.0,570.0,1930.0,610.0,30.0,...,690.0,4062.0,0.0,0.0,0.0,0.0,150.0,424.0,710.0,1835.0
18,27007.0,LT25K,340.0,190.0,100.0,50.0,210.0,560.0,170.0,30.0,...,170.0,160.0,0.0,0.0,0.0,0.0,30.0,31.0,280.0,556.0
19,27007.0,25KLT50K,200.0,70.0,100.0,30.0,110.0,440.0,140.0,0.0,...,170.0,421.0,0.0,0.0,0.0,0.0,30.0,56.0,170.0,496.0


### Create Table Pivot Dataset and Adjust Pivot Column Names 
* We create a new view of the IRS data consolidating to one record per zip code. 
* New fields are created for each Adjusted Gross Income ("AGI") range.  
* This creates a total of 724 fields duplicating each original field in the dataset one time for each individual 
  adjusted gross income range.
* This data is saved as IncomeTaxDataByZipCode_2016.csv 

In [107]:
#Get a list of all the incomeTaxData columns we want to pivot
c= incomeTaxData.columns.values
valCols = c[(c != 'AGI') & (c != 'Zip Code')]           

#Pivot income tax data using these columns
incomeTaxData = pd.pivot_table(incomeTaxData, values=valCols,index=['Zip Code'],columns=['AGI'])

#Combine multiple index column names into single index column names
incomeTaxData.columns = [' '.join(col).strip() for col in incomeTaxData.columns.values]

#Make our index a column for merges later
incomeTaxData.reset_index(level=0, inplace=True)

#Inspect pivoted income tax field names
incomeTaxData.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724 entries, 0 to 723
Data columns (total 1002 columns):
Zip Code                                                                 float64
(TCE) prepared returns Ct 100KLT200K                                     float64
(TCE) prepared returns Ct 25KLT50K                                       float64
(TCE) prepared returns Ct 50KLT75K                                       float64
(TCE) prepared returns Ct 75KLT100K                                      float64
(TCE) prepared returns Ct All                                            float64
(TCE) prepared returns Ct GE200K                                         float64
(TCE) prepared returns Ct LT25K                                          float64
(VITA) prepared returns Ct 100KLT200K                                    float64
(VITA) prepared returns Ct 25KLT50K                                      float64
(VITA) prepared returns Ct 50KLT75K                                      float64
(V

## Summarize IRS Tax Data by School District 
**Income tax data counts and amounts are organized by Adjusted Gross Income Ranges within each Zip Code**
* **All** - Income tax data represents the entire zip code 
* **LT25K** - Income tax data represents adjusted gross income from \$1 under \$25,000 within a zip code.
* **25KLT50K** - Income tax data represents adjusted gross income >= \$25,000 and < \$50,000 within a zip code.
* **50KLT75K** - Income tax data represents adjusted gross income >= \$50,000 and < \$75,000 within a zip code.
* **75KLT100K** - Income tax data represents adjusted gross income >= \$75,000 and < \$100,000 within a zip code.
* **100KLT200K** - Income tax data represents adjusted gross income >= \$100,000 and < \$200,000 within a zip code.
* **GE200K** - Income tax data represents adjusted gross income >= \$200,000 within a zip code.

In [108]:
#Inspect pivoted income tax data
incomeTaxData

Unnamed: 0,Zip Code,(TCE) prepared returns Ct 100KLT200K,(TCE) prepared returns Ct 25KLT50K,(TCE) prepared returns Ct 50KLT75K,(TCE) prepared returns Ct 75KLT100K,(TCE) prepared returns Ct All,(TCE) prepared returns Ct GE200K,(TCE) prepared returns Ct LT25K,(VITA) prepared returns Ct 100KLT200K,(VITA) prepared returns Ct 25KLT50K,...,Unemployment compensation Ct All,Unemployment compensation Ct GE200K,Unemployment compensation Ct LT25K,volunteer prepared w Earned Income Credit 100KLT200K,volunteer prepared w Earned Income Credit 25KLT50K,volunteer prepared w Earned Income Credit 50KLT75K,volunteer prepared w Earned Income Credit 75KLT100K,volunteer prepared w Earned Income Credit All,volunteer prepared w Earned Income Credit GE200K,volunteer prepared w Earned Income Credit LT25K
0,27006.0,0.0,30.0,0.0,0.0,60.0,0.0,30.0,0.0,40.0,...,90.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,27007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,27009.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,27011.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,...,30.0,0.0,30.0,0.0,0.0,0.0,0.0,20.0,0.0,20.0
4,27012.0,0.0,40.0,0.0,0.0,90.0,0.0,50.0,0.0,80.0,...,190.0,0.0,40.0,0.0,0.0,0.0,0.0,30.0,0.0,30.0
5,27013.0,0.0,0.0,0.0,0.0,30.0,0.0,30.0,0.0,30.0,...,80.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,27016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,27017.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,...,60.0,0.0,20.0,0.0,0.0,0.0,0.0,40.0,0.0,40.0
8,27018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,...,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,27019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
#Save the pivot table raw data to disk
incomeTaxData.to_csv(dataDir + 'IncomeTaxDataByZipCode_' + str(taxYear) + '.csv', sep=',', index=False)