# Live Coding Exercise: Data Manipulation Using pandas

Created: 03/22/23 by Tom Lever

Updated: 03/22/23 by Tom Lever

In [1]:
import numpy as np
import os
import pandas as pd
# '.' is the current working directory, so this call leaves it unchanged; the
# relative paths used below resolve against whatever directory is already current.
os.chdir('.')

In [2]:
# Load the court data and display its first record with one field per row.
courts = pd.read_csv('data100k.csv')
courts.iloc[:1].T

Unnamed: 0,0
person_id,102090000000110
HearingDate,2019-02-28
CodeSection,A.46.2-862
codesection,covered elsewhere
ChargeType,Misdemeanor
chargetype,Misdemeanor
Class,1
DispositionCode,Guilty
disposition,Conviction
Plea,


In [3]:
# Tally how often each raw Race label occurs (most frequent first).
courts['Race'].value_counts()

White Caucasian(Non-Hispanic)                  114421
Black(Non-Hispanic)                             80173
White Caucasian (Non-Hispanic)                  41679
Black (Non-Hispanic)                            33254
Hispanic                                         9319
White                                            3527
Other(Includes Not Applicable.. Unknown)         3452
Asian Or Pacific Islander                        2787
Black                                            2200
MISSING                                          1022
Unknown (Includes Not Applicable.. Unknown)       785
Other (Includes Not Applicable.. Unknown)         615
American Indian                                   302
Unknown                                            54
Asian or Pacific Islander                           7
American Indian Or Alaskan Native                   1
Name: Race, dtype: int64

In [4]:
# List the distinct raw Race labels so that each one can be given a harmonised category.
courts['Race'].unique()

array(['Black(Non-Hispanic)', 'Hispanic', 'White Caucasian(Non-Hispanic)',
       'MISSING', 'Asian Or Pacific Islander', 'Black (Non-Hispanic)',
       'White Caucasian (Non-Hispanic)',
       'Other(Includes Not Applicable.. Unknown)',
       'Other (Includes Not Applicable.. Unknown)', 'Black', 'White',
       'Unknown (Includes Not Applicable.. Unknown)', 'American Indian',
       'Unknown', 'Asian or Pacific Islander',
       'American Indian Or Alaskan Native'], dtype=object)

In [5]:
# Raw Race labels grouped by the harmonised category they collapse into.
# Spelling/spacing variants of the same label are listed side by side.
race_groups = {
    'Black': [
        'Black(Non-Hispanic)',
        'Black (Non-Hispanic)',
        'Black'
    ],
    'White': [
        'White Caucasian(Non-Hispanic)',
        'White Caucasian (Non-Hispanic)',
        'White'
    ],
    'Hispanic': [
        'Hispanic'
    ],
    'Asian or Pacific Islander': [
        'Asian Or Pacific Islander',
        'Asian or Pacific Islander'
    ],
    'American Indian or Alaskan Native': [
        'American Indian',
        'American Indian Or Alaskan Native'
    ],
    'Other': [
        'MISSING',
        'Other(Includes Not Applicable.. Unknown)',
        'Other (Includes Not Applicable.. Unknown)',
        'Unknown (Includes Not Applicable.. Unknown)',
        'Unknown'
    ]
}

# Invert the grouping into the raw-label -> category lookup.
race_map = {
    raw_label: category
    for category, raw_labels in race_groups.items()
    for raw_label in raw_labels
}

# Labels that are not keys of race_map are left as they are by replace().
courts['Race'] = courts['Race'].replace(race_map)

In [6]:
# Re-tally after the relabelling to confirm that only the harmonised categories remain.
courts['Race'].value_counts()

White                                159627
Black                                115627
Hispanic                               9319
Other                                  5928
Asian or Pacific Islander              2794
American Indian or Alaskan Native       303
Name: Race, dtype: int64

In [7]:
# Count the cases for every (CodeSection, Race) pair; `name` labels the
# size column 'Count' directly instead of renaming the default 0 afterwards.
race_codesection = (
    courts
    .groupby(['CodeSection', 'Race'])
    .size()
    .reset_index(name = 'Count')
)
race_codesection

Unnamed: 0,CodeSection,Race,Count
0,(74-4) 26-123,Black,1
1,01-2007,White,1
2,1,Black,5
3,1,White,3
4,1-12,Black,62
...,...,...,...
6635,Z.18.2-91,White,166
6636,Z.18.2-91; 26,Black,1
6637,Z.18.2-92,Black,1
6638,Z.18.2-95,Black,2


In [8]:
# Reshape to one row per code section and one column per race. Each
# (CodeSection, Race) pair comes from a single group above, so the default
# aggregation has only one value to work with; pairs that never occur become 0.
race_codesection_pivot = race_codesection.pivot_table(
    values = 'Count',
    index = 'CodeSection',
    columns = 'Race',
    fill_value = 0
)
race_codesection_pivot

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
(74-4) 26-123,0,0,1,0,0,0
01-2007,0,0,0,0,0,1
1,0,0,5,0,0,3
1-12,0,0,62,0,0,13
1-200,0,0,26,0,1,17
...,...,...,...,...,...,...
Z.18.2-91,0,3,131,2,0,166
Z.18.2-91; 26,0,0,1,0,0,0
Z.18.2-92,0,0,1,0,0,0
Z.18.2-95,0,0,2,0,0,0


In [9]:
# Row total = cases summed over the six harmonised race categories.
# The columns are listed explicitly so that re-running the cell never folds
# 'total' (or any later derived column) into the sum; skipna = False makes a
# missing count propagate exactly as it would with chained `+`.
race_columns = [
    'American Indian or Alaskan Native',
    'Asian or Pacific Islander',
    'Black',
    'Hispanic',
    'Other',
    'White'
]
race_codesection_pivot = race_codesection_pivot.assign(
    total = race_codesection_pivot[race_columns].sum(axis = 1, skipna = False)
)
race_codesection_pivot

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
(74-4) 26-123,0,0,1,0,0,0,1
01-2007,0,0,0,0,0,1,1
1,0,0,5,0,0,3,8
1-12,0,0,62,0,0,13,75
1-200,0,0,26,0,1,17,44
...,...,...,...,...,...,...,...
Z.18.2-91,0,3,131,2,0,166,302
Z.18.2-91; 26,0,0,1,0,0,0,1
Z.18.2-92,0,0,1,0,0,0,1
Z.18.2-95,0,0,2,0,0,0,2


In [10]:
# disparity = (Black - White) / total: the gap between the Black and White
# shares of a code section's cases. It is +1 when every case is Black and
# -1 when every case is White.
black_minus_white = race_codesection_pivot['Black'] - race_codesection_pivot['White']
race_codesection_pivot = race_codesection_pivot.assign(
    disparity = black_minus_white / race_codesection_pivot['total']
)
race_codesection_pivot

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total,disparity
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
(74-4) 26-123,0,0,1,0,0,0,1,1.000000
01-2007,0,0,0,0,0,1,1,-1.000000
1,0,0,5,0,0,3,8,0.250000
1-12,0,0,62,0,0,13,75,0.653333
1-200,0,0,26,0,1,17,44,0.204545
...,...,...,...,...,...,...,...,...
Z.18.2-91,0,3,131,2,0,166,302,-0.115894
Z.18.2-91; 26,0,0,1,0,0,0,1,1.000000
Z.18.2-92,0,0,1,0,0,0,1,1.000000
Z.18.2-95,0,0,2,0,0,0,2,1.000000


In [11]:
# Keep only the code sections with at least 200 cases in total.
has_enough_cases = race_codesection_pivot['total'] >= 200
race_codesection_pivot = race_codesection_pivot[has_enough_cases]
race_codesection_pivot

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total,disparity
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16.1-253.2,0,0,56,1,0,152,209,-0.459330
17-7,0,10,43,0,0,151,204,-0.529412
18.2-102,0,1,129,2,1,127,260,0.007692
18.2-103,1,26,1295,31,17,2164,3534,-0.245897
18.2-104,0,2,266,1,7,349,625,-0.132800
...,...,...,...,...,...,...,...,...
C.46.2-894,2,4,91,2,6,216,321,-0.389408
C.46.2-896,0,5,99,3,4,273,384,-0.453125
MISSING,0,0,129,5,8,104,246,0.101626
NO DMV,0,4,175,16,6,202,403,-0.066998


In [12]:
# Rank the retained code sections from the highest disparity (most Black-skewed)
# to the lowest (most White-skewed). The sorted frame is displayed, not stored.
race_codesection_pivot.sort_values(by = 'disparity', ascending = False)

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total,disparity
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
29-48,0,1,228,0,1,30,260,0.761538
18.2-53.1,0,5,1411,16,5,309,1746,0.631157
46.2-938,0,1,319,7,2,69,398,0.628141
18.2-58,0,6,1142,18,10,400,1576,0.470812
18.2-32,0,1,197,3,1,74,276,0.445652
...,...,...,...,...,...,...,...,...
4.1-305,3,26,444,41,29,2245,2788,-0.645983
54.1-3466,0,4,100,2,2,709,817,-0.745410
18.2-374.1:1,0,0,60,1,25,583,669,-0.781764
18.2-258.1,0,1,83,2,13,767,866,-0.789838


# Census Race Data

In [13]:
race_url = 'https://demographics.coopercenter.org/sites/demographics/files/media/files/2020-07/Census_2019_RaceEstimates_forVA_0.xls'
hispanic_url = 'https://demographics.coopercenter.org/sites/demographics/files/media/files/2020-07/Census_2019_HispanicEstimates_forVA_0.xls'

# Auto-generated spreadsheet column label -> name used in this notebook.
race_share_names = {
    'Unnamed: 4':'White_pop',
    'Unnamed: 6':'Black_pop',
    'Unnamed: 8':'Asian_pop',
    'Unnamed: 10':'Other_pop',
    'Unnamed: 12':'Twoormore_pop'
}
race_columns_kept = ['FIPS', 'Jurisdiction', 'Total Population', *race_share_names]

# Rows 0-3 and 5-7 of the sheet are skipped, which leaves row 4 as the header.
# Then keep the identifying columns plus the five unnamed ones and rename the latter.
census_race = (
    pd.read_excel(race_url, skiprows = [0, 1, 2, 3, 5, 6, 7])
    .loc[:, race_columns_kept]
    .rename(columns = race_share_names)
)
census_race

Unnamed: 0,FIPS,Jurisdiction,Total Population,White_pop,Black_pop,Asian_pop,Other_pop,Twoormore_pop
0,1,Accomack County,32316,0.677652,0.287907,0.007953,0.009067,0.017422
1,3,Albemarle County,109330,0.817598,0.096954,0.055346,0.004418,0.025684
2,5,Alleghany County,14860,0.927524,0.046972,0.003096,0.003769,0.018641
3,7,Amelia County,13145,0.764549,0.204488,0.006086,0.006466,0.018410
4,9,Amherst County,31605,0.768834,0.191141,0.005695,0.009650,0.024680
...,...,...,...,...,...,...,...,...
128,800,Suffolk city,92108,0.521051,0.426087,0.019043,0.005157,0.028662
129,810,Virginia Beach city,449974,0.673777,0.202592,0.073873,0.006460,0.043298
130,820,Waynesboro city,22630,0.807114,0.134335,0.016880,0.006584,0.035086
131,830,Williamsburg city,14954,0.736458,0.159890,0.060452,0.006955,0.036244


In [14]:
# Auto-generated spreadsheet column label -> name used in this notebook.
hisp_names = {
    'Unnamed: 0':'FIPS',
    'Unnamed: 7':'Hisp_pop'
}

# Skip the first 9 rows of the sheet, keep the two columns of interest and rename them.
census_hisp = (
    pd.read_excel(hispanic_url, skiprows = 9)
    .loc[:, list(hisp_names)]
    .rename(columns = hisp_names)
)
census_hisp

Unnamed: 0,FIPS,Hisp_pop
0,1,0.091441
1,3,0.057743
2,5,0.016016
3,7,0.031799
4,9,0.024268
...,...,...
128,800,0.046684
129,810,0.084972
130,820,0.086876
131,830,0.071486


### First, a full join with diagnostics to see if the merge is okay

In [15]:
# Diagnostic full join: 'matched' records whether each FIPS code was found in
# both tables or in only one, and validate raises if FIPS repeats on either side.
census = census_race.merge(
    census_hisp,
    on = 'FIPS',
    how = 'outer',
    indicator = 'matched',
    validate = 'one_to_one'
)
census['matched'].value_counts()

both          133
left_only       0
right_only      0
Name: matched, dtype: int64

### Second, the merge we actually want

In [16]:
# Final census table: race and Hispanic figures side by side, restricted to
# FIPS codes present in both sources (no indicator column this time).
census = census_race.merge(census_hisp, how = 'inner', on = 'FIPS')
census

Unnamed: 0,FIPS,Jurisdiction,Total Population,White_pop,Black_pop,Asian_pop,Other_pop,Twoormore_pop,Hisp_pop
0,1,Accomack County,32316,0.677652,0.287907,0.007953,0.009067,0.017422,0.091441
1,3,Albemarle County,109330,0.817598,0.096954,0.055346,0.004418,0.025684,0.057743
2,5,Alleghany County,14860,0.927524,0.046972,0.003096,0.003769,0.018641,0.016016
3,7,Amelia County,13145,0.764549,0.204488,0.006086,0.006466,0.018410,0.031799
4,9,Amherst County,31605,0.768834,0.191141,0.005695,0.009650,0.024680,0.024268
...,...,...,...,...,...,...,...,...,...
128,800,Suffolk city,92108,0.521051,0.426087,0.019043,0.005157,0.028662,0.046684
129,810,Virginia Beach city,449974,0.673777,0.202592,0.073873,0.006460,0.043298,0.084972
130,820,Waynesboro city,22630,0.807114,0.134335,0.016880,0.006584,0.035086,0.086876
131,830,Williamsburg city,14954,0.736458,0.159890,0.060452,0.006955,0.036244,0.071486


# Coming back to the court data: we need data that counts the number of cases in each code section/race/FIPS combination

In [17]:
# Count the cases for every (CodeSection, Race, fips) triple; `name` labels the
# size column 'count' directly instead of renaming the default 0 afterwards.
courts_fips = (
    courts
    .groupby(['CodeSection', 'Race', 'fips'])
    .size()
    .reset_index(name = 'count')
)
courts_fips

Unnamed: 0,CodeSection,Race,fips,count
0,(74-4) 26-123,Black,550,1
1,01-2007,White,51,1
2,1,Black,550,5
3,1,White,550,2
4,1,White,810,1
...,...,...,...,...
37580,Z.18.2-91; 26,Black,700,1
37581,Z.18.2-92,Black,760,1
37582,Z.18.2-95,Black,67,1
37583,Z.18.2-95,Black,83,1


In [18]:
# Reshape to one row per (CodeSection, fips) and one column per race. Each
# (CodeSection, Race, fips) triple comes from a single group above, so the
# default aggregation has only one value to work with; absent triples become 0.
courts_fips_pivot = courts_fips.pivot_table(
    values = 'count',
    index = ['CodeSection', 'fips'],
    columns = 'Race',
    fill_value = 0
)
courts_fips_pivot

Unnamed: 0_level_0,Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White
CodeSection,fips,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
(74-4) 26-123,550,0,0,1,0,0,0
01-2007,51,0,0,0,0,0,1
1,550,0,0,5,0,0,2
1,810,0,0,0,0,0,1
1-12,650,0,0,62,0,0,13
...,...,...,...,...,...,...,...
Z.18.2-91; 26,700,0,0,1,0,0,0
Z.18.2-92,760,0,0,1,0,0,0
Z.18.2-95,67,0,0,1,0,0,0
Z.18.2-95,83,0,0,1,0,0,0


In [19]:
# Row total = cases summed over the six harmonised race categories.
# The columns are listed explicitly so that re-running the cell never folds
# 'total' (or any later derived column) into the sum; skipna = False makes a
# missing count propagate exactly as it would with chained `+`.
race_columns = [
    'American Indian or Alaskan Native',
    'Asian or Pacific Islander',
    'Black',
    'Hispanic',
    'Other',
    'White'
]
courts_fips_pivot = courts_fips_pivot.assign(
    total = courts_fips_pivot[race_columns].sum(axis = 1, skipna = False)
)
courts_fips_pivot

Unnamed: 0_level_0,Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total
CodeSection,fips,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
(74-4) 26-123,550,0,0,1,0,0,0,1
01-2007,51,0,0,0,0,0,1,1
1,550,0,0,5,0,0,2,7
1,810,0,0,0,0,0,1,1
1-12,650,0,0,62,0,0,13,75
...,...,...,...,...,...,...,...,...
Z.18.2-91; 26,700,0,0,1,0,0,0,1
Z.18.2-92,760,0,0,1,0,0,0,1
Z.18.2-95,67,0,0,1,0,0,0,1
Z.18.2-95,83,0,0,1,0,0,0,1


In [20]:
# New column -> race count it is derived from. The mapping order fixes the
# order in which the columns are appended. Despite the '_percent' suffix the
# values are fractions of the row total (0 to 1), not numbers out of 100.
share_sources = {
    'amerind_percent':'American Indian or Alaskan Native',
    'asian_percent':'Asian or Pacific Islander',
    'black_percent':'Black',
    'white_percent':'White',
    'other_percent':'Other',
    'hisp_percent':'Hispanic'
}
race_shares = {
    share_name: courts_fips_pivot[race] / courts_fips_pivot['total']
    for share_name, race in share_sources.items()
}

# Append the shares, then turn the (CodeSection, fips) index back into ordinary
# columns so that 'fips' can serve as a merge key.
courts_fips_pivot = courts_fips_pivot.assign(**race_shares).reset_index()
courts_fips_pivot

Race,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total,amerind_percent,asian_percent,black_percent,white_percent,other_percent,hisp_percent
0,(74-4) 26-123,550,0,0,1,0,0,0,1,0.0,0.0,1.000000,0.000000,0.0,0.0
1,01-2007,51,0,0,0,0,0,1,1,0.0,0.0,0.000000,1.000000,0.0,0.0
2,1,550,0,0,5,0,0,2,7,0.0,0.0,0.714286,0.285714,0.0,0.0
3,1,810,0,0,0,0,0,1,1,0.0,0.0,0.000000,1.000000,0.0,0.0
4,1-12,650,0,0,62,0,0,13,75,0.0,0.0,0.826667,0.173333,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25072,Z.18.2-91; 26,700,0,0,1,0,0,0,1,0.0,0.0,1.000000,0.000000,0.0,0.0
25073,Z.18.2-92,760,0,0,1,0,0,0,1,0.0,0.0,1.000000,0.000000,0.0,0.0
25074,Z.18.2-95,67,0,0,1,0,0,0,1,0.0,0.0,1.000000,0.000000,0.0,0.0
25075,Z.18.2-95,83,0,0,1,0,0,0,1,0.0,0.0,1.000000,0.000000,0.0,0.0


## Joining court fips data to census data

In [21]:
# Diagnostic full join of court rows to census rows: 'matched' records which
# side(s) each row came from, and validate raises if a FIPS code repeats in census.
courts_census = courts_fips_pivot.merge(
    census,
    left_on = 'fips',
    right_on = 'FIPS',
    how = 'outer',
    indicator = 'matched',
    validate = 'many_to_one'
)
courts_census['matched'].value_counts()

both          23513
left_only      1564
right_only        8
Name: matched, dtype: int64

In [22]:
# Split off the rows that failed to match: court rows with no census
# counterpart (left_only) and census rows with no court cases (right_only).
postmerge_left = courts_census[courts_census['matched'] == 'left_only']
postmerge_right = courts_census[courts_census['matched'] == 'right_only']

# Inspect one unmatched court row, one field per row.
postmerge_left.head(1).T

Unnamed: 0,4247
CodeSection,1-200
fips,701.0
American Indian or Alaskan Native,0.0
Asian or Pacific Islander,0.0
Black,1.0
Hispanic,0.0
Other,0.0
White,0.0
total,1.0
amerind_percent,0.0


In [23]:
# Show up to eight unmatched census rows, one field per row; the court-side
# fields are empty for them because no court row shares their FIPS code.
postmerge_right.head(8).T

Unnamed: 0,25077,25078,25079,25080,25081,25082,25083,25084
CodeSection,,,,,,,,
fips,,,,,,,,
American Indian or Alaskan Native,,,,,,,,
Asian or Pacific Islander,,,,,,,,
Black,,,,,,,,
Hispanic,,,,,,,,
Other,,,,,,,,
White,,,,,,,,
total,,,,,,,,
amerind_percent,,,,,,,,


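# Let's pretend that these diagnostics were all okay and keep only the matched rows (the inner join below drops the 1,564 court rows and 8 census rows that had no match)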

In [27]:
# Working table: court rows joined to their jurisdiction's census figures.
# An inner join keeps only rows whose 'fips' has a matching census 'FIPS'.
courts_census = courts_fips_pivot.merge(
    census,
    how = 'inner',
    left_on = 'fips',
    right_on = 'FIPS'
)
courts_census

Unnamed: 0,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total,amerind_percent,...,hisp_percent,FIPS,Jurisdiction,Total Population,White_pop,Black_pop,Asian_pop,Other_pop,Twoormore_pop,Hisp_pop
0,(74-4) 26-123,550,0,0,1,0,0,0,1,0.0,...,0.0,550,Chesapeake city,244835,0.616039,0.306125,0.034970,0.006225,0.036641,0.066326
1,1,550,0,0,5,0,0,2,7,0.0,...,0.0,550,Chesapeake city,244835,0.616039,0.306125,0.034970,0.006225,0.036641,0.066326
2,1-200,550,0,0,3,0,0,0,3,0.0,...,0.0,550,Chesapeake city,244835,0.616039,0.306125,0.034970,0.006225,0.036641,0.066326
3,10-10,550,0,0,1,0,0,0,1,0.0,...,0.0,550,Chesapeake city,244835,0.616039,0.306125,0.034970,0.006225,0.036641,0.066326
4,10-10 A(7),550,0,0,0,0,0,2,2,0.0,...,0.0,550,Chesapeake city,244835,0.616039,0.306125,0.034970,0.006225,0.036641,0.066326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23508,A.46.2-853,91,0,0,0,0,0,1,1,0.0,...,0.0,91,Highland County,2190,0.976256,0.010959,0.006393,0.003196,0.003196,0.010502
23509,A.46.2-862,91,0,0,0,0,1,4,5,0.0,...,0.0,91,Highland County,2190,0.976256,0.010959,0.006393,0.003196,0.003196,0.010502
23510,B.46.2-301,91,0,0,1,0,0,0,1,0.0,...,0.0,91,Highland County,2190,0.976256,0.010959,0.006393,0.003196,0.003196,0.010502
23511,B.46.2-853,91,0,0,0,1,0,0,1,0.0,...,1.0,91,Highland County,2190,0.976256,0.010959,0.006393,0.003196,0.003196,0.010502


In [28]:
# black_disparity = Black share of the row's cases divided by Black_pop.
# NOTE(review): Black_pop appears to be the Black share of the jurisdiction's
# population (values between 0 and 1 in the displayed census table) - confirm.
# No minimum case count is applied here, so rows with very few cases can
# produce very large ratios.
courts_census = courts_census.assign(
    black_disparity = courts_census['black_percent'] / courts_census['Black_pop']
)
report_columns = ['FIPS', 'Jurisdiction', 'CodeSection', 'black_disparity']
courts_census.sort_values('black_disparity', ascending = False)[report_columns]

Unnamed: 0,FIPS,Jurisdiction,CodeSection,black_disparity
21567,45,Craig County,18.2-286,223.086957
21601,45,Craig County,4VAC15-90-231,223.086957
21587,45,Craig County,29.1-520,223.086957
21590,45,Craig County,29.1-530.1,223.086957
21591,45,Craig County,29.1-548,223.086957
...,...,...,...,...
9068,165,Rockingham County,46.2-335C,0.000000
3124,185,Tazewell County,18.2-132,0.000000
1706,11,Appomattox County,18.2-67.1,0.000000
1705,11,Appomattox County,18.2-61,0.000000
