# Live Coding Exercise: Data Manipulation Using pandas

Created: 03/22/23 by Tom Lever

Updated: 03/22/23 by Tom Lever

In [1]:
import numpy as np
import os
import pandas as pd
# Changing to '.' leaves the working directory where it already is.
# NOTE(review): presumably a placeholder for pointing at the data folder - confirm.
os.chdir('.')

In [3]:
# Load the CSV from the working directory into the `courts` DataFrame.
courts = pd.read_csv('data100k.csv')
# Preview the first record transposed, so each column shows up as its own row.
courts.head(n=1).transpose()

Unnamed: 0,0
person_id,102090000000110
HearingDate,2019-02-28
CodeSection,A.46.2-862
codesection,covered elsewhere
ChargeType,Misdemeanor
chargetype,Misdemeanor
Class,1
DispositionCode,Guilty
disposition,Conviction
Plea,


In [4]:
# Count rows per Race label, most frequent label first.
courts['Race'].value_counts(sort=True, ascending=False)

White Caucasian(Non-Hispanic)                  114421
Black(Non-Hispanic)                             80173
White Caucasian (Non-Hispanic)                  41679
Black (Non-Hispanic)                            33254
Hispanic                                         9319
White                                            3527
Other(Includes Not Applicable.. Unknown)         3452
Asian Or Pacific Islander                        2787
Black                                            2200
MISSING                                          1022
Unknown (Includes Not Applicable.. Unknown)       785
Other (Includes Not Applicable.. Unknown)         615
American Indian                                   302
Unknown                                            54
Asian or Pacific Islander                           7
American Indian Or Alaskan Native                   1
Name: Race, dtype: int64

In [5]:
# List the distinct Race labels in order of first appearance.
pd.unique(courts['Race'])

array(['Black(Non-Hispanic)', 'Hispanic', 'White Caucasian(Non-Hispanic)',
       'MISSING', 'Asian Or Pacific Islander', 'Black (Non-Hispanic)',
       'White Caucasian (Non-Hispanic)',
       'Other(Includes Not Applicable.. Unknown)',
       'Other (Includes Not Applicable.. Unknown)', 'Black', 'White',
       'Unknown (Includes Not Applicable.. Unknown)', 'American Indian',
       'Unknown', 'Asian or Pacific Islander',
       'American Indian Or Alaskan Native'], dtype=object)

In [6]:
# Lookup from every Race spelling seen in the data to one of six categories.
# Entries are grouped by target category; identity entries are kept so the
# mapping spells out every label it covers.
race_map = {
    # White
    'White Caucasian(Non-Hispanic)': 'White',
    'White Caucasian (Non-Hispanic)': 'White',
    'White': 'White',
    # Black
    'Black(Non-Hispanic)': 'Black',
    'Black (Non-Hispanic)': 'Black',
    'Black': 'Black',
    # Hispanic
    'Hispanic': 'Hispanic',
    # Asian or Pacific Islander
    'Asian Or Pacific Islander': 'Asian or Pacific Islander',
    'Asian or Pacific Islander': 'Asian or Pacific Islander',
    # American Indian or Alaskan Native
    'American Indian': 'American Indian or Alaskan Native',
    'American Indian Or Alaskan Native': 'American Indian or Alaskan Native',
    # Other (also absorbs the unknown and missing labels)
    'MISSING': 'Other',
    'Other(Includes Not Applicable.. Unknown)': 'Other',
    'Other (Includes Not Applicable.. Unknown)': 'Other',
    'Unknown (Includes Not Applicable.. Unknown)': 'Other',
    'Unknown': 'Other',
}
# Overwrite the column with the recoded values; labels that are not keys of
# race_map pass through replace() unchanged.
courts['Race'] = courts['Race'].replace(race_map)

In [7]:
# Tally the rows in each Race category, largest category first.
courts['Race'].value_counts(sort=True, ascending=False)

White                                159627
Black                                115627
Hispanic                               9319
Other                                  5928
Asian or Pacific Islander              2794
American Indian or Alaskan Native       303
Name: Race, dtype: int64

In [16]:
# One row per (CodeSection, Race) pair with the number of records in that pair.
# Naming the size column while resetting the index avoids a separate rename of
# the default column label 0.
race_codesection = (
    courts
    .groupby(['CodeSection', 'Race'])
    .size()
    .reset_index(name='Count')
)
race_codesection

Unnamed: 0,CodeSection,Race,Count
0,(74-4) 26-123,Black,1
1,01-2007,White,1
2,1,Black,5
3,1,White,3
4,1-12,Black,62
...,...,...,...
6635,Z.18.2-91,White,166
6636,Z.18.2-91; 26,Black,1
6637,Z.18.2-92,Black,1
6638,Z.18.2-95,Black,2


In [19]:
# Reshape the long (CodeSection, Race, Count) table to wide form: one row per
# CodeSection, one column per Race, each cell holding that pair's Count.
# Pairs that never occur are filled with 0.
# aggfunc is set to 'sum' explicitly: pivot_table defaults to 'mean', which
# only matches the count when a (CodeSection, Race) pair occurs exactly once.
race_codesection_pivot = race_codesection.pivot_table(
    index='CodeSection',
    columns='Race',
    values='Count',
    aggfunc='sum',
    fill_value=0,
)
race_codesection_pivot

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
(74-4) 26-123,0,0,1,0,0,0
01-2007,0,0,0,0,0,1
1,0,0,5,0,0,3
1-12,0,0,62,0,0,13
1-200,0,0,26,0,1,17
...,...,...,...,...,...,...
Z.18.2-91,0,3,131,2,0,166
Z.18.2-91; 26,0,0,1,0,0,0
Z.18.2-92,0,0,1,0,0,0
Z.18.2-95,0,0,2,0,0,0


In [20]:
# Add a `total` column: the row-wise sum of the six Race count columns.
# The columns are listed explicitly so re-running the cell never folds an
# existing `total` into the sum. skipna=False keeps the NaN propagation of
# plain `+` addition.
race_codesection_pivot = race_codesection_pivot.assign(
    total=race_codesection_pivot[
        [
            'American Indian or Alaskan Native',
            'Asian or Pacific Islander',
            'Black',
            'Hispanic',
            'Other',
            'White',
        ]
    ].sum(axis=1, skipna=False)
)
race_codesection_pivot

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
(74-4) 26-123,0,0,1,0,0,0,1
01-2007,0,0,0,0,0,1,1
1,0,0,5,0,0,3,8
1-12,0,0,62,0,0,13,75
1-200,0,0,26,0,1,17,44
...,...,...,...,...,...,...,...
Z.18.2-91,0,3,131,2,0,166,302
Z.18.2-91; 26,0,0,1,0,0,0,1
Z.18.2-92,0,0,1,0,0,0,1
Z.18.2-95,0,0,2,0,0,0,2


In [23]:
# Add a `disparity` column: (Black count - White count) / total count.
# It is +1 when every record for the CodeSection is Black and -1 when every
# record is White.
race_codesection_pivot = race_codesection_pivot.assign(
    disparity=lambda table: (table['Black'] - table['White']) / table['total']
)
race_codesection_pivot

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total,disparity
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
(74-4) 26-123,0,0,1,0,0,0,1,1.000000
01-2007,0,0,0,0,0,1,1,-1.000000
1,0,0,5,0,0,3,8,0.250000
1-12,0,0,62,0,0,13,75,0.653333
1-200,0,0,26,0,1,17,44,0.204545
...,...,...,...,...,...,...,...,...
Z.18.2-91,0,3,131,2,0,166,302,-0.115894
Z.18.2-91; 26,0,0,1,0,0,0,1,1.000000
Z.18.2-92,0,0,1,0,0,0,1,1.000000
Z.18.2-95,0,0,2,0,0,0,2,1.000000


In [26]:
# Keep only the CodeSections with at least 200 records in total.
race_codesection_pivot = race_codesection_pivot[
    race_codesection_pivot['total'] >= 200
]
race_codesection_pivot

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total,disparity
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16.1-253.2,0,0,56,1,0,152,209,-0.459330
17-7,0,10,43,0,0,151,204,-0.529412
18.2-102,0,1,129,2,1,127,260,0.007692
18.2-103,1,26,1295,31,17,2164,3534,-0.245897
18.2-104,0,2,266,1,7,349,625,-0.132800
...,...,...,...,...,...,...,...,...
C.46.2-894,2,4,91,2,6,216,321,-0.389408
C.46.2-896,0,5,99,3,4,273,384,-0.453125
MISSING,0,0,129,5,8,104,246,0.101626
NO DMV,0,4,175,16,6,202,403,-0.066998


In [27]:
# Show the table ordered from the highest disparity value to the lowest.
race_codesection_pivot.sort_values('disparity', ascending=False)

Race,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Other,White,total,disparity
CodeSection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
29-48,0,1,228,0,1,30,260,0.761538
18.2-53.1,0,5,1411,16,5,309,1746,0.631157
46.2-938,0,1,319,7,2,69,398,0.628141
18.2-58,0,6,1142,18,10,400,1576,0.470812
18.2-32,0,1,197,3,1,74,276,0.445652
...,...,...,...,...,...,...,...,...
4.1-305,3,26,444,41,29,2245,2788,-0.645983
54.1-3466,0,4,100,2,2,709,817,-0.745410
18.2-374.1:1,0,0,60,1,25,583,669,-0.781764
18.2-258.1,0,1,83,2,13,767,866,-0.789838
