# Concatenate all individual CDR reports to CDR Reports All
#### There's also some cleaning of variables at the bottom of the code here that should be added to clean_cdr when folks think it looks ok

In [330]:
import json
import os
import re

import numpy as np
import pandas as pd
import pygsheets

In [331]:
# Raise the pandas display limit to 1000 rows for the listings printed below.
pd.options.display.max_rows = 1000

### Authorize pygsheets

In [332]:
# Location of the Google service-account key file. It is read from the
# PYGSHEETS_SERVICE_FILE environment variable so the notebook is not tied to
# one person's machine; the original path is kept as the fallback so an
# existing setup keeps working unchanged.
service_file = os.environ.get(
    'PYGSHEETS_SERVICE_FILE',
    '/Users/hlukas/Downloads/client_secret_service.json',
)
gc = pygsheets.authorize(service_file=service_file)

### List spreadsheets

In [333]:
# Switch the Drive client to the team drive before listing anything.
TEAM_DRIVE_ID = '0ACeQWapAwOLqUk9PVA'
gc.drive.enable_team_drive(TEAM_DRIVE_ID)

# Titles and ids come from two separate calls; later cells pair titles[i]
# with ids[i] by position.
titles, ids = gc.spreadsheet_titles(), gc.spreadsheet_ids()

### Pull Eva's current sheet for reference

In [334]:
# Id of the first spreadsheet titled 'CDR Reports All' (IndexError if none).
all_data_id = [sheet_id for sheet_id, title in zip(ids, titles) if title == 'CDR Reports All'][0]
output = gc.open_by_key(all_data_id)

# The reference data is read from the second worksheet (index 1); columns
# whose header is the empty string are dropped.
wk = output[1]
all_data = wk.get_as_df(include_tailing_empty=True).drop(columns=[''])

### Pull each individual raw data sheet and concatenate them
#### Note that we need to rename the unnamed time columns (report and death) that Google has separated out

In [335]:
def rename_split_time_columns(columns):
    """Return the column names with up to two blank headers given time names.

    If the first blank header ('') sits at position 4 it becomes
    'Report Time'. After that, the first blank header still remaining (if
    any) becomes 'Death Time'. Any further blank headers are left untouched.
    """
    renamed = list(columns)
    if '' in renamed and renamed.index('') == 4:
        renamed[4] = 'Report Time'
    # Explicit membership check instead of a bare `except: pass`, so that
    # unrelated errors are no longer silently swallowed.
    if '' in renamed:
        renamed[renamed.index('')] = 'Death Time'
    return renamed


# Enabling the team drive does not depend on the sheet being read, so it is
# done once up front rather than on every iteration.
gc.drive.enable_team_drive('0ACeQWapAwOLqUk9PVA')

# One DataFrame per raw sheet whose title starts with 'CDR_', keyed by sheet id.
dfs = {}
for key in [sheet_id for sheet_id, title in zip(ids, titles) if title.startswith('CDR_')]:
    output = gc.open_by_key(key)
    print(output.title)
    wk = output[0]
    df = wk.get_as_df(include_tailing_empty=True)

    df.columns = rename_split_time_columns(df.columns)
    dfs[key] = df

    # Sanity check: number of headers still blank, and whether all headers
    # are unique (expected output per sheet: "0 True").
    print(sum(col == '' for col in df.columns), len(df.columns) == len(set(df.columns)))
        

CDR_10-2018
0 True
CDR_6-2020
0 True
CDR_9-2018
0 True
CDR_4-2020
0 True
CDR_7-2018
0 True
CDR_11_2019
0 True
CDR_2-2020
0 True
CDR_6-2018
0 True
CDR_4-2018
0 True
CDR_8-2019
0 True
CDR_3_2020
0 True
CDR_4-2019
0 True
CDR_2-2019
0 True
CDR_12-2018
0 True
CDR_7-2019
0 True
CDR_5-2020
0 True
CDR_5-2018
0 True
CDR_10-2019
0 True
CDR_12_2019
0 True
CDR_5-2019
0 True
CDR_9_2019
0 True
CDR_1-2020
0 True
CDR_6-2019
0 True
CDR_3-2019
0 True


In [336]:
# Stack every raw sheet into one frame. `sort` is passed explicitly because
# its default changed between pandas versions (the recorded run emitted a
# FutureWarning about it); sort=True keeps the sorted column order that the
# recorded run produced.
total_df = pd.concat(dfs, ignore_index=True, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


### Check to see if the observation counts match
#### They don't, at least for now. I think some of the data is inaccessible, possibly because it was uploaded as Excel files.

In [337]:
# Compare the number of distinct rows in the concatenated raw data with the
# number of distinct rows in the reference sheet.
len(total_df.drop_duplicates()) == len(all_data.drop_duplicates())

False

### Make sure that the CDR ID value is consistent so we're not grabbing the bad obs

In [338]:
# A CDR name must start with <1-2 digits>-<1-4 digits>-<1-2 capital letters>.
# The pattern is a raw string so that `\d` and `\-` reach the regex engine
# without Python treating them as (invalid) string escapes.
# Note: str.match anchors only at the start, so trailing text is accepted.
cdr_name_pattern = r'\d{1,2}\-\d{1,4}\-[A-Z]{1,2}'

# na=False treats missing / non-string names as non-matching, so the mask is
# always strictly boolean and safe to use for indexing.
has_valid_cdr_name = total_df['CDR: CDR Name'].str.match(cdr_name_pattern, na=False)

# Keep only rows with a well-formed CDR name; .copy() makes the result an
# independent frame that later cells can modify.
cleaned_df = total_df.loc[has_valid_cdr_name].copy()

# Show what was rejected (report headers/footers and stray ids).
total_df.loc[~has_valid_cdr_name, 'CDR: CDR Name'].value_counts()


                                                                     8
Confidential Information - Do Not Distribute                         4
Texas Attorney General                                               4
CDRS: CURRENT FORM Reports (v05/16)                                  4
Copyright (c) 2000-2018 salesforce.com, inc. All rights reserved.    3
a2Ct0000001aclY                                                      1
Copyright (c) 2000-2019 salesforce.com, inc. All rights reserved.    1
Generated By: Renee Watts 10/9/2018 7:58 AM                          1
PA13680C                                                             1
Generated By: Renee Watts 11/6/2018 7:41 AM                          1
Generated By: Judy Brown 9/4/2019 4:03 PM                            1
Generated By: Renee Watts 7/10/2018 8:12 AM                          1
Name: CDR: CDR Name, dtype: int64

### Combine the Death Date and Time fields to be consistent with Eva

In [339]:
# Where a separate 'Death Time' value exists, append it to the date with a
# single space; otherwise keep the date value as it is.
# NOTE(review): only null 'Death Time' values are treated as missing. If
# empty cells arrive as '' instead, the result gets a trailing space; confirm.
death_time_missing = cleaned_df['Death Time'].isna()
date_plus_time = cleaned_df['Death Date and Time'] + ' ' + cleaned_df['Death Time']
cleaned_df['Death Date and Time'] = np.where(death_time_missing,
                                             cleaned_df['Death Date and Time'],
                                             date_plus_time)

# The separate time column is no longer needed once merged.
cleaned_df = cleaned_df.drop(columns=['Death Time'])
assert cleaned_df['Death Date and Time'].notna().all()

### Make sure our columns match the Eva original

In [340]:
# Both frames must expose exactly the same set of column names
# (order is not checked).
cols_all_data = set(all_data.columns)
cols_amal = set(cleaned_df.columns)

assert cols_amal == cols_all_data

### Upload to our test file

In [353]:
# Write cleaned_df to the first worksheet of the test spreadsheet, starting at
# cell A1, with fit=True passed to set_dataframe.
# NOTE(review): this cell's recorded execution count (353) is later than the
# cells that follow it, so the upload may have run after the cleaning cells
# below had already modified cleaned_df. Confirm which version was uploaded.
TEST_SHEET_KEY = '1cbGys017c0YsiOLnwnr_9EzY3Hg_qmZeGNFHgc6rogA'
sh = gc.open_by_key(TEST_SHEET_KEY)
wks = sh[0]
wks.set_dataframe(cleaned_df, start='A1', fit=True)

## Cleaning variables that aren't included in our output currently
### Clean up the decedent fields

In [341]:
# Frequency table for the yes/no weapon question and for its details field.
for weapon_col in ('Decedent display/use of weapons',
                   'Decedent Display or Use Weapon Details'):
    print(cleaned_df[weapon_col].value_counts())

No                          1934
Yes, mark all that apply     501
Unknown                      334
                               1
Name: Decedent display/use of weapons, dtype: int64
                                                                                             2266
Discharged firearm                                                                            188
Displayed firearm without discharge                                                           102
Displayed other weapon, specify:                                                               66
Used other weapon, specify:                                                                    47
Displayed firearm without discharge; Discharged firearm                                        35
Displayed other weapon, specify:; Used other weapon, specify:                                  24
Used vehicle as weapon                                                                         24
Displayed firearm without discha

#### Confirm that we see details filled in where they should be

In [342]:
# Split the details field by whether the weapon question was answered
# 'Yes, mark all that apply': first the rows that were not, then those that were.
answered_yes = cleaned_df['Decedent display/use of weapons'] == 'Yes, mark all that apply'
weapon_details = cleaned_df['Decedent Display or Use Weapon Details']

print(weapon_details[~answered_yes].value_counts())
print(weapon_details[answered_yes].value_counts())

                                       2266
Displayed other weapon, specify:          2
Displayed firearm without discharge       1
Name: Decedent Display or Use Weapon Details, dtype: int64
Discharged firearm                                                                           188
Displayed firearm without discharge                                                          101
Displayed other weapon, specify:                                                              64
Used other weapon, specify:                                                                   47
Displayed firearm without discharge; Discharged firearm                                       35
Displayed other weapon, specify:; Used other weapon, specify:                                 24
Used vehicle as weapon                                                                        24
Displayed firearm without discharge; Discharged firearm; Used vehicle as weapon                6
Displayed firearm without dischar

#### Somewhat arbitrarily decide to trust the display/use of weapons field, so coerce the values to match

In [343]:
# Rows where the weapon question is anything other than 'Yes, mark all that
# apply' get their details blanked out, so the two fields agree.
not_yes = cleaned_df['Decedent display/use of weapons'] != 'Yes, mark all that apply'
cleaned_df['Decedent Display or Use Weapon Details'] = np.where(
    not_yes,
    '',
    cleaned_df['Decedent Display or Use Weapon Details'],
)

# After the coercion every non-yes row should show only the empty string.
print(cleaned_df.loc[not_yes, 'Decedent Display or Use Weapon Details'].value_counts())

    2269
Name: Decedent Display or Use Weapon Details, dtype: int64


#### We can actually combine these decedent weapon fields, and include the details only where the answer is yes

In [344]:
# Single combined field: the details text where the answer was
# 'Yes, mark all that apply', otherwise the answer itself.
weapon_answer = cleaned_df['Decedent display/use of weapons']
cleaned_df['Decedent Display or Use Weapon?'] = np.where(
    weapon_answer == 'Yes, mark all that apply',
    cleaned_df['Decedent Display or Use Weapon Details'],
    weapon_answer,
)

print(cleaned_df['Decedent Display or Use Weapon?'].value_counts())

No                                                                                           1934
Unknown                                                                                       334
Discharged firearm                                                                            188
Displayed firearm without discharge                                                           101
Displayed other weapon, specify:                                                               64
Used other weapon, specify:                                                                    47
Displayed firearm without discharge; Discharged firearm                                        35
Displayed other weapon, specify:; Used other weapon, specify:                                  24
Used vehicle as weapon                                                                         24
Displayed firearm without discharge; Discharged firearm; Used vehicle as weapon                 6
Displayed firearm wi

#### We can use the 2005 field "Use weapon threaten/assault officer(s)" as a proxy for this field in the 2005 data

In [345]:
# Look up 'CDR Reports All' again (first title match, IndexError if none) and
# read its first worksheet (index 0), keeping every column, into all_data_2005.
all_data_id = [found_id for found_id, found_title in zip(ids, titles)
               if found_title == 'CDR Reports All'][0]
output = gc.open_by_key(all_data_id)
wk = output[0]
all_data_2005 = wk.get_as_df(include_tailing_empty=True)

In [346]:
# Distribution of the raw 2005 weapon flag (a mix of 0/1 and Yes/No codes).
weapon_flag_2005 = all_data_2005['Use weapon threaten/assault officer(s)']
weapon_flag_2005.value_counts()

0      4523
1       624
Yes       7
No        2
Name: Use weapon threaten/assault officer(s), dtype: int64

In [347]:
# Recode the 2005 flag onto the Yes/No vocabulary of the newer data.
# 0 / 'No' -> 'No' and 1 / 'Yes' -> 'Yes'. Anything else (blank or unexpected
# code) becomes 'Unknown' instead of silently counting as 'Yes'. For the
# values recorded above (0, 1, 'Yes', 'No') the result is unchanged.
weapon_flag_2005 = all_data_2005['Use weapon threaten/assault officer(s)']
all_data_2005['Decedent Display or Use Weapon?'] = np.where(
    weapon_flag_2005.isin([0, 'No']),
    'No',
    np.where(weapon_flag_2005.isin([1, 'Yes']), 'Yes', 'Unknown'),
)
all_data_2005['Decedent Display or Use Weapon?'].value_counts()

No     4525
Yes     631
Name: Decedent Display or Use Weapon?, dtype: int64

#### Correspondingly, we don't need to specify any weapons yet

In [348]:
# The 2005 data carries no weapon detail, so the column is created empty.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
all_data_2005['Specify Weapon Used'] = np.nan