In [1]:
import pandas as pd

# Preprocessing the census data for the TJI's Officer-involved Shootings (OIS) in Texas Report

# [1. Census](https://demographics.texas.gov/Data/TPEPP/Estimates/)
Our report makes comparions between the OIS incident datasets and census data. The census data comes from [Texas Demographic Center](https://demographics.texas.gov/). 

>The Texas Demographic Center functions as a focal point for the production, interpretation, and distribution of demographic information for Texas. The Texas Demographic Center produces and disseminates population estimates and projections for Texas, as well as other demographic information. Special emphasis is placed on data that may be useful to policy makers in dealing with issues regarding the demand for State services.

## 1.1 Total Population By County
Source: https://demographics.texas.gov/Resources/TPEPP/Estimates/2018/2018_txpopest_county.csv

Even though the website has annual estimates based on their own methodology, we decided to use the actual census count from 2010 (the column, `cqr_census_2010_count`).

In [2]:
df_census = pd.read_csv('../Data/Raw/Census/2018_txpopest_county.csv')

In [3]:
df_census.head()

Unnamed: 0,FIPS,county,cqr_census_2010_count,july1_2018_pop_est,jan1_2019_pop_est,num_chg_10_18,num_chg_10_19,pct_chg_10_18,pct_chg_10_19
0,1,Anderson,58458,58979,58854,521,396,0.9,0.7
1,3,Andrews,14786,18678,19232,3892,4446,26.3,30.1
2,5,Angelina,86771,91687,92353,4916,5582,5.7,6.4
3,7,Aransas,23158,23724,23031,566,-127,2.4,-0.5
4,9,Archer,9054,9459,9625,405,571,4.5,6.3


In [4]:
df_census.index = df_census['county'].str.upper()
df_pop_county = df_census['cqr_census_2010_count']

In [5]:
df_pop_county.shape[0]

255

In [6]:
df_pop_county.head()

county
ANDERSON    58458
ANDREWS     14786
ANGELINA    86771
ARANSAS     23158
ARCHER       9054
Name: cqr_census_2010_count, dtype: int64

In [7]:
df_pop_county.to_pickle('../Data/Interim/census_county_2010.pkl')

## Age, Sex, and Race/Ethnicity for State and Counties
Source: https://demographics.texas.gov/Resources/TPEPP/Estimates/2018/2018_ASRE_Estimate_alldata_csv.zip

After unzipping, you will find `alldata.csv`.

In [8]:
df_census_detail = pd.read_csv('../Data/Raw/Census/alldata.csv')

In [9]:
df_census_detail.head()

Unnamed: 0,County,FIPS,Age,Total,Total_Male,Total_Female,NH_White_Total,NH_White_Male,NH_White_Female,NH_Black_Total,...,NH_Black_Female,NH_Asian_Total,NH_Asian_Male,NH_Asian_Female,NH_Other_Total,NH_Other_Male,NH_Other_Female,Hispanic_Total,Hispanic_Male,Hispanic_Female
0,STATE OF TEXAS,0,All Ages,28702243,14254981,14447262,12008303,5953354,6054949,3394972,...,1752264,1381899,677090,704809,629148,312300,316848,11287921,5669529,5618392
1,STATE OF TEXAS,0,< 1 Year,389277,198709,190568,114969,58905,56064,46083,...,22639,16580,8469,8111,17102,8778,8324,194543,99113,95430
2,STATE OF TEXAS,0,1 Years,395402,201725,193677,117408,60141,57267,46604,...,22959,16511,8468,8043,17255,8835,8420,197624,100636,96988
3,STATE OF TEXAS,0,2 Years,407861,208076,199785,122987,62939,60048,47786,...,23539,17479,8955,8524,17292,8840,8452,202317,103095,99222
4,STATE OF TEXAS,0,3 Years,417497,213195,204302,127819,65480,62339,48419,...,23920,17931,9220,8711,17230,8795,8435,206098,105201,100897


In [10]:
df_census_detail.columns

Index(['County', 'FIPS', 'Age', 'Total', 'Total_Male', 'Total_Female',
       'NH_White_Total', 'NH_White_Male', 'NH_White_Female', 'NH_Black_Total',
       'NH_Black_Male', 'NH_Black_Female', 'NH_Asian_Total', 'NH_Asian_Male',
       'NH_Asian_Female', 'NH_Other_Total', 'NH_Other_Male', 'NH_Other_Female',
       'Hispanic_Total', 'Hispanic_Male', 'Hispanic_Female'],
      dtype='object')

In [11]:
# remove rows with state of texas
df_census_detail_county = df_census_detail.loc[df_census_detail['County'] != 'STATE OF TEXAS', :]

# keep the 'All Ages' row only
df_census_detail_county = df_census_detail_county.loc[df_census_detail['Age'] == 'All Ages', :]

# rename the county column and make it as index
df_census_detail_county.loc[:, 'County'] = df_census_detail_county['County'].str.replace(' COUNTY', '').values
df_census_detail_county.index = df_census_detail_county['County']
df_census_detail_county.drop('County', axis=1, inplace=True)

# regroup race as white, black, hispanic and others (dropping gender since female is very small proportion)
df_census_race_county = df_census_detail_county[['NH_White_Total', 'NH_Black_Total', 'Hispanic_Total', 'NH_Asian_Total', 'NH_Other_Total']]
df_census_race_county.loc[:, 'OTHER'] = df_census_race_county[['NH_Asian_Total', 'NH_Other_Total']].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [12]:
df_census_race_county = df_census_race_county.drop(['NH_Asian_Total', 'NH_Other_Total'], axis=1)
df_census_race_county.columns = ['WHITE', 'BLACK', 'HISPANIC', 'OTHER']

In [13]:
df_census_race_county.head()

Unnamed: 0_level_0,WHITE,BLACK,HISPANIC,OTHER
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANDERSON,34383,12472,10550,1574
ANDREWS,6794,206,11371,307
ANGELINA,55069,13751,20476,2391
ARANSAS,15716,244,6896,868
ARCHER,8238,40,933,248


In [14]:
df_census_race_county.to_pickle('../Data/Interim/census_county_race_2010.pkl')

## Age

In [15]:
df_census_detail_age = df_census_detail.loc[df_census_detail['County'] != 'STATE OF TEXAS', :]
df_census_detail_age.loc[:, 'County'] = df_census_detail_age['County'].str.replace(' COUNTY', '').values
df_census_detail_age.index = df_census_detail_age['County']
df_census_detail_age.drop('County', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [16]:
df_census_detail_age = df_census_detail_age.loc[df_census_detail_age['Age'] != 'All Ages', :]
df_census_detail_age['Age'] = df_census_detail_age['Age'].str.replace('< 1 Year', '0 Years').str.replace('+','').str.replace('Years','').str.strip().astype(int)

In [17]:
df_census_detail_age = df_census_detail_age[['Age', 'NH_White_Total', 'NH_Black_Total', 'Hispanic_Total', 'NH_Asian_Total', 'NH_Other_Total']]
df_census_detail_age.loc[:, 'OTHER'] = df_census_detail_age[['NH_Asian_Total', 'NH_Other_Total']].sum(axis=1)

In [18]:
df_census_detail_age = df_census_detail_age.drop(['NH_Asian_Total', 'NH_Other_Total'], axis=1)
df_census_detail_age.columns = ['AGE', 'WHITE', 'BLACK', 'HISPANIC', 'OTHER']

In [19]:
df_census_detail_age.head()

Unnamed: 0_level_0,AGE,WHITE,BLACK,HISPANIC,OTHER
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ANDERSON,0,279,120,144,62
ANDERSON,1,280,117,137,70
ANDERSON,2,281,117,137,54
ANDERSON,3,284,113,133,56
ANDERSON,4,285,110,143,46


In [20]:
df_census_detail_age.to_pickle('../Data/Interim/census_county_race_age_2010.pkl')