In [37]:
# import
import pandas as pd
import numpy as np
# added dependencies
import matplotlib.pyplot as plt
import os
from scipy.stats import linregress
import datetime

In [38]:
# Load the raw hate-crime records from the project's data folder.
hate_crimeDF = pd.read_csv(filepath_or_buffer='assets/data/hate_crime.csv')
# Preview the first five rows of the loaded frame.
hate_crimeDF.head(n=5)

Unnamed: 0,INCIDENT_ID,DATA_YEAR,ORI,PUB_AGENCY_NAME,PUB_AGENCY_UNIT,AGENCY_TYPE_NAME,STATE_ABBR,STATE_NAME,DIVISION_NAME,REGION_NAME,...,OFFENDER_RACE,OFFENDER_ETHNICITY,VICTIM_COUNT,OFFENSE_NAME,TOTAL_INDIVIDUAL_VICTIMS,LOCATION_NAME,BIAS_DESC,VICTIM_TYPES,MULTIPLE_OFFENSE,MULTIPLE_BIAS
0,3015,1991,AR0040200,Rogers,,City,AR,Arkansas,West South Central,South,...,White,,1,Intimidation,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-Black or African American,Individual,S,S
1,3016,1991,AR0290100,Hope,,City,AR,Arkansas,West South Central,South,...,Black or African American,,1,Simple Assault,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,S,S
2,43,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
3,44,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
4,3017,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,,1,Aggravated Assault,1.0,Service/Gas Station,Anti-White,Individual,S,S


In [39]:
# Dropping the columns that are not needed for this analysis
# ('BIAS_DESC' is deliberately not listed, so it stays in the dataframe).
to_drop = [
    'ORI',
    'PUB_AGENCY_UNIT',
    'DIVISION_NAME',
    'OFFENDER_RACE',
    'OFFENDER_ETHNICITY',
    'VICTIM_COUNT',
    'OFFENSE_NAME',
    'LOCATION_NAME',
    'VICTIM_TYPES',
    'MULTIPLE_OFFENSE',
    'MULTIPLE_BIAS',
    'POPULATION_GROUP_CODE',
    'POPULATION_GROUP_DESC',
    'ADULT_VICTIM_COUNT',
    'JUVENILE_VICTIM_COUNT',
    'TOTAL_OFFENDER_COUNT',
    'ADULT_OFFENDER_COUNT',
    'JUVENILE_OFFENDER_COUNT',
]
# Rebind the name to the trimmed frame; a missing column still raises KeyError.
hate_crimeDF = hate_crimeDF.drop(columns=to_drop)
hate_crimeDF.head()

Unnamed: 0,INCIDENT_ID,DATA_YEAR,PUB_AGENCY_NAME,AGENCY_TYPE_NAME,STATE_ABBR,STATE_NAME,REGION_NAME,INCIDENT_DATE,TOTAL_INDIVIDUAL_VICTIMS,BIAS_DESC
0,3015,1991,Rogers,City,AR,Arkansas,South,31-AUG-91,1.0,Anti-Black or African American
1,3016,1991,Hope,City,AR,Arkansas,South,19-SEP-91,1.0,Anti-White
2,43,1991,Pine Bluff,City,AR,Arkansas,South,04-JUL-91,1.0,Anti-Black or African American
3,44,1991,Pine Bluff,City,AR,Arkansas,South,24-DEC-91,1.0,Anti-White
4,3017,1991,Pine Bluff,City,AR,Arkansas,South,23-DEC-91,1.0,Anti-White


In [40]:
# Checking the data types of all columns
##hate_crimeDF.dtypes

In [41]:
# Checking whether any column contains null values
##hate_crimeDF.isna().sum()

In [42]:
# Remove the rows that have no value in 'TOTAL_INDIVIDUAL_VICTIMS'.
# `how` and `thresh` are mutually exclusive in DataFrame.dropna, and recent
# pandas versions reject a call that passes both explicitly (even with
# thresh=None). The defaults (axis=0, how='any') already express the intent,
# so only the column to inspect is named here.
hate_crimeDF = hate_crimeDF.dropna(subset=['TOTAL_INDIVIDUAL_VICTIMS'])
# Show the remaining null count per column to confirm the clean-up.
hate_crimeDF.isna().sum()

INCIDENT_ID                 0
DATA_YEAR                   0
PUB_AGENCY_NAME             0
AGENCY_TYPE_NAME            0
STATE_ABBR                  0
STATE_NAME                  0
REGION_NAME                 0
INCIDENT_DATE               0
TOTAL_INDIVIDUAL_VICTIMS    0
BIAS_DESC                   0
dtype: int64

In [43]:
# Preview the first five rows of the current dataframe.
hate_crimeDF.head(n=5)

Unnamed: 0,INCIDENT_ID,DATA_YEAR,PUB_AGENCY_NAME,AGENCY_TYPE_NAME,STATE_ABBR,STATE_NAME,REGION_NAME,INCIDENT_DATE,TOTAL_INDIVIDUAL_VICTIMS,BIAS_DESC
0,3015,1991,Rogers,City,AR,Arkansas,South,31-AUG-91,1.0,Anti-Black or African American
1,3016,1991,Hope,City,AR,Arkansas,South,19-SEP-91,1.0,Anti-White
2,43,1991,Pine Bluff,City,AR,Arkansas,South,04-JUL-91,1.0,Anti-Black or African American
3,44,1991,Pine Bluff,City,AR,Arkansas,South,24-DEC-91,1.0,Anti-White
4,3017,1991,Pine Bluff,City,AR,Arkansas,South,23-DEC-91,1.0,Anti-White


In [44]:
# Convert every column heading to lower case.
hate_crimeDF = hate_crimeDF.rename(columns=str.lower)
hate_crimeDF.head()

Unnamed: 0,incident_id,data_year,pub_agency_name,agency_type_name,state_abbr,state_name,region_name,incident_date,total_individual_victims,bias_desc
0,3015,1991,Rogers,City,AR,Arkansas,South,31-AUG-91,1.0,Anti-Black or African American
1,3016,1991,Hope,City,AR,Arkansas,South,19-SEP-91,1.0,Anti-White
2,43,1991,Pine Bluff,City,AR,Arkansas,South,04-JUL-91,1.0,Anti-Black or African American
3,44,1991,Pine Bluff,City,AR,Arkansas,South,24-DEC-91,1.0,Anti-White
4,3017,1991,Pine Bluff,City,AR,Arkansas,South,23-DEC-91,1.0,Anti-White


In [45]:
# Keep only the years under consideration in our study (1993 onward).
hate_crimeDF = hate_crimeDF.loc[hate_crimeDF['data_year'].ge(1993)]
print(hate_crimeDF.head())
print(hate_crimeDF.shape)

       incident_id  data_year pub_agency_name agency_type_name state_abbr  \
11256        15761       1993       Anchorage             City         AK   
11257        15762       1993       Anchorage             City         AK   
11258        15763       1993       Anchorage             City         AK   
11259        15764       1993       Anchorage             City         AK   
11260        15765       1993       Anchorage             City         AK   

      state_name region_name incident_date  total_individual_victims  \
11256     Alaska        West     14-FEB-93                       2.0   
11257     Alaska        West     24-MAR-93                       2.0   
11258     Alaska        West     02-JUL-93                       1.0   
11259     Alaska        West     07-JUL-93                       1.0   
11260     Alaska        West     12-JUL-93                       1.0   

                            bias_desc  
11256                 Anti-Gay (Male)  
11257  Anti-Black or Afr

In [46]:
# Write the cleaned dataframe to CSV, leaving out the row index.
hate_crimeDF.to_csv(path_or_buf='assets/data/cleaned_hate_crime.csv', index=False)

In [47]:
# Build the dataframe behind the bar graph: for each year, the number of
# rows with a non-null 'bias_desc' value.
bar_graph_df = (
    hate_crimeDF
    .groupby('data_year')['bias_desc']
    .count()
    .reset_index(name='num_of_crimes')
)
bar_graph_df

Unnamed: 0,data_year,num_of_crimes
0,1993,7608
1,1994,5954
2,1995,7950
3,1996,8790
4,1997,8107
5,1998,7902
6,1999,7943
7,2000,8219
8,2001,9730
9,2002,7485


In [48]:
# Write the per-year counts to CSV, leaving out the row index.
bar_graph_df.to_csv(path_or_buf='assets/data/dataFrame_Bar.csv', index=False)