In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from numpy import timedelta64

In [None]:
# Load the zip-code reference table and rename its columns so that it shares
# the 'Zip_Code_Crime' join key with the crime data.
zip_column_names = {'Zip Code': 'Zip_Code_Crime', 'People / Sq. Mile': 'Pop_Density'}
dflocation = pd.read_csv('AustinZipCodes.csv').rename(columns=zip_column_names)

# Attach the zip-code attributes to every crime record (merge on the shared key,
# using pandas' default join type).
dfcrime = pd.read_csv('crime-housing-austin-2015.csv').merge(dflocation, on='Zip_Code_Crime')

### Analysis-1: Correlation between Crime Count, Unemployment, Population Below Poverty, Income, Population, House Price of Zip Codes

In [None]:
# Keep only the zip code and the socio-economic columns used in the correlation analysis.
analysis_columns = [
    'Zip_Code_Crime', 'Unemployment', 'Populationbelowpovertylevel', 'Medianhouseholdincome',
    'Populationwithdisability', 'Medianhomevalue', 'Pop_Density', 'Population',
]
dfcrime_analysis = dfcrime[analysis_columns].copy()

# Formatting character that has to be stripped from each text column before it
# can be cast to float.
symbol_to_strip = {
    'Unemployment': '%',
    'Populationbelowpovertylevel': '%',
    'Medianhouseholdincome': '$',
    'Populationwithdisability': '%',
    'Medianhomevalue': '$',
    'Pop_Density': ',',
    'Population': ',',
}
for column, symbol in symbol_to_strip.items():
    # regex=False forces a literal match: as a regular expression '$' is the
    # end-of-string anchor, and the default of `regex` differs between pandas versions.
    dfcrime_analysis[column] = (
        dfcrime_analysis[column].str.replace(symbol, '', regex=False).astype('float')
    )
dfcrime_analysis.head(2)

In [None]:
# Per-zip-code summary: number of crime records plus the mean of each indicator.

mean_columns = ['Unemployment', 'Populationbelowpovertylevel', 'Medianhouseholdincome',
                'Populationwithdisability', 'Medianhomevalue', 'Pop_Density', 'Population']
agg_spec = {'Zip_Code_Crime': ['count']}
agg_spec.update({column: ['mean'] for column in mean_columns})

zip_summary = dfcrime_analysis.groupby('Zip_Code_Crime').agg(agg_spec).reset_index()
# Flatten the two-level column index produced by the list-valued aggregation spec.
zip_summary.columns = ['Zip', 'Crimes_Reported_Count', 'UnEmp_Mean', 'PopPov_Mean', 'Income_Mean', 'PopDisable_Mean','HouseVal_Mean', 'Pop_Density', 'Population' ]

# Order by crime count and discard zip codes with any missing value.
dfcrime_analysis_means = zip_summary.sort_values(['Crimes_Reported_Count']).dropna()
dfcrime_analysis_means.head(2)


### Plot for Crime Count and Unemployment Percentage of Zip Codes

In [None]:
# Scatter plot with fitted regression line: crime count (x) vs. mean unemployment (y) per zip code.
fig, ax = plt.subplots(figsize=(6, 7))
sns.regplot(x='Crimes_Reported_Count', y='UnEmp_Mean', data=dfcrime_analysis_means, ax=ax)
ax.tick_params(labelsize=8)
ax.set_title('Crimes Count and Unemployment Percentage of Zip Codes', fontsize=15)
ax.set_xlabel('Number of Crimes Reported', fontsize=12)
ax.set_ylabel('Unemployment Percentage', fontsize=12)
fig.savefig("CrimeCount-Unemployment.pdf", format="pdf", bbox_inches="tight")
plt.show()

### Pearson Correlation (r) and P Value

In [None]:
# Pearson correlation between per-zip crime count and mean unemployment.
crime_counts = dfcrime_analysis_means['Crimes_Reported_Count']
unemployment = dfcrime_analysis_means['UnEmp_Mean']
r, p = stats.pearsonr(crime_counts, unemployment)
print('r =', r, 'p =', p)

### Plot for Unemployment and Population Below Poverty of Zip Codes

In [None]:
# Scatter plot with fitted regression line: mean unemployment (x) vs. mean share of
# population below the poverty level (y) per zip code. The figure is also saved as a PDF.
fig, ax = plt.subplots(figsize=(6, 7))
sns.regplot(data=dfcrime_analysis_means, x='UnEmp_Mean', y='PopPov_Mean', ax=ax)
ax.tick_params(labelsize=8)
# Title and y-label previously read "Populaton"; the spelling is corrected so the
# exported figure is publication-ready.
ax.set_title('Unemployment and Population Below Poverty', fontsize=15)
ax.set_xlabel('Unemployment Percentage', fontsize=12)
ax.set_ylabel('Population Below Poverty', fontsize=12)
fig.savefig("Unemployment-Poverty.pdf", format="pdf", bbox_inches="tight")
plt.show()

### Pearson Correlation (r) and P Value

In [None]:
# Pearson correlation between mean poverty share and mean unemployment per zip code.
poverty = dfcrime_analysis_means['PopPov_Mean']
unemployment = dfcrime_analysis_means['UnEmp_Mean']
r, p = stats.pearsonr(poverty, unemployment)
print('r =', r, 'p =', p)

### Plot for Number of Crimes and Population of Zip Codes

In [None]:
# Scatter plot with fitted regression line: crime count (x) vs. population (y) per zip code.
fig, ax = plt.subplots(figsize=(6, 7))
sns.regplot(x='Crimes_Reported_Count', y='Population', data=dfcrime_analysis_means, ax=ax)
ax.tick_params(labelsize=8)
ax.set_title('Number of Crimes and Population', fontsize=15)
ax.set_xlabel('Number of Crimes Reported', fontsize=12)
ax.set_ylabel('Population', fontsize=12)
fig.savefig("CrimeCount-Population.pdf", format="pdf", bbox_inches="tight")
plt.show()

### Pearson Correlation (r) and P Value

In [None]:
# Pearson correlation between per-zip crime count and population.
crime_counts = dfcrime_analysis_means['Crimes_Reported_Count']
population = dfcrime_analysis_means['Population']
r, p = stats.pearsonr(crime_counts, population)
print('r =', r, 'p =', p)

### Plot for Number of Crimes and Population Below Poverty Line

In [None]:
# Scatter plot with fitted regression line: crime count (x) vs. mean poverty share (y) per zip code.
fig, ax = plt.subplots(figsize=(6, 7))
sns.regplot(x='Crimes_Reported_Count', y='PopPov_Mean', data=dfcrime_analysis_means, ax=ax)
ax.tick_params(labelsize=8)
ax.set_title('Number of Crimes and Population Below Poverty Line', fontsize=15)
ax.set_xlabel('Number of Crimes Reported', fontsize=12)
ax.set_ylabel('Population Below Poverty Line', fontsize=12)
fig.savefig("CrimeCount-Poverty.pdf", format="pdf", bbox_inches="tight")
plt.show()

### Pearson Correlation (r) and P Value

In [None]:
# Pearson correlation between per-zip crime count and mean poverty share.
crime_counts = dfcrime_analysis_means['Crimes_Reported_Count']
poverty = dfcrime_analysis_means['PopPov_Mean']
r, p = stats.pearsonr(crime_counts, poverty)
print('r =', r, 'p =', p)

### Plot for House Value and Unemployment Percentage

In [None]:
# Scatter plot with fitted regression line: mean home value (x) vs. mean unemployment (y) per zip code.
fig, ax = plt.subplots(figsize=(6, 7))
sns.regplot(x='HouseVal_Mean', y='UnEmp_Mean', data=dfcrime_analysis_means, ax=ax)
ax.tick_params(labelsize=8)
ax.set_title('House Value and Unemployment Percentage', fontsize=15)
ax.set_xlabel('House Value', fontsize=12)
ax.set_ylabel('Unemployment Percentage', fontsize=12)
fig.savefig("HouseValue-Unemployment.pdf", format="pdf", bbox_inches="tight")
plt.show()

### Pearson Correlation (r) and P Value

In [None]:
# Pearson correlation between mean home value and mean unemployment per zip code.
house_value = dfcrime_analysis_means['HouseVal_Mean']
unemployment = dfcrime_analysis_means['UnEmp_Mean']
r, p = stats.pearsonr(house_value, unemployment)
print('r =', r, 'p =', p)

### Analysis-2: Types of Crimes and their Frequency with Crime Status

In [None]:
# Count crime records per NIBRS/UCR offense description and keep the five most frequent.
offense_col = 'Highest_NIBRS_UCR_Offense_Description'
df_a_4 = dfcrime[[offense_col]]
offense_counts = df_a_4.groupby(offense_col).agg({offense_col: ['count']}).reset_index()
offense_counts.columns = ['Crime_Type', 'Count' ]
offense_counts = offense_counts.dropna()
# head() with no argument keeps the first five rows after the descending sort.
df_a_4_g = offense_counts.sort_values('Count', ascending=False).head()
df_a_4_g.head(2)

In [None]:
# Count crime records for every (offense description, clearance status) pair,
# ignoring records where either field is missing.
status_cols = ['Highest_NIBRS_UCR_Offense_Description', 'Clearance_Status']
df_a_5 = dfcrime[status_cols].dropna()
status_counts = df_a_5.groupby(status_cols).agg({'Clearance_Status': ['count']}).reset_index()
status_counts.columns = ['Type','Status', 'Count' ]
df_a_5_g = status_counts.sort_values('Count', ascending=False)
df_a_5_g.head()

### Plot of Crime Types by Clearance Status: Cleared (C), Cleared by Exception (O), Not Cleared (N)

In [None]:
# Grouped bar chart: number of crimes per offense type, split by clearance status.
# sns.catplot is a figure-level function that creates its own figure (sized by
# `height`/`aspect`), so no plt.figure() call precedes it -- one would only leave
# an extra, empty figure behind for plt.show() to render.
p = sns.catplot(x='Type', y='Count', hue='Status', data=df_a_5_g, kind = 'bar', height = 10, aspect = 1 )
# The pyplot calls below act on the figure/axes that catplot just created.
plt.title('Types of Crimes and their status i.e., Cleared (C) , Cleared by Exception (O) , Not Cleared (N)', fontsize = 15)
plt.xlabel('Crime Types', fontsize = 12)
plt.ylabel('Number of Crimes', fontsize = 12)
plt.savefig("CrimeandStatus.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Average monthly transportation cost associated with each offense type.
# Rows with a missing offense or cost are removed up front: DataFrame.dropna()
# returns a new frame, so its result has to be kept -- otherwise NaN-cost rows
# are skipped by the sum but still counted, biasing the average downwards.
crimeType = dfcrime[["Highest_Offense_Desc", "Averagemonthlytransportationcost"]].dropna().copy()

# Strip the dollar sign (raw string: "\$" in a normal string is an invalid escape) and cast to float.
crimeType['Averagemonthlytransportationcost'] = (
    crimeType['Averagemonthlytransportationcost'].replace(r"\$", '', regex=True).astype('float')
)

# One row per crime record, so summing a column of ones yields the record count per type.
crimeType['count'] = 1
crimeType = crimeType.groupby('Highest_Offense_Desc').agg({'Averagemonthlytransportationcost' :'sum','count':'sum'}).reset_index()
crimeType['avgtranscostpercrimetype'] = crimeType['Averagemonthlytransportationcost'] / crimeType['count']

# Keep only offense types with more than 250 records; .copy() gives an
# independent frame so later column assignments do not hit a slice of the old one.
crimeType = crimeType[crimeType['count'] > 250].copy()


In [None]:
# Min-max scale the per-type average transportation cost and plot it as a bar chart.
avg_cost = crimeType['avgtranscostpercrimetype']
lowest_cost = avg_cost.min()
highest_cost = avg_cost.max()
crimeType['normtranscost'] = (avg_cost - lowest_cost) / (highest_cost - lowest_cost)

ax1 = sns.barplot(x='Highest_Offense_Desc', y='normtranscost', data=crimeType)
# Slant the offense names and right-align them.
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=40, ha="right")
plt.savefig("Transportation-CrimeCount.pdf", format="pdf", bbox_inches="tight")
crimeType

### Analysis-3: Top 5 Zip Codes with highest number of Crimes of each type

In [None]:
# Count crime records for every (zip code, offense description) pair.
offense_col = 'Highest_NIBRS_UCR_Offense_Description'
df_a_6 = dfcrime[['Zip_Code_Crime', offense_col]]
zip_offense_counts = df_a_6.groupby(['Zip_Code_Crime', offense_col]).agg({offense_col: ['count']}).reset_index()
zip_offense_counts.columns = ['Zip','Crime_Type', 'Count' ]
zip_offense_counts = zip_offense_counts.dropna().sort_values('Count', ascending=False)

# Restrict to five hard-coded zip codes.
# NOTE(review): presumably the five zip codes with the most crimes, per the
# section heading -- confirm against the data.
selected_zips = [78753.0, 78704.0, 78741.0, 78758.0, 78701.0]
df_a_6_g = zip_offense_counts[zip_offense_counts['Zip'].isin(selected_zips)]
df_a_6_g.head(2)

In [None]:
# Grouped bar chart: number of crimes per zip code, split by offense type.
# sns.catplot is a figure-level function that creates its own figure (sized by
# `height`/`aspect`), so no plt.figure() call precedes it -- one would only leave
# an extra, empty figure behind for plt.show() to render.
p = sns.catplot(x='Zip', y='Count', hue='Crime_Type', data=df_a_6_g, kind = 'bar', height = 10, aspect = 1 )
# The pyplot calls below act on the figure/axes that catplot just created.
plt.title('Top 5 Zip Codes with Number of Crime and Types', fontsize = 15)
plt.xlabel('Zip Codes', fontsize = 12)
plt.ylabel('Number of Crimes', fontsize = 12)
plt.savefig("Top5ZipwithCrimCount.pdf", format="pdf", bbox_inches="tight")
plt.show()

### Analysis-4: Number of Days taken for Crimes to get cleared based on Zip Code

In [None]:
# Columns needed to measure how long crimes take to be cleared.
clearance_cols = ['Zip_Code_Crime', 'Highest_NIBRS_UCR_Offense_Description',
                  'Report_Date', 'Clearance_Date', 'Clearance_Status']
df_4 = dfcrime[clearance_cols]

# Keep only records whose clearance status is 'C', with a fresh 0..n-1 index.
is_cleared = df_4['Clearance_Status'] == 'C'
df_4a = df_4[is_cleared].reset_index(drop=True)
df_4a.head(3)

In [None]:
# Parse both date columns, then express the report-to-clearance gap in days (float).
reported_on = pd.to_datetime(df_4a['Report_Date'])
cleared_on = pd.to_datetime(df_4a['Clearance_Date'])
df_4a['Report_Date'] = reported_on
df_4a['Clearance_Date'] = cleared_on
# Dividing a timedelta by a one-day timedelta yields the duration as a number of days.
df_4a['Crime_Cleared_Days'] = (cleared_on - reported_on) / timedelta64(1, 'D')
df_4a.head(3)

In [None]:
# Mean number of days to clearance for each offense type, slowest first.
days_by_type = (
    df_4a.groupby(['Highest_NIBRS_UCR_Offense_Description'])['Crime_Cleared_Days']
    .mean()
    .reset_index()
)
days_by_type.columns = ['Crime_Type', 'Avg_Cleared_Days' ]
dff = days_by_type.dropna().sort_values('Avg_Cleared_Days', ascending=False)
dff

In [None]:
# Mean number of days to clearance for each (zip code, offense type) pair, slowest first.
days_by_zip_type = (
    df_4a.groupby(['Zip_Code_Crime', 'Highest_NIBRS_UCR_Offense_Description'])['Crime_Cleared_Days']
    .mean()
    .reset_index()
)
days_by_zip_type.columns = ['Zip','Crime_Type', 'Avg_Cleared_Days' ]
dfff = days_by_zip_type.dropna().sort_values('Avg_Cleared_Days', ascending=False)
dfff.head()

In [None]:
# Kernel density estimate of days-to-clearance for two zip codes; each curve is
# normalised on its own (common_norm=False).
in_compared_zips = df_4a['Zip_Code_Crime'].isin([78759.0, 78746.0])
sns.displot(
    data=df_4a[in_compared_zips],
    x='Crime_Cleared_Days',
    hue='Zip_Code_Crime',
    kind='kde',
    common_norm=False,
)
plt.savefig("CrimeCleared-ZipCode.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Mean, standard deviation and two-sample t-test of days-to-clearance for the
# two zip codes compared in this analysis.

# The printed labels are built from the same values used for filtering, so the
# label can never disagree with the data it describes.
zip_a, zip_b = 78759.0, 78746.0
days_a = df_4a.loc[df_4a.Zip_Code_Crime == zip_a, 'Crime_Cleared_Days']
days_b = df_4a.loc[df_4a.Zip_Code_Crime == zip_b, 'Crime_Cleared_Days']

display(f'{zip_a} mean: {days_a.mean()}')
display(f'{zip_b} mean: {days_b.mean()}')
display(f'{zip_a} sd: {days_a.std()}')
display(f'{zip_b} sd: {days_b.std()}')
# mean()/std() skip missing values; drop them here as well so the t-test uses
# the same observations instead of propagating NaN into the result.
display(stats.ttest_ind(days_a.dropna(), days_b.dropna()))


### Analysis-5:  Crime Count in Each District

In [None]:
# Count crime records for every (district, offense description) pair, largest first.
offense_col = 'Highest_NIBRS_UCR_Offense_Description'
df_5 = dfcrime[['District', offense_col, 'Report_Date', 'Clearance_Date', 'Clearance_Status']]
district_counts = df_5.groupby(['District', offense_col]).agg({offense_col: ['count']}).reset_index()
district_counts.columns = ['District','Crime_Type', 'Count' ]
df_5_g = district_counts.dropna().sort_values('Count', ascending=False)
df_5_g.head()

In [None]:
# Kernel density estimate of per-offense-type crime counts for districts D and G;
# each curve is normalised on its own (common_norm=False).
in_compared_districts = df_5_g['District'].isin(['D', 'G'])
sns.displot(
    data=df_5_g[in_compared_districts],
    x='Count',
    hue='District',
    kind='kde',
    common_norm=False,
)
plt.savefig("Distruct-CrimeCount.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Mean, standard deviation and two-sample t-test of per-offense-type crime
# counts for the two districts compared in this analysis.

df = df_5_g  # alias kept so the name `df` stays defined as before
# The printed labels are built from the same district codes used for filtering,
# so the label can never disagree with the data it describes.
district_a, district_b = 'D', 'G'
counts_a = df.loc[df.District == district_a, 'Count']
counts_b = df.loc[df.District == district_b, 'Count']

display(f'{district_a} mean: {counts_a.mean()}')
display(f'{district_b} mean: {counts_b.mean()}')
display(f'{district_a} sd: {counts_a.std()}')
display(f'{district_b} sd: {counts_b.std()}')

display(stats.ttest_ind(counts_a, counts_b))
