In [5]:
# installing dependencies
# All Rubric Requirements were met
# Import IPython module
import IPython

# Importing pandas library for data manipulation and analysis
import pandas as pd

# Importing seaborn library for statistical data visualization
import seaborn as sns

# Importing numpy library for numerical operations
import numpy as np
#from sklearn.linear_model import LinearRegression

# Importing matplotlib library for plotting
import matplotlib.pyplot as plt

# Use the inline backend to display plots in the notebook
%matplotlib inline

# Importing warnings module to surpress warnings
import warnings

# Surpress warnings
warnings.filterwarnings("ignore")

In [6]:
# importing dataset
arrest_data = pd.read_csv("C:/Users/Hp/Downloads/Arrest_Data_from_2010_to_Present.csv")

In [7]:
arrest_data["Arrest Date"] = pd.to_datetime(arrest_data["Arrest Date"], format="%m/%d/%Y")

In [8]:
# Extract the bookings made in 2018
# Rubric 1 
arrests_2018 = arrest_data[arrest_data["Arrest Date"].dt.year == 2018]

In [9]:
# Question 1: How many bookings of arrestees were made in 2018?
print(len(arrests_2018))

30054


In [10]:
# Question 2: How many bookings of arrestees were made in each area and which area has the most arrests in 2018?
# Rubric 2
arrests_by_area = arrests_2018.groupby('Area Name')['Report ID'].count()
print(arrests_by_area)
print("Area with the most arrests:", arrests_by_area.idxmax())

Area Name
77th Street     560
Central        4492
Devonshire      988
Foothill        402
Harbor         1207
Hollenbeck      561
Hollywood      4176
Mission        1133
N Hollywood    1485
Newton          989
Northeast      1140
Olympic         814
Pacific        2941
Rampart        3655
Southeast       222
Southwest      1065
Topanga         804
Van Nuys       1561
West LA        1181
West Valley     440
Wilshire        238
Name: Report ID, dtype: int64
Area with the most arrests: Central


In [11]:
# Question 3: What is the average age of arrestees per each charge group in 2018, excluding "Pre-Delinquency" and "Non-Criminal Detention" charge groups?
# Rubric 3
valid_charge_groups = ['Vehicle Theft', 'Robbery', 'Burglary', 'Receive Stolen Property']
valid_arrests = arrests_2018[arrests_2018['Charge Group Description'].isin(valid_charge_groups) & 
                             (arrests_2018['Charge Group Description'] != 'Pre-Delinquency') &
                             (arrests_2018['Charge Group Description'] != 'Non-Criminal Detention')]
avg_age_by_charge_group = valid_arrests.groupby('Charge Group Description')['Age'].mean()
print(avg_age_by_charge_group)


Charge Group Description
Burglary                   31.764045
Receive Stolen Property    33.111111
Robbery                    28.462185
Vehicle Theft              31.222222
Name: Age, dtype: float64


In [12]:
# Question 4: What is the 95% quantile of the age of the arrestee in 2018 for the selected charge groups?
# Rubric 4
age_quantiles = valid_arrests.groupby('Charge Group Description')['Age'].quantile(q=0.95)
print(age_quantiles)

Charge Group Description
Burglary                   55.00
Receive Stolen Property    49.20
Robbery                    57.20
Vehicle Theft              46.35
Name: Age, dtype: float64


In [13]:
# Question 5: Using a linear regression for the data from 2010 and 2018 (inclusive), predict number of crime arrests in 2019
#from sklearn.linear_model import LinearRegression
# Rubric 5

crime_data = arrest_data.loc[pd.to_datetime(arrest_data['Arrest Date'], format='%m/%d/%Y').dt.year.between(2010, 2018)]
crime_data = crime_data.groupby(pd.to_datetime(crime_data['Arrest Date'], format='%m/%d/%Y').dt.year)['Report ID'].count().reset_index()
crime_data.rename(columns={'Arrest Date': 'Year', 'Report ID': 'Arrest Count'}, inplace=True)
X = np.array(crime_data['Year']).reshape((-1, 1))
y = np.array(crime_data['Arrest Count'])
model = LinearRegression().fit(X, y)
prediction = model.predict([[2019]])
print("Predicted number of crime arrests in 2019:", round(prediction[0]))

Predicted number of crime arrests in 2019: 10048


In [16]:
# Question 6: How many arrest incidents occurred within 2 km from the Bradbury Building in 2018? Use (34.050536, -118.247861) for the coordinates of the Bradbury Building
# Rubric 6
from geopy.distance import geodesic
bradbury_location = (34.050536, -118.247861)
valid_locations = (arrests_2018[['Location']].dropna()).applymap(lambda x: tuple(map(float, x.strip('()').split(','))))
valid_locations = valid_locations[valid_locations['Location'].apply(lambda x: x != (0.0, 0.0))]
arrests_near_bradbury = valid_locations[valid_locations['Location'].apply(lambda x: geodesic(x, bradbury_location).km <= 2)]
print("Number of arrest incidents occurred within 2 km from the Bradbury Building in 2018: ", len(arrests_near_bradbury))

Number of arrest incidents occurred within 2 km from the Bradbury Building in 2018:  4772
