In [None]:
!pip install scikit-learn
import pandas as pd
import numpy as np
from datetime import datetime
import math
from sklearn.linear_model import LinearRegression
import io
from google.colab import files
print("The environment setup is complete.")

The environment setup is complete.


In [None]:
print("Preparing to upload the file.")
uploaded = files.upload()

# Name the dataset must have after upload; it is the key looked up in `uploaded`.
filename = 'Arrest_Data_from_2010_to_2019.csv'

if filename not in uploaded:
    # Wrong or missing file: report what was actually received.
    print("File upload failed. Please check if the file name is correct.")
    print(f"The uploaded files: {list(uploaded.keys())}")
else:
    print(f"Successfully uploaded the file: {filename}")

    # The upload is held in memory as bytes, so wrap it in a file-like object.
    df = pd.read_csv(io.BytesIO(uploaded[filename]))
    print(f"Data shape: {df.shape}")
    print(f"Column name: {list(df.columns)}")

    print(f"\n Data preview:")
    print(df.head(3))

Preparing to upload the file.


Saving Arrest_Data_from_2010_to_2019.csv to Arrest_Data_from_2010_to_2019.csv
Successfully uploaded the file: Arrest_Data_from_2010_to_2019.csv
Data shape: (1310127, 17)
Column name: ['Report ID', 'Arrest Date', 'Time', 'Area ID', 'Area Name', 'Reporting District', 'Age', 'Sex Code', 'Descent Code', 'Charge Group Code', 'Charge Group Description', 'Arrest Type Code', 'Charge', 'Charge Description', 'Address', 'Cross Street', 'Location']

 Data preview:
   Report ID Arrest Date    Time  Area ID  Area Name  Reporting District  Age  \
0  191811472  05/03/2019  1700.0       18  Southeast                1802   23   
1    5614161  04/29/2019  1040.0        8    West LA                 842   41   
2    5615197  04/30/2019   615.0        6  Hollywood                 663   27   

  Sex Code Descent Code  Charge Group Code Charge Group Description  \
0        F            B                NaN                      NaN   
1        M            H                3.0                  Robbery   
2    

In [None]:
print("Data preprocessing")
# Unparseable dates become NaT; NaT fails the comparison below, so those rows are dropped.
df['Arrest Date'] = pd.to_datetime(df['Arrest Date'], errors='coerce')
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the original frame (SettingWithCopyWarning).
df = df[df['Arrest Date'] < '2019-01-01'].copy()
if 'Location' in df.columns:
    # Location is text of the form "(lat, lon)"; capture the two numbers.
    location_split = df['Location'].str.extract(r'\(([^,]+),\s*([^)]+)\)')
    df['Latitude'] = pd.to_numeric(location_split[0], errors='coerce')
    df['Longitude'] = pd.to_numeric(location_split[1], errors='coerce')
# Age conversion and the summary do not depend on Location, so they run unconditionally.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
# Previously `print({df.shape})`, which printed a one-element set rather than a message.
print(f"Data shape: {df.shape}")
print(f" Date range: {df['Arrest Date'].min()} to {df['Arrest Date'].max()}")

Data preprocessing
{(1231627, 19)}
 Date range: 2010-01-01 00:00:00 to 2018-12-31 00:00:00


In [None]:
def haversine_distance(lat1, lon1, lat2, lon2, earth_radius=6371):
    """Planar-projection distance between two latitude/longitude points.

    Despite its name, this is not the haversine formula: it is the
    equirectangular ("spherical Earth projected to a plane") approximation,
    where the longitude difference is scaled by the cosine of the mean
    latitude and the result is a straight-line distance on that plane.

    Args:
        lat1, lon1: Coordinates of the first point(s), in degrees.
        lat2, lon2: Coordinates of the second point(s), in degrees.
            Each argument may be a scalar or an array-like (NumPy array,
            pandas Series); inputs broadcast against each other.
        earth_radius: Sphere radius; the result is in the same unit
            (the default 6371 gives kilometres).

    Returns:
        A Python float when every input is a scalar, otherwise an array-like
        of distances with the broadcast shape of the inputs.
    """
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    delta_lat = lat2_rad - lat1_rad
    # Meridians converge away from the equator, so shrink the longitude
    # difference by cos(mean latitude) before treating it as planar.
    delta_lon = (lon2_rad - lon1_rad) * np.cos((lat1_rad + lat2_rad) / 2)

    distance = earth_radius * np.sqrt(delta_lat**2 + delta_lon**2)
    # Keep the original scalar contract: scalar inputs yield a plain float.
    if np.ndim(distance) == 0:
        return float(distance)
    return distance


In [None]:
# Q1: every row dated in 2018 is one booking, so the answer is the row count.
df_2018 = df.loc[df['Arrest Date'].dt.year.eq(2018)]
q1_answer = df_2018.shape[0]
print(f"Q1.2018 arrestees bookings: {q1_answer}")

Q1.2018 arrestees bookings: 104277


In [None]:
# Q2: value_counts() orders areas from most to fewest bookings,
# so the first entry belongs to the area with the most arrests.
area_counts = df_2018.loc[:, 'Area ID'].value_counts()
q2_answer = area_counts.iat[0]
print(f"Q2.The number of bookings of arrestees in the most arrests area in 2018: {q2_answer}")

Q2.The number of bookings of arrestees in the most arrests area in 2018: 10951


In [None]:
# Q3: 95th-percentile age among 2018 arrests in the listed charge groups.
target_charges = ['Vehicle Theft', 'Robbery', 'Burglary', 'Receive Stolen Property']
filtered_df = df_2018.loc[df_2018['Charge Group Description'].isin(target_charges)]
# Keep only rows that actually carry an age.
filtered_df = filtered_df[filtered_df['Age'].notna()]
q3_answer = filtered_df['Age'].quantile(q=0.95)
print(f"Q3. The  95% quantile age of the specific accusation group: {q3_answer:.2f}")

Q3. The  95% quantile age of the specific accusation group: 52.00


In [None]:
def question4(df):
    """Largest absolute z-score of mean arrestee age across 2018 charge groups.

    Rows are restricted to 2018, the 'Pre-Delinquency' and
    'Non-Criminal Detention' groups are dropped, and rows lacking a charge
    group description or an age are discarded. For every remaining group with
    at least 10 rows the z-score is

        (group mean age - overall mean age) / (overall std / sqrt(group size))

    Args:
        df: Frame with a datetime 'Arrest Date' column plus
            'Charge Group Description' and 'Age' columns.

    Returns:
        The largest absolute z-score, or 0 when no group qualifies.
    """
    arrests_2018 = df[df['Arrest Date'].dt.year == 2018].copy()

    # One combined mask: not an excluded group, charge group known, age known.
    description = arrests_2018['Charge Group Description']
    keep = (
        ~description.isin(['Pre-Delinquency', 'Non-Criminal Detention'])
        & description.notna()
        & arrests_2018['Age'].notna()
    )
    df_clean = arrests_2018[keep]

    print(f"Working with {len(df_clean)} records after filtering")

    overall_age_mean = df_clean['Age'].mean()
    overall_age_std = df_clean['Age'].std()

    print(f"Overall - Mean age: {overall_age_mean:.2f}, Std: {overall_age_std:.2f}")

    # charge group -> (z-score, group size, group mean age)
    group_stats = {}

    # sort=False walks the groups in order of first appearance.
    for charge, group_data in df_clean.groupby('Charge Group Description', sort=False):
        n = len(group_data)
        if n < 10:
            # Too few observations for a meaningful group mean.
            continue

        standard_error = overall_age_std / (n ** 0.5)
        if not standard_error > 0:
            # Covers both a zero and a NaN standard error.
            continue

        group_mean = group_data['Age'].mean()
        group_stats[charge] = (
            (group_mean - overall_age_mean) / standard_error,
            n,
            group_mean,
        )

    if not group_stats:
        print("No valid z-scores calculated")
        return 0

    # Rank once by |z|; the sort is stable, so ties keep first-seen order.
    ranked = sorted(group_stats.items(), key=lambda item: abs(item[1][0]), reverse=True)
    max_charge, (top_z, _, _) = ranked[0]
    max_abs_z = abs(top_z)
    if not max_abs_z > 0:
        # Nothing deviates from the overall mean: report no winning group.
        max_charge, max_abs_z = None, 0

    print(f"\nCalculated z-scores for {len(group_stats)} charge groups")
    print(f"Largest absolute z-score: {max_abs_z:.4f}")
    print(f"Charge group with largest z-score: {max_charge}")

    print("\nSome examples:")
    for charge, (z, n, group_mean) in ranked[:3]:
        print(f"  {charge}: z = {z:.3f}, n = {n}, mean age = {group_mean:.2f}")

    return max_abs_z

# Run the Q4 analysis on the notebook-level `df` and print the value it returns.
result = question4(df)
print(f"\nFinal answer for question 4: {result:.4f}")

Working with 91544 records after filtering
Overall - Mean age: 34.98, Std: 12.86

Calculated z-scores for 25 charge groups
Largest absolute z-score: 42.5711
Charge group with largest z-score: Drunkeness

Some examples:
  Drunkeness: z = 42.571, n = 3769, mean age = 43.91
  Liquor Laws: z = 34.393, n = 3481, mean age = 42.48
  Robbery: z = -32.014, n = 2787, mean age = 27.18

Final answer for question 4: 42.5711


In [None]:
def question5(df, center=(34.050536, -118.247861), radius_km=2, year=None,
              earth_radius=6371):
    """Count arrests within ``radius_km`` of a reference point.

    Coordinates come from the 'Location' column (text of the form
    "(lat, lon)") when present, otherwise from existing 'Latitude' and
    'Longitude' columns. Rows with missing coordinates, or with a latitude
    or longitude of exactly 0, are ignored. Distances use the spherical
    Earth projected to a plane, the same formula as ``haversine_distance``
    defined earlier in this notebook, computed for all rows at once.

    Args:
        df: Arrest records.
        center: (latitude, longitude) in degrees of the reference point;
            defaults to the Bradbury Building coordinates.
        radius_km: Distance threshold in kilometres.
        year: When given, only rows whose 'Arrest Date' falls in that year
            are counted; ``None`` (default) counts every row in ``df``.
            NOTE(review): the other questions in this notebook restrict to
            2018 -- confirm whether this one should pass ``year=2018``.
        earth_radius: Earth radius in kilometres.

    Returns:
        Number of arrests within ``radius_km`` of ``center`` (int); 0 when no
        usable location data exists.
    """
    bradbury_lat, bradbury_lon = center

    print("Starting analysis for arrests near Bradbury Building...")

    if year is not None:
        df = df[df['Arrest Date'].dt.year == year]

    if 'Location' in df.columns:
        print(f"Location column found, first few values:")
        print(df['Location'].head())
        # Parse "(lat, lon)" text; anything non-numeric becomes NaN.
        coords_split = df['Location'].str.extract(r'\(([^,]+),\s*([^)]+)\)')
        latitude = pd.to_numeric(coords_split[0], errors='coerce')
        longitude = pd.to_numeric(coords_split[1], errors='coerce')
    else:
        print("No Location column found, checking for latitude/longitude columns")
        if 'Latitude' in df.columns and 'Longitude' in df.columns:
            print("Using existing Latitude and Longitude columns")
            latitude = df['Latitude']
            longitude = df['Longitude']
        else:
            print("No location data available")
            return 0

    # Drop rows with missing coordinates or the (0, 0) placeholder.
    has_coords = (
        latitude.notna() & longitude.notna() & (latitude != 0) & (longitude != 0)
    ).to_numpy()
    valid_lat = latitude.to_numpy()[has_coords].astype(float)
    valid_lon = longitude.to_numpy()[has_coords].astype(float)
    n_valid = len(valid_lat)

    print(f"\nLocation data summary:")
    print(f"Total records: {len(df)}")
    print(f"Records with valid coordinates: {n_valid}")
    print(f"Records missing/bad coordinates: {len(df) - n_valid}")

    if n_valid == 0:
        print("No valid location data to analyze")
        return 0

    print(f"Latitude range: {valid_lat.min():.4f} to {valid_lat.max():.4f}")
    print(f"Longitude range: {valid_lon.min():.4f} to {valid_lon.max():.4f}")

    # Vectorised planar projection: one NumPy pass instead of a Python loop
    # over every row.
    lat1 = np.radians(bradbury_lat)
    lon1 = np.radians(bradbury_lon)
    lat2 = np.radians(valid_lat)
    lon2 = np.radians(valid_lon)

    delta_lat = lat2 - lat1
    # Scale the longitude difference by cos(mean latitude).
    delta_lon = (lon2 - lon1) * np.cos((lat1 + lat2) / 2)
    distances = earth_radius * np.sqrt(delta_lat**2 + delta_lon**2)

    within_radius = distances <= radius_km
    close_incidents = int(within_radius.sum())

    print(f"\nDistance analysis:")
    print(f"Minimum distance: {distances.min():.3f} km")
    print(f"Maximum distance: {distances.max():.3f} km")
    print(f"Average distance: {distances.mean():.3f} km")
    print(f"Arrests within {radius_km}km: {close_incidents}")

    if close_incidents > 0:
        print(f"\nSome arrests within {radius_km}km:")
        # Positions (within df) of the first three in-radius rows.
        sample_positions = np.flatnonzero(has_coords)[within_radius][:3]
        sample_distances = distances[within_radius][:3]
        for i, (pos, dist) in enumerate(zip(sample_positions, sample_distances)):
            if 'Address' in df.columns:
                address = df['Address'].iloc[pos]
            else:
                address = 'Address not available'
            print(f"  {i+1}. {dist:.2f} km - {address}")

    # Also report how many are very close (within 0.5 km).
    print(f"Arrests within 0.5km: {int((distances <= 0.5).sum())}")

    return close_incidents

# Run the Q5 analysis on the notebook-level `df`; the returned value is the
# count of arrests within 2 km that the final line reports.
print("=== Question 5: Arrests near Bradbury Building ===")
q5_result = question5(df)
print(f"\nFinal answer for Q5: {q5_result} arrests within 2km of Bradbury Building")

=== Question 5: Arrests near Bradbury Building ===
Starting analysis for arrests near Bradbury Building...
Location column found, first few values:
151    (34.1006, -118.3417)
152    (34.1006, -118.3417)
153     (34.051, -118.3548)
154    (33.9424, -118.2517)
155    (34.0761, -118.3614)
Name: Location, dtype: object

Location data summary:
Total records: 1231627
Records with valid coordinates: 1230857
Records missing/bad coordinates: 770
Latitude range: 33.3427 to 34.8146
Longitude range: -118.8513 to -117.7115

Distance analysis:
Minimum distance: 0.053 km
Maximum distance: 91.752 km
Average distance: 14.597 km
Arrests within 2km: 130015

Some arrests within 2km:
  1. 1.06 km - SPRING
  2. 0.58 km - SPRING                       ST
  3. 1.09 km - 8TH                          ST
Arrests within 0.5km: 10144

Final answer for Q5: 130015 arrests within 2km of Bradbury Building


In [None]:
def question6(df):
    """Estimate 2018 arrests per kilometre along Pico Boulevard.

    Steps:
      1. Keep 2018 rows whose 'Address' contains "PICO" (case-insensitive).
      2. Parse "(lat, lon)" from 'Location'; drop rows with missing
         coordinates or a latitude/longitude of exactly 0 (the same
         placeholder rule question5 applies).
      3. Drop points whose latitude or longitude lies more than two standard
         deviations from the mean of the remaining points.
      4. Take the westernmost and easternmost remaining points as the
         boulevard's endpoints and measure the distance between them with
         the planar projection of a spherical Earth (radius 6371 km).
      5. Divide the number of remaining arrests by that length.

    Args:
        df: Frame with a datetime 'Arrest Date' column plus 'Address' and
            'Location' columns.

    Returns:
        Arrests per kilometre (float), or 0 when there is too little data or
        the estimated length is zero.
    """
    df_2018 = df[df['Arrest Date'].dt.year == 2018].copy()
    print(f"Total arrests in 2018: {len(df_2018)}")

    # Look for Pico Boulevard addresses.
    pico_mask = df_2018['Address'].str.contains('PICO', case=False, na=False)
    pico_arrests = df_2018[pico_mask].copy()

    print(f"Found {len(pico_arrests)} arrests with 'Pico' in address")

    if len(pico_arrests) == 0:
        print("No Pico Boulevard arrests found")
        return 0

    # Show a few addresses so the match can be eyeballed.
    print("\nSample Pico addresses:")
    for i, addr in enumerate(pico_arrests['Address'].head(3)):
        print(f"  {i+1}. {addr}")

    if 'Location' not in df.columns:
        print("No Location column found!")
        return 0

    # Parse the Location text, expected in the form "(lat, lon)".
    locations = pico_arrests['Location'].str.extract(r'\(([^,]+),\s*([^)]+)\)')
    pico_arrests['Latitude'] = pd.to_numeric(locations[0], errors='coerce')
    pico_arrests['Longitude'] = pd.to_numeric(locations[1], errors='coerce')

    # Remove rows with missing coordinates, then the (0, 0) placeholder rows:
    # left in, they would inflate the mean/std used for outlier removal.
    valid_coords = pico_arrests.dropna(subset=['Latitude', 'Longitude'])
    valid_coords = valid_coords[
        (valid_coords['Latitude'] != 0) & (valid_coords['Longitude'] != 0)
    ]
    print(f"\nArrests with valid coordinates: {len(valid_coords)}")

    if len(valid_coords) < 5:
        print("Not enough data points for analysis")
        return 0

    print(f"Latitude range: {valid_coords['Latitude'].min():.4f} to {valid_coords['Latitude'].max():.4f}")
    print(f"Longitude range: {valid_coords['Longitude'].min():.4f} to {valid_coords['Longitude'].max():.4f}")

    # Outlier rule: more than 2 standard deviations from the mean on either axis.
    lat_mean = valid_coords['Latitude'].mean()
    lat_std = valid_coords['Latitude'].std()
    lon_mean = valid_coords['Longitude'].mean()
    lon_std = valid_coords['Longitude'].std()

    print(f"\nCoordinate statistics:")
    print(f"Latitude - mean: {lat_mean:.4f}, std: {lat_std:.4f}")
    print(f"Longitude - mean: {lon_mean:.4f}, std: {lon_std:.4f}")

    no_outliers = valid_coords[
        (valid_coords['Latitude'] >= lat_mean - 2*lat_std) &
        (valid_coords['Latitude'] <= lat_mean + 2*lat_std) &
        (valid_coords['Longitude'] >= lon_mean - 2*lon_std) &
        (valid_coords['Longitude'] <= lon_mean + 2*lon_std)
    ]

    print(f"After removing outliers: {len(no_outliers)} arrests")

    if len(no_outliers) < 2:
        print("Not enough points left after outlier removal")
        return 0

    # The boulevard is treated as running east-west, so the extreme
    # longitudes mark its endpoints.
    west_point = no_outliers.loc[no_outliers['Longitude'].idxmin()]
    east_point = no_outliers.loc[no_outliers['Longitude'].idxmax()]

    print(f"\nEndpoints of Pico Boulevard:")
    print(f"Western point: ({west_point['Latitude']:.4f}, {west_point['Longitude']:.4f})")
    print(f"  Address: {west_point['Address']}")
    print(f"Eastern point: ({east_point['Latitude']:.4f}, {east_point['Longitude']:.4f})")
    print(f"  Address: {east_point['Address']}")

    # Same planar projection as haversine_distance defined earlier in this notebook.
    earth_radius = 6371  # km

    lat1 = math.radians(west_point['Latitude'])
    lon1 = math.radians(west_point['Longitude'])
    lat2 = math.radians(east_point['Latitude'])
    lon2 = math.radians(east_point['Longitude'])

    delta_lat = lat2 - lat1
    delta_lon = (lon2 - lon1) * math.cos((lat1 + lat2) / 2)

    boulevard_length = earth_radius * math.sqrt(delta_lat**2 + delta_lon**2)

    print(f"\nBoulevard length calculation:")
    print(f"Estimated length of Pico Boulevard: {boulevard_length:.2f} km")

    if boulevard_length == 0:
        # Every remaining point shares one longitude, so both endpoints are
        # the same row; dividing would raise ZeroDivisionError.
        print("Endpoints coincide; cannot compute arrests per km")
        return 0

    arrests_count = len(no_outliers)
    arrests_per_km = arrests_count / boulevard_length

    print(f"\nFinal calculation:")
    print(f"Total arrests on Pico: {arrests_count}")
    print(f"Boulevard length: {boulevard_length:.2f} km")
    print(f"Arrests per km: {arrests_per_km:.2f}")

    # Sanity check: average spacing implied by the estimate.
    print(f"\nSanity check - average distance between points:")
    avg_spacing = boulevard_length / arrests_count if arrests_count > 0 else 0
    print(f"Average spacing: {avg_spacing:.2f} km between arrests")

    return arrests_per_km

# Run the Q6 analysis on the notebook-level `df`; the function prints its
# intermediate steps and returns the arrests-per-km figure reported below.
print("Starting Pico Boulevard analysis...")
q6_result = question6(df)

print(f"Q6. Arrests per kilometer on Pico Boulevard: {q6_result:.2f}")

Starting Pico Boulevard analysis...
=== Question 6: Arrests per km on Pico Boulevard ===

Total arrests in 2018: 104277
Found 613 arrests with 'Pico' in address

Sample Pico addresses:
  1. PICO                         BL
  2. PICO                         BL
  3. PICO                         BL

Arrests with valid coordinates: 613
Latitude range: 33.9786 to 34.2567
Longitude range: -118.5689 to -118.1739

Coordinate statistics:
Latitude - mean: 34.0433, std: 0.0124
Longitude - mean: -118.3499, std: 0.0690
After removing outliers: 605 arrests

Endpoints of Pico Boulevard:
Western point: (34.0281, -118.4530)
  Address: PICO                         BL
Eastern point: (34.0188, -118.2159)
  Address: 3000 E  PICO                         BL

Boulevard length calculation:
Estimated length of Pico Boulevard: 21.88 km

Final calculation:
Total arrests on Pico: 605
Boulevard length: 21.88 km
Arrests per km: 27.66

Sanity check - average distance between points:
Average spacing: 0.04 km between ar

In [None]:
def question7(df, min_area_arrests=50, min_pair_arrests=10,
              min_city_prob=0.0001, top_n=5):
    """Average of the largest area-vs-city arrest-probability ratios.

    For every (Area ID, Charge Group Code) pair the ratio is

        P(charge | area) / P(charge city-wide)

    computed over rows dated before 2019 that have both an area and a charge
    group code, excluding charge group code 99. All counts come from a single
    groupby instead of re-filtering the frame for every pair.

    Args:
        df: Frame with a datetime 'Arrest Date' column plus
            'Charge Group Code' and 'Area ID' columns.
        min_area_arrests: Areas with fewer valid arrests are skipped.
        min_pair_arrests: Pairs with fewer arrests are skipped.
        min_city_prob: Pairs are kept only when the city-wide probability of
            the charge group exceeds this value.
        top_n: How many of the largest ratios are averaged.

    Returns:
        Mean of the ``top_n`` largest ratios (float), or 0 when no row or no
        pair qualifies.
    """
    print("Initial data overview:")
    print(f"Total records: {len(df)}")
    print(f"Columns available: {list(df.columns)}")

    df_pre2019 = df[df['Arrest Date'] < '2019-01-01']
    print(f"Records before 2019: {len(df_pre2019)}")

    missing_charge = df_pre2019['Charge Group Code'].isna().sum()
    missing_area = df_pre2019['Area ID'].isna().sum()
    print(f"Records missing charge code: {missing_charge}")
    print(f"Records missing area ID: {missing_area}")

    # Keep rows with both keys present, and drop charge group code 99.
    valid_data = df_pre2019[
        df_pre2019['Charge Group Code'].notna()
        & df_pre2019['Area ID'].notna()
        & (df_pre2019['Charge Group Code'] != 99)
    ]

    print(f"Valid records for analysis: {len(valid_data)}")

    if len(valid_data) == 0:
        print("No valid data to analyze")
        return 0

    total_city_arrests = len(valid_data)
    # Computed once; reused for the summaries, the city-wide probabilities
    # and the per-pair denominators.
    charge_counts = valid_data['Charge Group Code'].value_counts()
    area_counts = valid_data['Area ID'].value_counts()

    print(f"\nCharge group distribution (top 5):")
    for code, count in charge_counts.head().items():
        print(f"Code {code}: {count} arrests ({count/total_city_arrests:.1%})")

    print(f"\nArea distribution (top 5):")
    for area, count in area_counts.head().items():
        print(f"Area {area}: {count} arrests ({count/total_city_arrests:.1%})")

    city_probs = charge_counts / total_city_arrests
    print(f"\nCity-wide probabilities calculated for {len(city_probs)} charge groups")

    # One row per (area, charge) pair with its arrest count.
    pairs = (
        valid_data.groupby(['Area ID', 'Charge Group Code'])
        .size()
        .reset_index(name='charge_in_area')
    )
    pairs['area_total'] = pairs['Area ID'].map(area_counts)
    pairs['prob_citywide'] = pairs['Charge Group Code'].map(city_probs)

    eligible = pairs[
        (pairs['area_total'] >= min_area_arrests)
        & (pairs['charge_in_area'] >= min_pair_arrests)
        & (pairs['prob_citywide'] > min_city_prob)
    ].copy()
    eligible['ratio'] = (
        eligible['charge_in_area'] / eligible['area_total']
    ) / eligible['prob_citywide']

    areas_analyzed = int((area_counts >= min_area_arrests).sum())

    print(f"\nAnalysis results:")
    print(f"Areas analyzed: {areas_analyzed}")
    print(f"Area-charge combinations analyzed: {len(eligible)}")

    if len(eligible) == 0:
        print("No valid ratios calculated")
        return 0

    print(f"Ratio statistics:")
    print(f"Minimum ratio: {eligible['ratio'].min():.2f}")
    print(f"Maximum ratio: {eligible['ratio'].max():.2f}")
    print(f"Average ratio: {eligible['ratio'].mean():.2f}")

    top = eligible.sort_values('ratio', ascending=False, kind='stable').head(top_n)

    print(f"\nTop {top_n} disproportionate area-charge combinations:")
    top_rows = zip(top['Area ID'], top['Charge Group Code'],
                   top['charge_in_area'], top['area_total'], top['ratio'])
    for i, (area, charge, charge_in_area, area_count, ratio) in enumerate(top_rows, 1):
        charge_count = charge_counts.loc[charge]
        print(f"{i}. Area {area}, Charge {charge}: ratio = {ratio:.2f}")
        print(f"   ({charge_in_area}/{area_count} in area vs {charge_count}/{total_city_arrests} city-wide)")

    average_top = float(top['ratio'].mean())

    print(f"\nAverage of top {top_n} ratios: {average_top:.2f}")

    print(f"\nInterpretation:")
    print(f"A ratio of {average_top:.1f} means these charge groups occur")
    print(f"{average_top:.1f} times more often in these areas than expected city-wide")

    return average_top

# Run the Q7 analysis on the notebook-level `df`; the function prints its
# intermediate steps and returns the average of the top ratios reported below.
print("Starting disproportionate rate analysis...")
q7_result = question7(df)

print(f"Q7. Average of top 5 ratios: {q7_result:.2f}")

Starting disproportionate rate analysis...
=== Question 7: Disproportionate Arrest Rates ===

Initial data overview:
Total records: 1231627
Columns available: ['Report ID', 'Arrest Date', 'Time', 'Area ID', 'Area Name', 'Reporting District', 'Age', 'Sex Code', 'Descent Code', 'Charge Group Code', 'Charge Group Description', 'Arrest Type Code', 'Charge', 'Charge Description', 'Address', 'Cross Street', 'Location', 'Latitude', 'Longitude']
Records before 2019: 1231627
Records missing charge code: 79907
Records missing area ID: 0
Valid records for analysis: 1151713

Charge group distribution (top 5):
Code 24.0: 229826 arrests (20.0%)
Code 16.0: 153432 arrests (13.3%)
Code 18.0: 113420 arrests (9.8%)
Code 22.0: 112996 arrests (9.8%)
Code 4.0: 77964 arrests (6.8%)

Area distribution (top 5):
Area 1: 116368 arrests (10.1%)
Area 6: 102986 arrests (8.9%)
Area 14: 74641 arrests (6.5%)
Area 12: 65701 arrests (5.7%)
Area 3: 63703 arrests (5.5%)

City-wide probabilities calculated for 28 charge gr