# CS 439 Final Project: Crime Rate Prediction

**Contributors:** Jenya Pandu (jp2068), Vivian Zhao (vz57), Matt Yun (yy656) \
**Course:** Introduction to Data Science (01:198:439) \
**Professor:** Naina Chaturvedi \
**Semester:** Spring 2025 

### Import necessary libraries:

In [1]:
# import & setup
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# geospatial / mapping
import folium
from folium.plugins import HeatMap

# modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix

# utilities
import os
%matplotlib inline

### Load and inspect data:

In [13]:
# Load the raw crime records; the relative path is resolved against the
# notebook's current working directory.
# low_memory=False tells pandas not to parse the file in internal chunks,
# which avoids mixed-dtype inference on columns (see pandas read_csv docs).
df = pd.read_csv("US_Crime_DataSet.csv", low_memory=False)
# Sanity check on size: prints (rows, columns).
print(df.shape)

(638454, 24)


In [11]:
# Preview the first rows of the raw frame to check column names and values.
df.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
0,48,AL00100,Jefferson,Sheriff,Jefferson,Alabama,1980,February,1,Murder or Manslaughter,...,Unknown,Male,43,Black,Unknown,Neighbor,Rifle,0,0,FBI
1,49,AL00100,Jefferson,Sheriff,Jefferson,Alabama,1980,March,1,Murder or Manslaughter,...,Unknown,Female,38,White,Unknown,Husband,Shotgun,0,0,FBI
2,50,AL00100,Jefferson,Sheriff,Jefferson,Alabama,1980,April,1,Murder or Manslaughter,...,Unknown,Male,66,Black,Unknown,Acquaintance,Handgun,0,0,FBI
3,51,AL00100,Jefferson,Sheriff,Jefferson,Alabama,1980,May,1,Murder or Manslaughter,...,Unknown,Male,40,White,Unknown,Stranger,Handgun,0,0,FBI
4,52,AL00100,Jefferson,Sheriff,Jefferson,Alabama,1980,June,1,Murder or Manslaughter,...,Unknown,Female,23,White,Unknown,Husband,Handgun,0,0,FBI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
638449,638450,WY01500,Park County,Sheriff,Park,Wyoming,2014,January,1,Murder or Manslaughter,...,Hispanic,Unknown,0,Unknown,Unknown,Unknown,Handgun,0,0,FBI
638450,638451,WY01700,Sheridan County,Sheriff,Sheridan,Wyoming,2014,June,1,Murder or Manslaughter,...,Unknown,Male,57,White,Unknown,Acquaintance,Handgun,0,0,FBI
638451,638452,WY01701,Sheridan,Municipal Police,Sheridan,Wyoming,2014,September,1,Murder or Manslaughter,...,Unknown,Female,22,Asian/Pacific Islander,Unknown,Daughter,Suffocation,0,0,FBI
638452,638453,WY01800,Sublette County,Sheriff,Sublette,Wyoming,2014,December,1,Murder or Manslaughter,...,Not Hispanic,Male,31,White,Not Hispanic,Stranger,Knife,0,1,FBI


In [12]:
# Summarize each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638454 entries, 0 to 638453
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Record ID              638454 non-null  int64 
 1   Agency Code            638454 non-null  object
 2   Agency Name            638454 non-null  object
 3   Agency Type            638454 non-null  object
 4   City                   638454 non-null  object
 5   State                  638454 non-null  object
 6   Year                   638454 non-null  int64 
 7   Month                  638454 non-null  object
 8   Incident               638454 non-null  int64 
 9   Crime Type             638454 non-null  object
 10  Crime Solved           638454 non-null  object
 11  Victim Sex             638454 non-null  object
 12  Victim Age             638454 non-null  int64 
 13  Victim Race            638454 non-null  object
 14  Victim Ethnicity       638454 non-null  object
 15  

### Data Wrangling: 

In [19]:
# Columns removed by default (treated as unnecessary for this analysis),
# grouped the same way as the original three drop() calls.
_DEFAULT_DROP_COLUMNS = (
    # agency columns
    'Agency Code', 'Agency Name', 'Agency Type',
    # victim / perpetrator attributes and their relationship
    'Victim Sex', 'Victim Age', 'Victim Race', 'Victim Ethnicity',
    'Perpetrator Sex', 'Perpetrator Age', 'Perpetrator Race', 'Perpetrator Ethnicity',
    'Relationship',
    # counts and record provenance
    'Victim Count', 'Perpetrator Count', 'Record Source',
)


def preprocess(data, columns_to_drop=None):
    """Return a cleaned copy of the raw crime DataFrame.

    Steps, in order:
      1. remove fully duplicated rows,
      2. remove rows containing any missing value,
      3. renumber the index from 0,
      4. drop the columns that are not needed downstream.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. It is not modified; every step returns a new frame.
    columns_to_drop : str or iterable of str, optional
        Column name(s) to remove. Defaults to the agency, victim/perpetrator
        and count/provenance columns listed in ``_DEFAULT_DROP_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        De-duplicated, null-free frame with a fresh 0..n-1 index and the
        requested columns removed.

    Raises
    ------
    KeyError
        If any requested column is not present in ``data``.
    """
    if columns_to_drop is None:
        columns_to_drop = _DEFAULT_DROP_COLUMNS
    elif isinstance(columns_to_drop, str):
        # A bare string would otherwise be split into characters by list().
        columns_to_drop = [columns_to_drop]

    # NOTE(review): dropna() runs while the to-be-dropped columns are still
    # present, so a missing value in one of those columns also removes the row.
    # The order is kept as-is to preserve existing results — confirm intent.
    cleaned = (
        data
        .drop_duplicates()
        .dropna()
        .reset_index(drop=True)
        # list(...) matters: pandas treats a tuple passed to drop() as ONE label.
        .drop(columns=list(columns_to_drop))
    )
    return cleaned

# Run the cleaning step on the raw frame. preprocess() returns a new frame
# rather than modifying `df`, so the raw data stays available for later cells.
clean_df = preprocess(df)
# Render the cleaned frame for a visual check of the remaining columns.
display(clean_df)

Unnamed: 0,Record ID,City,State,Year,Month,Incident,Crime Type,Crime Solved,Weapon
0,48,Jefferson,Alabama,1980,February,1,Murder or Manslaughter,Yes,Rifle
1,49,Jefferson,Alabama,1980,March,1,Murder or Manslaughter,Yes,Shotgun
2,50,Jefferson,Alabama,1980,April,1,Murder or Manslaughter,Yes,Handgun
3,51,Jefferson,Alabama,1980,May,1,Murder or Manslaughter,Yes,Handgun
4,52,Jefferson,Alabama,1980,June,1,Murder or Manslaughter,Yes,Handgun
...,...,...,...,...,...,...,...,...,...
638449,638450,Park,Wyoming,2014,January,1,Murder or Manslaughter,No,Handgun
638450,638451,Sheridan,Wyoming,2014,June,1,Murder or Manslaughter,Yes,Handgun
638451,638452,Sheridan,Wyoming,2014,September,1,Murder or Manslaughter,Yes,Suffocation
638452,638453,Sublette,Wyoming,2014,December,1,Murder or Manslaughter,Yes,Knife


### (Fill in):