# Crime Analysis and Prediction

This file contains a set of experiments for analyzing and extrapolating data regarding crimes.

## 1. Data Loading and Sanitization

In [24]:
import csv, sys
import matplotlib.pyplot as plt
from helper import *
import numpy as np

def chicago_from(row, params=None):
    '''
    Build a Crime from one already-split row of the Chicago data set.

    row[3] is split on spaces: the first piece is the date, and the second and
    third pieces are glued together to form the raw time. row[6] is the crime
    type. The last two fields are parsed as the two coordinates of the
    location after dropping the first two characters of row[-2] and the last
    two characters of row[-1] (presumably the quote/parenthesis left over from
    comma-splitting a quoted "(a, b)" field -- confirm against the data file).

    `params` is accepted but not used. Victim age, victim sex and weapon are
    passed to Crime as None.
    '''
    city_name = "CHICAGO"
    stamp_pieces = row[3].split(" ")
    crime_date = stamp_pieces[0]
    weekday = get_day_of_week(crime_date)
    clock = stamp_pieces[1] + stamp_pieces[2]
    day_part = get_time_of_day(clock)
    crime_type = row[6]
    first_coord = float(row[-2][2:])
    second_coord = float(row[-1][:-2])
    return Crime(
        city=city_name,
        date=crime_date,
        day=weekday,
        raw_time=clock,
        time_of_day=day_part,
        crime=crime_type,
        victim_age=None,
        victim_sex=None,
        weapon=None,
        location=(first_coord, second_coord),
    )
     

# Data-file locations and per-city row parsers. load_data looks entries up as
# "<CITY>_crime" (path appended to "root_data") and "<CITY>_schema" (parser).
config = {
    "root_data": "data/",  # prefix concatenated in front of each data-file path
    "LA_crime": "crime-in-los-angeles/Crime_Data_2010_2017.csv",
    "CH_crime": "crimes-in-chicago/Chicago_Crimes_2012_to_2017.csv",
    #"LA_schema": {'date': 2, 'time': 3, 'crime': 8, 'victim_age': 10, 'victim_sex': 11, 'weapon': 16, 'location': 25},
    "CH_schema": chicago_from  # callable: list of row fields -> Crime
}

# LA: ['DR Number', 'Date Reported', 'Date Occurred', 'Time Occurred', 'Area ID', 'Area Name', 'Reporting District', 'Crime Code', 'Crime Code Description', 'MO Codes', 'Victim Age', 'Victim Sex', 'Victim Descent', 'Premise Code', 'Premise Description', 'Weapon Used Code', 'Weapon Description', 'Status Code', 'Status Description', 'Crime Code 1', 'Crime Code 2', 'Crime Code 3', 'Crime Code 4', 'Address', 'Cross Street', 'Location']

class Crime:
    '''
    One crime record: where and when it happened, what it was, and (when the
    data set provides them) details about the victim and weapon.
    '''

    def __init__(self, city, date, day, raw_time, time_of_day, crime, victim_age, victim_sex, weapon, location):
        # When and where.
        self.city, self.date, self.day = city, date, day
        self.raw_time, self.time_of_day = raw_time, time_of_day
        # What happened and to whom.
        self.crime = crime
        self.victim_age, self.victim_sex, self.weapon = victim_age, victim_sex, weapon
        # Pair of coordinates; get_feature_vector reads elements 0 and 1.
        self.location = location

    def get_feature_vector(self):
        '''
        Returns [1-hot day vector, 1-hot time vector, crime, latitude, longitude]
        as a single flat numpy array.

        The 1-hot vectors are taken from element 0 of `day` and `time_of_day`.
        Because the crime is a string, the combined array holds strings.
        Age, sex, and weapon are left out for now since the Chicago data set
        does not include those.
        '''
        # Join the two 1-hot vectors first, then append the remaining fields.
        one_hot = np.concatenate((self.day[0], self.time_of_day[0]))
        crime_and_coords = np.array([self.crime, self.location[0], self.location[1]])
        return np.concatenate((one_hot, crime_and_coords))

    def __str__(self):
        # Human-readable summary: "<crime> in the <time of day> on <day>".
        return self.crime + f" in the {self.time_of_day!s} on {self.day!s}"

def load_data(force_refresh=False, max_rows=2):
    '''
    Load the crime data for every configured city and parse rows into objects.

    For each city key the data file is config["root_data"] + config["<CITY>_crime"]
    and rows are parsed with the callable stored at config["<CITY>_schema"].

    Parameters:
        force_refresh: currently unused; kept so existing callers keep working.
        max_rows: how many data rows (after the header) to parse per city.
            Defaults to 2, which is the limit this function always used.
            Pass None to parse every row.

    Returns:
        A dict mapping city key -> [parsed_rows, total_rows, error_count] where
        parsed_rows is the list of successfully parsed objects, total_rows is
        the number of data rows available in the file (header excluded), and
        error_count is how many of the attempted rows failed to parse.
    '''
    #all_data = {"LA": [], "CH": []}
    all_data = {"CH": []}

    for city in all_data:
        data_file = config["root_data"] + config[city + "_crime"]
        parse_from = config[city + "_schema"]

        with open(data_file) as csvfile:
            # Slice off the header instead of `del content[0]` so an empty
            # file yields zero rows rather than raising IndexError.
            content = [line.strip() for line in csvfile][1:]

        parsed = []
        failed = 0
        for row in content[:max_rows]:
            try:
                # Plain comma split is kept on purpose: the Chicago parser
                # trims characters off the last two fields, which relies on
                # the quoted location field having been split at its comma.
                parsed.append(parse_from(row.split(",")))
            except Exception:
                # Malformed rows are counted and skipped (best effort), but
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                failed += 1

        # Layout per city: [parsed rows, rows available, rows that errored].
        all_data[city].extend((parsed, len(content), failed))

    return all_data

# Load once when this cell runs; later cells read the parsed crimes from `results`.
results = load_data()
print("Finished loading data")

Finished loading data


In [25]:
# A list of all Crime objects is found at results["CH"][0]
def get_feature_matrix(data):
    '''
    Stacks the feature vector of every item in `data` into one numpy array,
    one row per item.

    Each row is whatever the item's get_feature_vector() returns, i.e.
    [1-hot day vector, 1-hot time vector, crime, latitude, longitude].
    Values come back as strings because the crime is a string. Age, sex, and
    weapon are left out for now since the Chicago data set does not include
    those.
    '''
    feature_rows = [crime.get_feature_vector() for crime in data]
    return np.array(feature_rows)

# Analyze how many crime types there are
def convert_crime_class_to_integer(data):
    '''
    Placeholder: the body is only `pass`, so it ignores `data` and returns None.

    Judging by the name and the note below, it is meant to convert crime
    classes to integers, but that is not implemented yet.
    '''
    pass # Need to do this
    #print(set([i.crime for i in results["CH"][0]]))

# Show the feature matrix for all loaded Chicago crimes, then the feature
# vector of the crime at index 1 on its own.
print(get_feature_matrix(results["CH"][0]))
print(results["CH"][0][1].get_feature_vector())

[['0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0'
  'BATTERY' '41.864073157' '-87.706818608']
 ['0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0'
  'BATTERY' '41.782921527' '-87.60436317']]
['0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0'
 'BATTERY' '41.782921527' '-87.60436317']
