# Crime Analysis and Prediction

This file contains a set of experiments for analyzing and extrapolating data regarding crimes.

## 1. Data Loading and Sanitization

In [8]:
import csv, sys
import matplotlib.pyplot as plt
from helper import *
import numpy as np

def chicago_from(row, params=None):
    '''
    Method which takes a row from the chichago data and returns a Crime representing that row
    '''
    city = "CHICAGO"
    date_intermediate = row[3].split(" ")
    date = date_intermediate[0]
    day = get_day_of_week(date)
    time = date_intermediate[1] + date_intermediate[2]
    time_of_day = get_time_of_day(time)
    crime = row[6]
    location = (float(row[-2][2:]), float(row[-1][0:-2]))
    return Crime(city, date, day, time, time_of_day, crime, None, None, None, location)
     

config = {
    "root_data": "data/",
    "LA_crime": "crime-in-los-angeles/Crime_Data_2010_2017.csv",
    "CH_crime": "crimes-in-chicago/Chicago_Crimes_2012_to_2017.csv",
    #"LA_schema": {'date': 2, 'time': 3, 'crime': 8, 'victim_age': 10, 'victim_sex': 11, 'weapon': 16, 'location': 25},
    "CH_schema": chicago_from,
    "CH_crime_encoding": {
        'KIDNAPPING': 0,
        'HUMAN TRAFFICKING': 0,
        'OFFSENSE INVOLVING CHILDREN': 0,
        'ROBBERY': 1,
        'BURGLARY': 1,
        'THEFT': 1,
        'MOTOR VEHICLE THEFT': 1,
        'BATTERY': 2,
        'ASSAULT': 2,
        'HOMICIDE': 2,
        'CRIM SEXUAL ASSAULT': 2,
        'SEX OFFENSE': 2,
        'NARCOTICS': 3,
        'OTHER NARCOTIC VIOLATION': 3,
        'PUBLIC PEACE VIOLATION': 4,
        'INTERFERENCE WITH PUBLIC OFFICER': 4,
        'OBSCENITY': 4,
        'PUBLIC INDECENCY': 4,
        'INTIMIDATION': 4,
        'STALKING': 4,
        'CRIMINAL TRESPASS': 4,
        'CRIMINAL DAMAGE': 5,
        'ARSON': 5,
        'NON - CRIMINAL': 6,
        'NON-CRIMINAL': 6,
        'NON-CRIMINAL (SUBJECT SPECIFIED)': 6,
        'OTHER OFFENSE': 6,
        'DECEPTIVE PRACTICE': 6,
        'CONCEALED CARRY LICENSE VIOLATION': 7,
        'WEAPONS VIOLATION': 7,
        'PROSTITUTION': 8,
        'LIQUOR LAW VIOLATION': 8,
        'GAMBLING': 8
    },
    "CH_crime_decoding": [
        "KIDNAPPING / CHILDREN",
        "ROBBERY/BURGLARY/THEFT",
        "ASSAULT/VIOLENCE",
        "NARCOTICS",
        "PUBLIC-RELATED CRIME",
        "DAMAGE/ARSON",
        "OTHER/NON-CRIMINAL",
        "WEAPON-RELATED",
        "PROHIBITIVE CRIME"
    ],
}

def convert_crime_class_to_condensed_integer_CH(c):
    index = config["CH_crime_encoding"].get(c, 6)
    result = np.zeros(9)
    result[index] = 1
    return result
    
def convert_one_hot_encoding_to_crime_class_CH(one_hot_encoding):
    index = np.argmax(one_hot_encoding > 0)
    return config["CH_crime_decoding"][index]

# LA: ['DR Number', 'Date Reported', 'Date Occurred', 'Time Occurred', 'Area ID', 'Area Name', 'Reporting District', 'Crime Code', 'Crime Code Description', 'MO Codes', 'Victim Age', 'Victim Sex', 'Victim Descent', 'Premise Code', 'Premise Description', 'Weapon Used Code', 'Weapon Description', 'Status Code', 'Status Description', 'Crime Code 1', 'Crime Code 2', 'Crime Code 3', 'Crime Code 4', 'Address', 'Cross Street', 'Location']

class Crime:
    
    def __init__(self, city, date, day, raw_time, time_of_day, crime, victim_age, victim_sex, weapon, location):
        self.city = city
        self.date = date
        self.day = day
        self.raw_time = raw_time
        self.time_of_day = time_of_day
        self.crime = crime
        self.victim_age = victim_age
        self.victim_sex = victim_sex
        self.weapon = weapon
        self.location = location
        
    def give_context(self, name, val):
        if name == "possible crimes":
            self.crime_encoder = val[0]
            self.crime_decoder = val[1]
        
    def get_feature_vector(self):
        '''
        Returns a feature vector of [1-hot day vector, 1-hot time vector, 1-hot crime encoding, latitude, longitude]
        Returned as strings due to the crime being a string. Age, sex, and weapon are omitted for now
        since the Chicago data set does not include those.
        '''
        day_vec = self.day[0]
        time_vec = self.time_of_day[0]
        feature = np.concatenate((day_vec, time_vec))
        others = np.array([self.location[0], self.location[1]])
        feature = np.concatenate((feature, convert_crime_class_to_condensed_integer_CH(self.crime)))
        feature = np.concatenate((feature, others))
        
        return feature
        
    def __str__(self):
        return self.crime + " in the " + str(self.time_of_day) + " on " + str(self.day)
        

def load_data(force_refresh=False):
    
    #all_data = {"LA": [], "CH": []}
    all_data = {"CH": []}
    
    for city in all_data.keys():
        data_file = config["root_data"] + config[city + "_crime"]
        parse_from = config[city + "_schema"]
        with open(data_file) as csvfile:
            content = csvfile.readlines()
            content = [x.strip() for x in content]
            del content[0] # Remove the header
            all_data[city].append([])
            
            # Save the number of possible rows available
            all_data[city].append(len(content))
            
            count = 0
            for row in content[0:100]:
                try:
                    d = row.split(",")
                    new_crime = parse_from(d)
                    all_data[city][0].append(new_crime)
                except:
                    #print("Unexpected error:", sys.exc_info()[0])
                    #raise
                    count = count + 1
                    
            # Save the number of errored rows
            all_data[city].append(count)
            
    return all_data

results = load_data()
print("Finished loading data")

Finished loading data


In [35]:
# A list of all Crime objects is found at results["CH"][0]
def get_feature_matrix(data):
    '''
    Returns a matrix where rows are feature vectors of 
    [1-hot day vector, 1-hot time vector, crime, crime encoding (9 DIGITS LONG), latitude, longitude,]
    Returned as strings due to the crime being a string. Age, sex, and weapon are omitted for now
    since the Chicago data set does not include those.
    '''
    return np.array([i.get_feature_vector() for i in data])

#print(set([i.crime for i in results["CH"][0]]))

def decode_result(result):
    time = np.argmax(result[0:4] > 0)
    day = np.argmax(result[4:11] > 0)
    crime = result[11:20]
    latlong = result[20:22]
    crime_de = convert_one_hot_encoding_to_crime_class_CH(crime)
    time_de = ["MORNING","AFTERNOON","EVENING","LATE NIGHT"][time]
    day_de = ["SUN","MON","TUE","WED", "THU", "FRI", "SAT"][day]
    return (time_de, day_de, crime_de, latlong)

featurized_data = get_feature_matrix(results["CH"][0])
print(featurized_data[0])
print(decode_result(featurized_data[0]))
print("Featurized matrix obtained")

[  0.           1.           0.           0.           0.           0.           0.
   0.           0.           1.           0.           0.           0.           1.
   0.           0.           0.           0.           0.           0.
  41.86407316 -87.70681861]
('AFTERNOON', 'FRI', 'ASSAULT/VIOLENCE', array([ 41.86407316, -87.70681861]))
Featurized matrix obtained
