In [11]:
# %matplotlib inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Numeric encoding of weekday names (Sunday = 1 ... Saturday = 7) so that the
# day of the week can be used as a numeric input to the regression.
DAY_MAP = {
    name: number
    for number, name in enumerate(
        ['Sunday', 'Monday', 'Tuesday', 'Wednesday',
         'Thursday', 'Friday', 'Saturday'],
        start=1,
    )
}

# filter criteria (user entered)
# district = 'A1'
# type_crime = 'Larceny'
# user_day = 3
# user_hour = 22

def predictAmountCrimes(district, type_crime, user_day, user_hour):
    """Predict the number of `type_crime` incidents in `district` for each hour of a day.

    Reads 'cleaned_crime_data.csv' from the working directory, counts the
    matching incidents per (day of week, hour), fits the ordinary least squares
    model ``total_crimes ~ day + hour`` on those counts and evaluates it for
    hours 0-23 on `user_day`.

    Only (day, hour) combinations with at least one matching incident enter the
    regression; combinations without any incident are not represented as zero
    counts.

    Args:
        district: value matched against the 'district' column.
        type_crime: value matched against the 'offense_type' column.
        user_day: day of the week as a number, using the DAY_MAP encoding.
        user_hour: accepted for interface compatibility; not used, because
            predictions are produced for every hour of the day.

    Returns:
        A list of 24 predicted counts (index = hour of the day), or the
        integer 0 when no matching incident exists for the district.

    Raises:
        KeyError: if a matching row has a 'day_of_week' value missing from DAY_MAP.
    """
    df = pd.read_csv('cleaned_crime_data.csv')

    # incidents of the requested crime inside the requested district; rows that
    # lack a day or an hour cannot be assigned to a (day, hour) cell
    matching = df[(df['district'] == district) & (df['offense_type'] == type_crime)]
    matching = matching.dropna(subset=['day_of_week', 'hour'])

    # nothing to regress on
    if matching.empty:
        return 0

    # one row per (day, hour) cell holding the number of incidents in that cell
    counts = (matching.groupby(['day_of_week', 'hour'])
                      .size()
                      .reset_index(name='total_crimes'))
    counts['day'] = [DAY_MAP[name] for name in counts['day_of_week']]
    df_crimes = counts[['hour', 'day', 'total_crimes']]

    # perform multiple linear regression
    res = smf.ols(formula="total_crimes ~ day + hour", data=df_crimes).fit()

    # predict every hour of the requested day in a single call
    all_hours = pd.DataFrame({'day': [user_day] * 24, 'hour': list(range(24))})
    return list(res.predict(all_hours))

#     fig = plt.figure(figsize=(10,8))
#     fig = sm.graphics.plot_partregress_grid(res,fig=fig)
#     plt.show()
# predictAmountCrimes('A1','Larceny', 3, 22)
# Example run: hourly predictions (hours 0-23) for day 3 (Tuesday in DAY_MAP) in
# district 'A1'. The last argument (user_hour) is not used by the function.
predictAmountCrimes('A1', 'Rape/Sexual Assault', 3, 22)

[0.95191413649673784,
 0.96014811107234344,
 0.96838208564794903,
 0.97661606022355452,
 0.98485003479916011,
 0.99308400937476571,
 1.0013179839503712,
 1.0095519585259769,
 1.0177859331015824,
 1.0260199076771881,
 1.0342538822527936,
 1.0424878568283993,
 1.0507218314040048,
 1.0589558059796103,
 1.067189780555216,
 1.0754237551308214,
 1.0836577297064269,
 1.0918917042820326,
 1.1001256788576381,
 1.1083596534332438,
 1.1165936280088493,
 1.124827602584455,
 1.1330615771600605,
 1.141295551735666]

In [9]:
# %matplotlib inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Numeric encoding of weekday names (Sunday = 1 ... Saturday = 7) so that the
# day of the week can be used as a numeric input to the regression.
DAY_MAP = {
    name: number
    for number, name in enumerate(
        ['Sunday', 'Monday', 'Tuesday', 'Wednesday',
         'Thursday', 'Friday', 'Saturday'],
        start=1,
    )
}

# filter criteria (user entered)
# district = 'E13'
# type_crime = "Bombs/Explosives"
# user_day = 2
# user_hour = 8


def predictProbabilityOfCrime(district, type_crime, user_day, user_hour):
    """Estimate, per hour of a day, the probability that an incident in `district` is `type_crime`.

    Reads 'cleaned_crime_data.csv' from the working directory, labels every
    incident of the district with 1 when its 'offense_type' equals `type_crime`
    and 0 otherwise, fits a logistic regression of that label on the numeric
    day of the week and the hour, and evaluates it for hours 0-23 on `user_day`.

    Args:
        district: value matched against the 'district' column.
        type_crime: value matched against the 'offense_type' column.
        user_day: day of the week as a number, using the DAY_MAP encoding.
        user_hour: accepted for interface compatibility; not used, because
            predictions are produced for every hour of the day.

    Returns:
        A list of 24 predicted probabilities (index = hour of the day), or the
        integer 0 when the label does not vary within the district (no
        incidents at all, none of the requested type, or only the requested
        type), in which case no model is fitted.

    Raises:
        KeyError: if a row used for fitting has a 'day_of_week' value missing
            from DAY_MAP.
    """
    df = pd.read_csv('cleaned_crime_data.csv')

    # incidents of the requested district that carry both a day and an hour
    df_district = df[df['district'] == district]
    df_district = df_district.dropna(subset=['day_of_week', 'hour'])

    # 1 when the incident is the requested crime, 0 for any other crime
    is_crime = (df_district['offense_type'] == type_crime).astype(int)

    # a logit needs both outcomes; an all-0 or all-1 response is what triggers
    # the Perfect Separation error, so skip the regression in either case
    if is_crime.nunique() < 2:
        return 0

    df_crimes = pd.DataFrame({
        'day': [DAY_MAP[name] for name in df_district['day_of_week']],
        'hour': df_district['hour'].tolist(),
        'is_crime': is_crime.tolist(),
    })

    # Select the predictors by name: picking them by position depends on the
    # column order pandas happens to produce and can pull 'is_crime' itself
    # into the design matrix. The order [day, hour] must match the prediction
    # rows built below.
    # NOTE(review): no constant column is added, so the model is fitted without
    # an intercept -- confirm that this is intended.
    attribute_cols = ['day', 'hour']
    logit = sm.Logit(df_crimes['is_crime'], df_crimes[attribute_cols])
    result = logit.fit()

    # one [day, hour] row per hour of the requested day, predicted in one call
    all_hours = [[user_day, hour] for hour in range(24)]
    return list(result.predict(all_hours))

# predictProbabilityOfCrime('E13', "Bombs/Explosives", 2, 8)    

Optimization terminated successfully.
         Current function value: 0.018348
         Iterations 11


[0.0674458146240422,
 0.051669158240292869,
 0.039426889788557413,
 0.029993511611684575,
 0.02276369979130342,
 0.017245620656004702,
 0.013047303399565077,
 0.0098607799984407191,
 0.0074466245710720982,
 0.0056201574523698046,
 0.0042397625397196225,
 0.0031973231432931397,
 0.0024105700532649869,
 0.0018170573916781712,
 0.0013694744059256068,
 0.0010320274054760954,
 0.00077766469132824298,
 0.0005859577159197689,
 0.0004414887426993723,
 0.00033262701080937098,
 0.00025060153625634416,
 0.00018879963813191235,
 0.00014223679744829832,
 0.00010715631862031134]

In [71]:
def predictProbability(day_of_week, hour):
    """Return the prediction of the module-level `result` for one (day, hour) pair.

    NOTE(review): `result` is not a parameter and is not assigned at module
    level in the cells visible here (it only appears as a local variable of
    predictProbabilityOfCrime), so this relies on a fitted model already being
    present in the kernel -- confirm where it is supposed to come from.
    """
    features = [day_of_week, hour]
    return result.predict(features)