In [44]:
# %matplotlib inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt

# day map used to convert days to numbers for linear regression
DAY_MAP = {'Sunday': 1, \
          'Monday': 2, \
          'Tuesday': 3, \
          'Wednesday': 4, \
          'Thursday': 5, \
          'Friday': 6, \
          'Saturday': 7}

# filter criteria (user entered)
# district = 'A1'
# type_crime = 'Larceny'
# user_day = 3
# user_hour = 22

def predictAmountCrimes(district, type_crime, user_day, user_hour):
    df = pd.read_csv('cleaned_crime_data.csv')
    
    # look at only data relevant to this district
    df_district = df[df['district'] == district]
    
    # get all possible hours and days of the week 
    hours = list(set(df_district['hour'].tolist()))
    days_of_week = list(set(df_district['day_of_week'].tolist()))
    
    # iterate through filtered dataframes to find when and how much the specified crimes occur
    relation_list = []
    for day in days_of_week:
        df_by_day = df_district[df_district['day_of_week'] == day]
        for hour in hours: 
            df_by_day_hour = df_by_day[df_by_day['hour'] == hour]
            count = 0
            for ind, row in df_by_day_hour.iterrows():
                if row['offense_type'] == type_crime:
                    count += 1
                relation_list.append({'hour': hour, 'day': DAY_MAP[day],'total_crimes': count})
    
    # convert to a dataframe to do linear regression as a predictor for total occurences of that crime
    df_crimes = pd.DataFrame.from_records(relation_list)
    
    # if nothing was found in the dataframe do do regression
    if not df_crimes.shape[0]:
        return 0
    else: 
        # perform multiple linear regression
        res = smf.ols(formula="total_crimes ~ day + hour", data=df_crimes).fit()
        print(res.summary())
        
#         fig = plt.figure(figsize=(12,8))
#         fig = sm.graphics.plot_partregress_grid(res, fig=fig)
#         plt.show()

        # make a prediction based upon the day and hour for a specific crime within a district
        day_predictions = []
        for i in range(24):
            day_predictions.append(res.predict(exog={'day':user_day, 'hour':i})[0])
        return day_predictions

#     fig = plt.figure(figsize=(10,8))
#     fig = sm.graphics.plot_partregress_grid(res,fig=fig)
#     plt.show()
# predictAmountCrimes('A1','Larceny', 3, 22)
predictAmountCrimes('A1', 'Public Drinking', 1, 23)

                            OLS Regression Results                            
Dep. Variable:           total_crimes   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     14.24
Date:                Sat, 10 Dec 2016   Prob (F-statistic):           6.90e-07
Time:                        16:06:01   Log-Likelihood:                -1874.7
No. Observations:                3957   AIC:                             3755.
Df Residuals:                    3954   BIC:                             3774.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.1720      0.017     10.089      0.0

[0.15621032936833079,
 0.15707489102717587,
 0.15793945268602091,
 0.15880401434486599,
 0.15966857600371107,
 0.16053313766255611,
 0.16139769932140119,
 0.16226226098024626,
 0.16312682263909131,
 0.16399138429793639,
 0.16485594595678144,
 0.16572050761562651,
 0.16658506927447159,
 0.16744963093331663,
 0.16831419259216171,
 0.16917875425100679,
 0.17004331590985183,
 0.17090787756869691,
 0.17177243922754198,
 0.17263700088638703,
 0.17350156254523211,
 0.17436612420407716,
 0.17523068586292223,
 0.17609524752176731]

In [39]:
# %matplotlib inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt

# day map used to convert days to numbers for linear regression
DAY_MAP = {'Sunday': 1, \
          'Monday': 2, \
          'Tuesday': 3, \
          'Wednesday': 4, \
          'Thursday': 5, \
          'Friday': 6, \
          'Saturday': 7}

# filter criteria (user entered)
# district = 'E13'
# type_crime = "Bombs/Explosives"
# user_day = 2
# user_hour = 8


def predictProbabilityOfCrime(district, type_crime, user_day, user_hour):
    # flag to avoid Perfect Seperation error (when all is_crime is either 0 or 1)
    not_available = True

    df = pd.read_csv('cleaned_crime_data.csv')
    
    # filter inital df with the specified district
    df_district = df[df['district'] == district]
    
    # find all possible hours and days for this district
    hours = list(set(df_district['hour'].tolist()))
    days_of_week = list(set(df_district['day_of_week'].tolist()))
    
    # iterate through filtered dataframes to find when the exact crimes occured in comparison to other crimes
    relation_list = []
    for day in days_of_week:
        df_by_day = df_district[df_district['day_of_week'] == day]
        for hour in hours: 
            df_by_day_hour = df_by_day[df_by_day['hour'] == hour]
            for ind, row in df_by_day_hour.iterrows():
                is_crime = 0
                if row['offense_type'] == type_crime:
                    not_available = False # switch to false, results are varied now
                    is_crime = 1    
                relation_list.append({'is_crime': is_crime, 'hour': row['hour'], 'day': DAY_MAP[row['day_of_week']]})
    
    # convert to a dataframe to perform logistic regression
    df_crimes = pd.DataFrame.from_records(relation_list)
    
    # if variation in is_crimes not found do not perform regression
    if not_available:
        return 0
    else:
        # get the logistic model
        attribute_cols = df_crimes.columns[:2]
        logit = sm.Logit(df_crimes['is_crime'], df_crimes[attribute_cols])
        result = logit.fit()
        print(result.summary())
        
        prediction_day=[]
        # make a prediction
        for i in range(24):
            prediction_day.append(result.predict([user_day, i])[0])
            
        return prediction_day

predictProbabilityOfCrime('A1', "Harassment", 6, 23)    

Optimization terminated successfully.
         Current function value: 0.143228
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               is_crime   No. Observations:                 3957
Model:                          Logit   Df Residuals:                     3955
Method:                           MLE   Df Model:                            1
Date:                Sat, 10 Dec 2016   Pseudo R-squ.:                 -0.3678
Time:                        16:04:57   Log-Likelihood:                -566.75
converged:                       True   LL-Null:                       -414.34
                                        LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
day           -0.6879      0.044    -15.699      0.000        -0.774    -0.602
hour          -0.1108      0.

[0.015865322299298421,
 0.014225515829485136,
 0.012753000008043917,
 0.011431140163651331,
 0.010244870471122264,
 0.0091805629938781028,
 0.0082259041514576563,
 0.0073697789883985449,
 0.0066021633785807183,
 0.0059140241154004404,
 0.0052972267021190976,
 0.0047444505591634299,
 0.0042491112982408364,
 0.0038052896703679433,
 0.0034076667709162403,
 0.0030514650751596699,
 0.0027323948790003191,
 0.0024466057286879417,
 0.0021906424381577072,
 0.0019614053112925844,
 0.0017561142075717229,
 0.0015722761121145191,
 0.0014076558942522873,
 0.0012602499618431227]

In [71]:
def predictProbability(day_of_week, hour):
    return result.predict([day_of_week, hour])