In [83]:
%matplotlib inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Maps day names to numeric codes so they can be used as a regressor.
DAY_MAP = {
    'Sunday': 1,
    'Monday': 2,
    'Tuesday': 3,
    'Wednesday': 4,
    'Thursday': 5,
    'Friday': 6,
    'Saturday': 7,
}

# filter criteria (user entered)
district = 'A1'
type_crime = ['Bombs/Explosives']
user_day = 3
user_hour = 22


def count_crimes_by_day_hour(df_district, crime_types, day_map=DAY_MAP):
    """Count the selected crimes for every (day of week, hour) combination.

    Parameters
    ----------
    df_district : pandas.DataFrame
        Rows for one district; must contain the columns 'day_of_week',
        'hour' and 'offense_type'.
    crime_types : list
        Offense types to count; a row matches when its 'offense_type'
        equals one of these values.
    day_map : dict, optional
        Mapping from day name to numeric code (defaults to DAY_MAP).

    Returns
    -------
    pandas.DataFrame
        One row per (day, hour) pair with at least one matching crime,
        with columns 'hour', 'day' (numeric code) and 'total_crimes'.
        Pairs with zero matching crimes are omitted. The frame is empty
        when nothing matches.

    Raises
    ------
    KeyError
        If a matching crime falls on a day name missing from `day_map`.
    """
    out_columns = ['hour', 'day', 'total_crimes']

    matching = df_district[df_district['offense_type'].isin(crime_types)]
    if matching.empty:
        return pd.DataFrame(columns=out_columns)

    # One pass over the matching rows instead of re-filtering the frame for
    # every (day, hour) pair and walking it with iterrows(). Rows with a
    # missing day or hour are dropped by groupby, as the equality filters
    # in the loop-based version did.
    counts = (
        matching.groupby(['day_of_week', 'hour'])
        .size()
        .reset_index(name='total_crimes')
    )

    unknown_days = set(counts['day_of_week']) - set(day_map)
    if unknown_days:
        raise KeyError('day_of_week values missing from day map: %s'
                       % sorted(str(d) for d in unknown_days))

    counts['day'] = counts['day_of_week'].map(day_map)
    return counts[out_columns]


if __name__ == "__main__":
    df = pd.read_csv('cleaned_crime_data.csv')

    # look at only data relevant to this district
    df_district = df[df['district'] == district]

    # find when and how often the specified crimes occur, as a dataframe that
    # feeds the linear regression predicting total occurrences of that crime
    df_crimes = count_crimes_by_day_hour(df_district, type_crime)

    # if nothing was found in the dataframe, do not do the regression
    if df_crimes.empty:
        print('Data was not found')
    else:
        # NOTE(review): when every (day, hour) pair has the same count the
        # response is constant and the fit is degenerate (R-squared of -inf
        # in the summary) - consider checking the variance of total_crimes.
        # perform multiple linear regression
        res = smf.ols(formula="total_crimes ~ day + hour", data=df_crimes).fit()
        print(res.summary())

        # make a prediction based upon the day and hour for a specific crime within a district
        print(res.predict(exog={'day': user_day, 'hour': user_hour}))

#     fig = plt.figure(figsize=(10,8))
#     fig = sm.graphics.plot_partregress_grid(res,fig=fig)
#     plt.show()
    

                            OLS Regression Results                            
Dep. Variable:           total_crimes   R-squared:                        -inf
Model:                            OLS   Adj. R-squared:                   -inf
Method:                 Least Squares   F-statistic:                    -3.500
Date:                Sat, 19 Nov 2016   Prob (F-statistic):               1.00
Time:                        20:25:30   Log-Likelihood:                 336.97
No. Observations:                  10   AIC:                            -667.9
Df Residuals:                       7   BIC:                            -667.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.0000   7.73e-16   1.29e+15      0.0

  "anyway, n=%i" % int(n))


In [85]:
%matplotlib inline
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Maps day names to numeric codes so they can be used as a regressor.
DAY_MAP = {
    'Sunday': 1,
    'Monday': 2,
    'Tuesday': 3,
    'Wednesday': 4,
    'Thursday': 5,
    'Friday': 6,
    'Saturday': 7,
}

# filter criteria (user entered)
district = 'B3'
type_crime = ["Arson"]
user_day = 1
user_hour = 22

# flag to avoid the Perfect Separation error (when is_crime is all 0 or all 1)
not_available = True


def build_crime_indicator_frame(df_district, crime_types, day_map=DAY_MAP):
    """Build the per-incident design frame for the logistic regression.

    Parameters
    ----------
    df_district : pandas.DataFrame
        Rows for one district; must contain the columns 'day_of_week',
        'hour' and 'offense_type'.
    crime_types : list
        Offense types treated as the positive class; a row is positive when
        its 'offense_type' equals one of these values.
    day_map : dict, optional
        Mapping from day name to numeric code (defaults to DAY_MAP).

    Returns
    -------
    pandas.DataFrame
        One row per incident with columns 'day' (numeric code), 'hour' and
        'is_crime' (1 when the offense is in `crime_types`, else 0), in that
        column order. Incidents with a missing day or hour are left out.

    Raises
    ------
    KeyError
        If an incident has a day name that is missing from `day_map`.
    """
    # Rows without a day or hour cannot be used as observations.
    usable = df_district.dropna(subset=['day_of_week', 'hour'])

    unknown_days = set(usable['day_of_week']) - set(day_map)
    if unknown_days:
        raise KeyError('day_of_week values missing from day map: %s'
                       % sorted(str(d) for d in unknown_days))

    frame = pd.DataFrame(
        {
            'day': usable['day_of_week'].map(day_map),
            'hour': usable['hour'],
            'is_crime': usable['offense_type'].isin(crime_types).astype(int),
        },
        columns=['day', 'hour', 'is_crime'],
    )
    return frame.reset_index(drop=True)


if __name__ == "__main__":
    df = pd.read_csv('cleaned_crime_data.csv')

    # filter initial df with the specified district
    df_district = df[df['district'] == district]

    # mark, for every incident, whether it is one of the requested crimes
    df_crimes = build_crime_indicator_frame(df_district, type_crime)

    # Logit needs both classes present: all zeros AND all ones are degenerate.
    not_available = df_crimes['is_crime'].nunique() < 2

    # if there is no variation in is_crime, do not perform the regression
    if not_available:
        print('Data was not found')
    else:
        # Select predictors by name. Positional slicing (columns[:2]) depends
        # on how pandas orders dict keys and can pick up 'is_crime' itself.
        # The order [day, hour] must match the order passed to predict().
        attribute_cols = ['day', 'hour']
        # NOTE(review): sm.Logit does not add an intercept on its own, so this
        # model is fit through the origin. Adding one (sm.add_constant) would
        # also require every predict() caller to pass the constant column.
        logit = sm.Logit(df_crimes['is_crime'], df_crimes[attribute_cols])
        result = logit.fit()
        print(result.summary())

        # make a prediction
        print(result.predict([user_day, user_hour]))
    

Optimization terminated successfully.
         Current function value: 0.016563
         Iterations 12
                           Logit Regression Results                           
Dep. Variable:               is_crime   No. Observations:                 3182
Model:                          Logit   Df Residuals:                     3180
Method:                           MLE   Df Model:                            1
Date:                Sat, 19 Nov 2016   Pseudo R-squ.:                 -0.7159
Time:                        20:30:07   Log-Likelihood:                -52.702
converged:                       True   LL-Null:                       -30.713
                                        LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
day           -1.9239      0.290     -6.636      0.000        -2.492    -1.356
hour          -0.2075      0

In [71]:
def predictProbability(day_of_week, hour, fitted_model=None):
    """Return the fitted model's prediction for one (day, hour) observation.

    Parameters
    ----------
    day_of_week :
        Day value, passed as the first exogenous value.
        Presumably the numeric day code used when fitting - verify.
    hour :
        Hour value, passed as the second exogenous value.
    fitted_model : optional
        Object exposing ``predict(exog)``. When omitted, the notebook-level
        ``result`` is used, so the cell that fits the model must have been
        run first.

    Returns
    -------
    Whatever ``predict`` returns for the exogenous values
    ``[day_of_week, hour]``.
    """
    # Fall back to the global only when no model is supplied, so the function
    # can be used (and checked) without relying on hidden kernel state.
    model = result if fitted_model is None else fitted_model
    return model.predict([day_of_week, hour])