In [1]:
# Std lib:
import warnings
import os

# apriori algorithm lib
from apyori import apriori

# To hide environment variables
from dotenv import load_dotenv

# Data manipulation:
import numpy as np
import pandas as pd
import geopandas
from shapely.geometry import Point, Polygon

# Visualization:
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
%matplotlib inline
style.use('seaborn')

# Display all columns in Jupyter:
from IPython.display import display
pd.options.display.max_columns = None

# Filter Warnings
# NOTE(review): 'ignore' suppresses every warning category for the rest of the
# session, so deprecation and pandas copy warnings raised by later cells are hidden.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Column -> dtype mapping handed to pd.read_csv, grouped by target dtype.
datatypes = {
    # free-text / code columns
    **dict.fromkeys(
        ['block', 'iucr', 'primary_type', 'description',
         'location_description', 'fbi_code', 'location'],
        'object'),
    # true/false flags
    **dict.fromkeys(
        ['arrest', 'domestic', 'weekend', 'index_crime', 'violent_crime',
         'property_crime', 'crime_against_persons', 'crime_against_property',
         'crime_against_society'],
        'bool'),
    # whole-number columns
    **dict.fromkeys(['beat', 'community_area', 'year'], 'int64'),
    # floating point columns
    **dict.fromkeys(
        ['district', 'ward', 'x_coordinate', 'y_coordinate', 'latitude',
         'longitude', 'CF', 'CF3', 'PI', 'EUH', 'CH', 'SOI'],
        'float64'),
    # small calendar integers
    **dict.fromkeys(['month', 'hour', 'dayofweek'], 'uint8'),
    # categorical label
    'crime_code_category': 'category',
}

In [3]:
%%time
load_dotenv()
path_to_data = os.environ.get('CLEAN_DATA')
df = pd.read_csv(path_to_data, dtype=datatypes)

CPU times: user 37.5 s, sys: 4.82 s, total: 42.3 s
Wall time: 44.8 s


# Prepare Data for Mining

#### Filter out attributes that will not be considered for frequent itemset analysis

In [4]:
# Columns considered for frequent itemset mining.
attributes = [
    "crime_against_persons", "crime_against_property", "crime_against_society",
    "location_description", "description", "crime_code_category", "arrest",
    "domestic", "district", "ward", "community_area", "index_crime", "beat",
    "violent_crime", "month", "hour", "dayofweek", "weekend", "block", "year",
]

# Take an explicit copy: a later cell assigns into data['block'], and writing to a
# bare selection of df triggers pandas' SettingWithCopyWarning.
data = df[attributes].copy()

#### Remove extraneous address characters from 'block' attribute to validate its use in frequent itemset mining

In [5]:
%%time
data['block'] = data['block'].str[6:]
data['block'].head()

CPU times: user 1.99 s, sys: 160 ms, total: 2.15 s
Wall time: 2.19 s


0                        E 84TH ST
1                        W 57TH ST
2                         S MAY ST
3                  S BALTIMORE AVE
4    S DR MARTIN LUTHER KING JR DR
Name: block, dtype: object

#### Convert column values to numbers to speed up frequent itemset mining

In [6]:
%%time
###### BEWARE -- 12+ min to run this block. 

data_num = data.copy()
val_dict = {}
index = 0

for attribute in attributes:
    # Important! Reset so column vals don't cross-pollinate:
    num_dict = {}
    
    # Get all unique values for column:
    values = pd.unique(data[attribute])
    
    # For each unique value replace with an index no.
    for value in values:
        # add index no. as key to value for later reference:
        val_dict[index] = (attribute, value)
        num_dict[value] = index
        index += 1
    # Replace per column to avoid cross-pollination.
    data_num[attribute] = data_num[attribute].replace(num_dict)
    
######

# verify results
data_num.head()

CPU times: user 17min 24s, sys: 1min 6s, total: 18min 30s
Wall time: 18min 46s


Unnamed: 0,crime_against_persons,crime_against_property,crime_against_society,location_description,description,crime_code_category,arrest,domestic,district,ward,community_area,index_crime,beat,violent_crime,month,hour,dayofweek,weekend,block,year
0,0,2,4,6,220,748,774,776,778.0,802.0,852,930,932,1235,1237,1249,1273,1280,1282,4577
1,0,2,4,7,221,748,775,776,779.0,803.0,853,930,933,1235,1237,1250,1274,1280,1283,4577
2,0,2,4,8,221,748,775,776,780.0,804.0,854,930,934,1235,1237,1251,1275,1280,1284,4577
3,0,2,4,7,220,748,774,776,778.0,805.0,852,930,935,1235,1237,1252,1275,1280,1285,4577
4,0,2,4,9,221,748,774,777,781.0,806.0,855,930,936,1235,1237,1253,1276,1281,1286,4577


In order to use the apyori library, we will need to be able to convert dataframes into a list of lists:

In [7]:
def convert_df_2_list(df):
    """Return the rows of ``df`` as a list of lists (one inner list per row)."""
    row_matrix = df.to_numpy()
    return row_matrix.tolist()

We also need a means of formatting and returning our results as a dataframe with human-readable data:

In [8]:
def df_results(association_results):
    """Turn apriori result records into a human-readable DataFrame of rules.

    Each record is read positionally: ``rule[0]`` is the itemset, ``rule[1]`` its
    support and ``rule[2][0]`` the first ordered statistic, read as
    ``(base items, added items, confidence, lift)``. Encoded ids are translated
    back through the notebook-level ``val_dict`` (id -> (attribute, value)).

    Parameters
    ----------
    association_results : iterable
        Records as returned by ``apriori``.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``rule``, ``lift``, ``confidence``
        and ``support``, followed by one column per attribute that occurs in any
        itemset (missing where the attribute is not part of that row's itemset).
        When there are no records an empty frame with the four fixed columns is
        returned instead of raising.
    """
    rows = []
    for rule in association_results:
        # NOTE(review): only the first ordered statistic of each record is used;
        # any further statistics for the same itemset are not reported.
        first_stat = rule[2][0]

        # create a readable rule from the encoded ids
        base = [val_dict[item][1] for item in first_stat[0]]
        add = val_dict[list(first_stat[1])[0]][1]

        row = {
            'rule': str(base) + ' ==> ' + str(add),
            # support, confidence, and lift of rule
            'lift': first_stat[3],
            'confidence': first_stat[2],
            'support': rule[1],
        }

        # components of rule, one column per attribute
        for item in rule[0]:
            attribute, value = val_dict[item]
            row[attribute] = value

        rows.append(row)

    if not rows:
        # pd.concat / DataFrame construction on nothing would lose the schema (or raise);
        # hand back an empty, correctly-labelled frame so callers can still use .shape.
        return pd.DataFrame(columns=['rule', 'lift', 'confidence', 'support'])

    # build the frame once from all rows instead of concatenating one-row frames
    return pd.DataFrame(rows)

And finally we want a wrapper function that converts the dataframe, applies the apriori class, and converts the results to a dataframe:

In [9]:
def use_apriori(df, supp, conf, lift):
    """Mine ``df`` with apriori and return the rules as a readable DataFrame.

    ``supp``, ``conf`` and ``lift`` are passed to ``apriori`` as ``min_support``,
    ``min_confidence`` and ``min_lift``. The frame is first converted with
    ``convert_df_2_list`` and the results are formatted by ``df_results``.
    """
    rule_generator = apriori(
        convert_df_2_list(df),
        min_support=supp,
        min_confidence=conf,
        min_lift=lift,
    )
    # materialise the results before formatting them
    return df_results(list(rule_generator))

CITATION:
Regarding use of the apyori lib see our references here:
- https://github.com/ymoch/apyori
- https://stackabuse.com/association-rule-mining-via-apriori-algorithm-in-python/
- pydoc entry:<br>
    ```
    apyori.apriori = apriori(transactions, **kwargs)
        Executes Apriori algorithm and returns a RelationRecord generator.

        Arguments:
            transactions -- A transaction iterable object
                            (eg. [['A', 'B'], ['B', 'C']]).

        Keyword arguments:
            min_support -- The minimum support of relations (float).
            min_confidence -- The minimum confidence of relations (float).
            min_lift -- The minimum lift of relations (float).
            max_length -- The maximum length of the relation (integer).
    ```

# Parameters and Apriori

#### Identify interesting subsets of the data

Because we have chosen twenty attributes to mine for frequent itemsets, many of which vary in form and frequency, it would be impractical to mine them all at once. Instead we need to select subsets of our attributes. However, with twenty total attributes we have 1,048,575 possible combinations. So, we need to be selective. 
<br><br>
There are several ways we can look at our attributes when deciding which to combine. One is to look at the number of unique values each attribute contains. This has the added benefit of helping us later when we are setting the minimum support, confidence, and lift values as parameters to the apriori algorithm. Even better, we can also take a look at how often the two least frequent unique values occur, which will give us a major clue as to how far we may need to lower the minimum support in order for certain unique values not to be automatically omitted from our results. Given that we have 6,508,416 rows of data total, if some of these attributes have very low frequency counts for unique values, then we may need to work our way down to very low support levels for certain columns in order to reveal frequent itemsets that may still have high confidence and lift. 

In [10]:
# Per attribute: number of distinct values plus the counts (and share of all rows)
# of its two least frequent values, as a guide for choosing min_support.
total_rows = len(data)  # denominator for the percentage columns (was hard-coded)

unq, frq1, frq2, perc1, perc2 = [], [], [], [], []
for attribute in attributes:
    unq.append(len(pd.unique(data[attribute])))
    # One ascending count per attribute: positions 0 and 1 are the two rarest values.
    counts = data[attribute].value_counts(ascending=True)
    f1, f2 = counts.iloc[0], counts.iloc[1]
    frq1.append(f1)
    frq2.append(f2)
    perc1.append(str(round((f1 / total_rows) * 100, 4)) + '%')
    perc2.append(str(round((f2 / total_rows) * 100, 4)) + '%')

summary = pd.DataFrame({'attributes': attributes,
                        'unique vals': unq,
                        'count least frequent': frq1,
                        '% of total': perc1,
                        'count 2nd least': frq2,
                        '2nd % of total': perc2})
display(summary.set_index('attributes').sort_values(by=['unique vals']))

Unnamed: 0_level_0,unique vals,count least frequent,% of total,count 2nd least,2nd % of total
attributes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
crime_against_persons,2,1713049,26.3205%,4795367,73.6795%
crime_against_property,2,3200474,49.1744%,3307942,50.8256%
crime_against_society,2,1543323,23.7127%,4965093,76.2873%
weekend,2,1817210,27.9209%,4691206,72.0791%
arrest,2,1762303,27.0773%,4746113,72.9227%
domestic,2,881185,13.5392%,5627231,86.4608%
violent_crime,2,565845,8.6941%,5942571,91.3059%
index_crime,2,2622479,40.2937%,3885937,59.7063%
dayofweek,7,884921,13.5966%,918481,14.1122%
month,12,430112,6.6086%,483102,7.4227%



Finally, another way we can also look at the different attributes to decide what combinations will be most useful is in terms of what information they provide given the context. By doing that, we can roughly divide them into three groups: the WHAT, WHERE, and WHEN of a crime. 

| WHAT | WHERE | WHEN |
|:------|:-------|:------|
|crime_against_persons|district|month|
|crime_against_property|ward|hour|
|description|community_area|dayofweek|
|location_description|beat|weekend|
|crime_code_category|block|year|
|arrest|||
|domestic|||
|index_crime|||
|violent_crime|||

With this information in hand, we can make some more informed decisions about which subsets will be the most salient and/or provide the best results due to similarities in the unique value counts and contextual differences and similarities. We will want to avoid including attributes that are too similar to each other, or otherwise overlap, while also avoiding combinations of attributes that are too different in terms of their unique item counts.<br>
<br>
Using these criteria to narrow data subsets down, we define the list of candidates we will consider here:

In [14]:
##### Create dataframe subsets for frequent itemset mining
# Name legend: bk=block, bt=beat, dn=description, ld=location_description,
# ca=community_area, hr=hour, ccc=crime_code_category, cape/capo/cas=crime against
# persons/property/society, wkn=weekend, arr=arrest, dom=domestic, idx=index_crime,
# vc=violent_crime, yr=year, mon=month, dow=dayofweek, dc=district, wd=ward.

# column groups shared by several subsets
_crime_against = ["crime_against_persons", "crime_against_property", "crime_against_society"]
_flags = ["arrest", "domestic", "index_crime", "violent_crime"]
_category_and_time = ["crime_code_category", "hour", "year", "month", "dayofweek"]

# two-column subsets
bk_dn = data_num[["block", "description"]]
bk_ld = data_num[["block", "location_description"]]
bt_dn = data_num[["beat", "description"]]
bt_ld = data_num[["beat", "location_description"]]
ld_dn = data_num[["description", "location_description"]]
ca_dn = data_num[["community_area", "description"]]
ca_ld = data_num[["community_area", "location_description"]]
ca_hr = data_num[["community_area", "hour"]]

# three-column subset
ld_ca_ccc = data_num[["community_area", "location_description", "crime_code_category"]]

# one context column plus the three crime-against indicators
ca_cape_capo_cas = data_num[["community_area"] + _crime_against]
ld_cape_capo_cas = data_num[["location_description"] + _crime_against]
wkn_cape_capo_cas = data_num[["weekend"] + _crime_against]
hr_cape_capo_cas = data_num[["hour"] + _crime_against]

# boolean flag subsets
arr_dom_idx_vc = data_num[_flags]
ca_cape_capo_cas_arr = data_num[["community_area"] + _crime_against + ["arrest"]]
ca_arr_dom_idx_vc = data_num[["community_area"] + _flags]

# crime category and time columns, plus one geographic column
ccc_hr_yr_mon_dow_dc = data_num[_category_and_time + ["district"]]
ccc_hr_yr_mon_dow_wd = data_num[_category_and_time + ["ward"]]
ccc_hr_yr_mon_dow_ca = data_num[_category_and_time + ["community_area"]]

#####

#### Applying Apriori
Finally, with the subsets of data we most want to explore prepared, we can begin iteratively applying the apriori algorithm, adjusting the parameters as we go, in pursuit of interesting information. The final formula for applying apriori to a subset of the data is provided here, using the ld_ca_ccc data subset as our example:

In [15]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ld_ca_ccc
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
demo_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(demo_results.shape[0], "rules found.")
print()

# display results
demo_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

8 rules found.

CPU times: user 23.6 s, sys: 1.3 s, total: 24.9 s
Wall time: 25.3 s


Unnamed: 0,rule,lift,confidence,support,crime_code_category,location_description,community_area
6,['RESIDENCE-GARAGE'] ==> Burglary,8.856286,0.505816,0.009495,Burglary,RESIDENCE-GARAGE,
7,"[25, 'SIDEWALK'] ==> Drug Abuse",4.930013,0.46883,0.005493,Drug Abuse,SIDEWALK,25.0
5,['DEPARTMENT STORE'] ==> Larceny,3.866943,0.819444,0.010368,Larceny,DEPARTMENT STORE,
3,['Prostitution'] ==> STREET,3.19793,0.819378,0.007665,Prostitution,STREET,
0,['SIDEWALK'] ==> Drug Abuse,3.182014,0.302601,0.03052,Drug Abuse,SIDEWALK,
1,['Robbery'] ==> SIDEWALK,3.149042,0.31761,0.011989,Robbery,SIDEWALK,
2,['Motor Vehicle Theft'] ==> STREET,3.074678,0.787798,0.035736,Motor Vehicle Theft,STREET,
4,['GROCERY FOOD STORE'] ==> Larceny,3.025541,0.641142,0.008058,Larceny,GROCERY FOOD STORE,


# Results
Applying the apriori algorithm is an extremely iterative process. Besides the many, many different subsets of data we can choose from, there is an almost infinite number of tweaks that can be made to the minimum support, minimum confidence, and minimum lift parameters for each one. Each result must then be interpreted by a human user to assess its validity, usefulness, and interestingness. Furthermore, it must be noted that there will always be inherent biases in the data, hence there will be biases in the results. Whether that bias is due to some information that is missing, or to how the information itself was originally gathered, we cannot take any result as the ultimate truth. The results shown below are therefore not exhaustive; they are only meant to show some of the more interesting results we found and represent a small sliver of what is possible. The parameters for each result provided below have been fine-tuned for each case, but in each case other possibilities exist depending on the information that one wants to show. Copy and paste these into the Sandbox section to make adjustments and drill down on information you would like to explore further. 

### block, description

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = bk_dn
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
bk_dn_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(bk_dn_results.shape[0], "rules found.")
print()

# display results
#bk_dn_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### block, location_description

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = bk_ld
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
bk_ld_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(bk_ld_results.shape[0], "rules found.")
print()

# display results
#bk_ld_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### beat, description

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = bt_dn
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
bt_dn_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(bt_dn_results.shape[0], "rules found.")
print()

# display results
#bt_dn_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### beat, location_description

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = bt_ld
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
bt_ld_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(bt_ld_results.shape[0], "rules found.")
print()

# display results
#bt_ld_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### description, location_description

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ld_dn
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ld_dn_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ld_dn_results.shape[0], "rules found.")
print()

# display results
#ld_dn_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### community area, description

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ca_dn
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ca_dn_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ca_dn_results.shape[0], "rules found.")
print()

# display results
#ca_dn_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### community area, location description

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ca_ld
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ca_ld_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ca_ld_results.shape[0], "rules found.")
print()

# display results
#ca_ld_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### community area, hour

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ca_hr
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ca_hr_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ca_hr_results.shape[0], "rules found.")
print()

# display results
#ca_hr_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### community area, location description, crime code category

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ld_ca_ccc
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ld_ca_ccc_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ld_ca_ccc_results.shape[0], "rules found.")
print()

# display results
#ld_ca_ccc_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### community area, crime against persons, crime against property, crime against society

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ca_cape_capo_cas
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ca_cape_capo_cas_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ca_cape_capo_cas_results.shape[0], "rules found.")
print()

# display results
#ca_cape_capo_cas_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### location description, crime against persons, crime against property, crime against society

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ld_cape_capo_cas
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ld_cape_capo_cas_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ld_cape_capo_cas_results.shape[0], "rules found.")
print()

# display results
#ld_cape_capo_cas_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### weekend, crime against persons, crime against property, crime against society

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = wkn_cape_capo_cas
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
wkn_cape_capo_cas_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(wkn_cape_capo_cas_results.shape[0], "rules found.")
print()

# display results
#wkn_cape_capo_cas_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### hour, crime against persons, crime against property, crime against society

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = hr_cape_capo_cas
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
hr_cape_capo_cas_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(hr_cape_capo_cas_results.shape[0], "rules found.")
print()

# display results
#hr_cape_capo_cas_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### arrest, domestic crime, index crime (more serious crime), violent crime

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = arr_dom_idx_vc
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
arr_dom_idx_vc_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(arr_dom_idx_vc_results.shape[0], "rules found.")
print()

# display results
#arr_dom_idx_vc_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### community area, crime against persons, crime against property, crime against society, arrest

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ca_cape_capo_cas_arr
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ca_cape_capo_cas_arr_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ca_cape_capo_cas_arr_results.shape[0], "rules found.")
print()

# display results
#ca_cape_capo_cas_arr_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### community area, arrest, domestic crime, index crime, violent crime. 

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ca_arr_dom_idx_vc
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ca_arr_dom_idx_vc_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ca_arr_dom_idx_vc_results.shape[0], "rules found.")
print()

# display results
#ca_arr_dom_idx_vc_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### crime code category, hour, year, month, day of the week (integers), district

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ccc_hr_yr_mon_dow_dc
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ccc_hr_yr_mon_dow_dc_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ccc_hr_yr_mon_dow_dc_results.shape[0], "rules found.")
print()

# display results
#ccc_hr_yr_mon_dow_dc_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### crime code category, hour, year, month, day of the week, ward

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ccc_hr_yr_mon_dow_wd
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ccc_hr_yr_mon_dow_wd_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ccc_hr_yr_mon_dow_wd_results.shape[0], "rules found.")
print()

# display results
#ccc_hr_yr_mon_dow_wd_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

### crime code category, hour, year, month, day of week, community area

In [None]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ccc_hr_yr_mon_dow_ca
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
ccc_hr_yr_mon_dow_ca_results = use_apriori(dataframe_to_analyze, minimum_support, minimum_confidence, minimum_lift)
print(ccc_hr_yr_mon_dow_ca_results.shape[0], "rules found.")
print()

# display results
#ccc_hr_yr_mon_dow_ca_results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

# Conclusion
Given the wide range of results, it would be impossible to fully interpret the results we have here. However, we can highlight some of the more interesting observations. In some cases the reason behind what we have found is immediately apparent, and the results only serve to confirm things we already know. Some insights raise interesting questions that could, and perhaps should, be explored further.

### Observations:


### Future work:
Many of the results we have found can be built upon by downsampling the data to explore subsets of the data rows (not just columns) or by applying other data mining methodologies given the knowledge we obtained here.

# SANDBOX
Use this space to apply apriori method to any one of the data subsets we have provided or any that you would prefer to define yourself. Tweak the parameters to explore different results and make judgements on their validity based on what you know about the inputs as they relate to the data. 