In [2]:
# Std lib:
import warnings
import os

# apriori algorithm lib
from apyori import apriori

# To hide environment variables
from dotenv import load_dotenv

# Data manipulation:
import numpy as np
import pandas as pd
import geopandas
from shapely.geometry import Point, Polygon

# Visualization:
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
%matplotlib inline
style.use('seaborn')

# Display all columns in Jupyter:
from IPython.display import display
pd.options.display.max_columns = None

# Filter Warnings
warnings.filterwarnings('ignore')

In [3]:
datatypes = {'block': 'object',
             'iucr': 'object',
             'primary_type': 'object',
             'description': 'object',
             'location_description': 'object',
             'arrest': 'bool',
             'domestic': 'bool',
             'beat': 'int64',
             'district': 'float64',
             'ward': 'float64',
             'community_area': 'int64',
             'fbi_code': 'object',
             'x_coordinate': 'float64',
             'y_coordinate': 'float64',
             'year': 'int64',
             'latitude': 'float64',
             'longitude': 'float64',
             'location': 'object',
             'month': 'uint8',
             'hour': 'uint8',
             'dayofweek': 'uint8',
             'weekend': 'bool',
             'CF': 'float64',
             'CF3': 'float64',
             'PI': 'float64',
             'EUH': 'float64',
             'CH': 'float64',
             'SOI': 'float64',
             'crime_code_category': 'category',
             'index_crime': 'bool',
             'violent_crime': 'bool',
             'property_crime': 'bool',
             'crime_against_persons': 'bool',
             'crime_against_property': 'bool',
             'crime_against_society': 'bool'}

In [4]:
%%time
load_dotenv()
path_to_data = os.environ.get('CLEAN_DATA')
df = pd.read_csv(path_to_data, dtype=datatypes)

CPU times: user 34.8 s, sys: 5.27 s, total: 40.1 s
Wall time: 43.6 s


### Filter out attributes that will not be considered for frequent itemset analysis

In [5]:
attributes = [ "crime_against_persons", "crime_against_property", "crime_against_society", \
              "location_description", "beat", "crime_code_category", \
              "domestic", "district", "ward", "community_area", \
              "month", "hour", "dayofweek", "weekend", "block"]
itemset = df[attributes]

### Remove extraneous address characters from 'block' attribute to validate its use in frequent itemset mining

In [6]:
%%time
itemset['block'] = itemset['block'].str[6:]
itemset['block'].head()

CPU times: user 1.85 s, sys: 132 ms, total: 1.98 s
Wall time: 1.99 s


0                        E 84TH ST
1                        W 57TH ST
2                         S MAY ST
3                  S BALTIMORE AVE
4    S DR MARTIN LUTHER KING JR DR
Name: block, dtype: object

### Convert column values to numbers to speed up frequent itemset mining

In [7]:
%%time
# BEWARE -- 12+ min to run this block. 
itemset_num = itemset.copy()
val_dict = {}
index = 0

for attribute in attributes:
    # Important! Reset so we don't replace vals with index# from other columns:
    num_dict = {}
    
    # Get all unique values for column:
    values = pd.unique(itemset[attribute])
    
    # For each unique value replace with an index no.
    for value in values:
        # add index no. as key to value for later reference:
        val_dict[index] = (attribute, value)
        num_dict[value] = index
        index += 1
    # Replace values with index key. MUST be done per column to avoid cross-pollination
    itemset_num[attribute] = itemset_num[attribute].replace(num_dict)

# verify results
itemset_num.head()

CPU times: user 11min 16s, sys: 51.5 s, total: 12min 7s
Wall time: 13min 28s


Unnamed: 0,crime_against_persons,crime_against_property,crime_against_society,location_description,beat,crime_code_category,domestic,district,ward,community_area,month,hour,dayofweek,weekend,block
0,0,2,4,6,220,523,549,551.0,575.0,625,703,715,739,746,748
1,0,2,4,7,221,523,549,552.0,576.0,626,703,716,740,746,749
2,0,2,4,8,222,523,549,553.0,577.0,627,703,717,741,746,750
3,0,2,4,7,223,523,549,551.0,578.0,625,703,718,741,746,751
4,0,2,4,9,224,523,550,554.0,579.0,628,703,719,742,747,752


### Preparing to identify association rules using apriori algorithm

To get an idea where to start with setting our support, confidence, & lift values it will help to know how many unique values each attribute contains. Lower numbers will likely lead to more matches, in particular when using the higher level boolean values. So, we will want to have higher support and confidence when calculating frequent itemsets using these attributes. 

In [8]:
for attribute in attributes:
    print("unique values in", attribute, ":", len(pd.unique(itemset[attribute])))

unique values in crime_against_persons : 2
unique values in crime_against_property : 2
unique values in crime_against_society : 2
unique values in location_description : 214
unique values in beat : 303
unique values in crime_code_category : 26
unique values in domestic : 2
unique values in district : 24
unique values in ward : 50
unique values in community_area : 78
unique values in month : 12
unique values in hour : 24
unique values in dayofweek : 7
unique values in weekend : 2
unique values in block : 3295


Because of the variety in our attributes, we will not want to perform our algorithm on just all of them. Instead we will want to define some subsets that might be logical, which we can evolve later based on our observations. Using what we know about the attribute values we define the subsets the at least initially appear the most salient here:

In [9]:
#####
bl_ld_ccc = itemset_num[["block", "location_description", "crime_code_category"]]
bt_ld_ccc = itemset_num[["beat", "location_description", "crime_code_category"]]
ca_ld_ccc = itemset_num[["community_area", "location_description", "crime_code_category"]]
#####

In order to use the apyori library, we will need to convert dataframes into list of lists using a function:

In [10]:
def convert_df_2_list(df):
    return df.to_numpy().tolist()

It will also help to have a function that applies the apriori class and returns the results

In [11]:
def use_apriori(records, supp, conf, lift):
    association_rules = apriori(records, min_support=supp, min_confidence=conf, min_lift=lift)
    association_results = list(association_rules)
    return association_results

The resulting "association rules" results are also a list of lists, where each inner item is an association rule made up of three parts: the items in the list, the support of the itemset, and finally the confidence and lift. It looks like this:

```
RelationRecord(items=frozenset({6, 527}), support=0.03052002207603202, ordered_statistics=[OrderedStatistic(items_base=frozenset({6}), items_add=frozenset({527}), confidence=0.3026005700505611, lift=3.1820141658957573), OrderedStatistic(items_base=frozenset({527}), items_add=frozenset({6}), confidence=0.3209350946469079, lift=3.1820141658957573)])
```

For that reason,  it will help to have a function that prints out the results in human readable form.

In [12]:
def print_results(association_results):    
    print("========================")
    print("No. of results:", len(association_results))
    print("========================")
    for rule in association_results:
        for item in rule[0]:
            # use num code to retrieve value from dict, created earlier
            print(val_dict[item][0],": ", val_dict[item][1])
        print("support = ", str(rule[1]))
        print("confidence = ", str(rule[2][0][2]))
        print("lift = ", str(rule[2][0][3]))
        print("------------------------")


Or, better yet, a function that converts the results to a dataframe. Either can be used while exploring different inputs and parameters during frequent pattern data mining. 

In [13]:
def df_results(association_results):
    
    dataframes = []
    for rule in association_results:
        data = {}
        for item in rule[0]:
            data[val_dict[item][0]] = val_dict[item][1]
        data['support'] = rule[1]
        data['confidence'] = rule[2][0][2]
        data['lift'] = rule[2][0][3]
        dataframe = pd.DataFrame.from_records([data])
        dataframes.append(dataframe)
    
    
    return pd.concat(dataframes, ignore_index=True, axis=0)

In [14]:
def df_results(association_results):
    
    dataframes = []
    for rule in association_results:
        data = {}
        
        # create a rule from results
        base = list(rule[2][0][0])
        base = [val_dict[b][1] for b in base]
        add = list(rule[2][0][1])
        add = val_dict[add[0]][1]
        data['rule'] = str(base) + ' ==> ' + str(add)

        # support, confidence, and lift of rule
        data['lift'] = rule[2][0][3]
        data['confidence'] = rule[2][0][2]
        data['support'] = rule[1]

        # components of rule
        for item in rule[0]:
            data[val_dict[item][0]] = val_dict[item][1]
        
        # convert rules to dataframe rows
        dataframe = pd.DataFrame.from_records([data])
        dataframes.append(dataframe)
    
        # concat rules as rows of single dataframe
    return pd.concat(dataframes, ignore_index=True, axis=0)

CITATION:
Regarding use of the apyori lib see our references here:
- https://github.com/ymoch/apyori
- https://stackabuse.com/association-rule-mining-via-apriori-algorithm-in-python/
- pydoc entry:<br>
    ```
    apyori.apriori = apriori(transactions, **kwargs)
        Executes Apriori algorithm and returns a RelationRecord generator.

        Arguments:
            transactions -- A transaction iterable object
                            (eg. [['A', 'B'], ['B', 'C']]).

        Keyword arguments:
            min_support -- The minimum support of relations (float).
            min_confidence -- The minimum confidence of relations (float).
            min_lift -- The minimum lift of relations (float).
            max_length -- The maximum length of the relation (integer).
    ```

### Applying Apriori

In [17]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ca_ld_ccc
minimum_support = 0.0045
minimum_confidence = 0.2
minimum_lift = 3

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
records = convert_df_2_list(dataframe_to_analyze)
results = use_apriori(records, minimum_support, minimum_confidence, minimum_lift)
results = df_results(results)
print(results.shape[1], "rules found.")

#######

7 rules found.
CPU times: user 23.4 s, sys: 1.05 s, total: 24.4 s
Wall time: 25.7 s


In [18]:
####### display options

#display(results)
#results.sort_values(by='lift', ascending=False)
results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

Unnamed: 0,rule,lift,confidence,support,location_description,crime_code_category,community_area
6,['RESIDENCE-GARAGE'] ==> Burglary,8.856286,0.505816,0.009495,RESIDENCE-GARAGE,Burglary,
7,"['SIDEWALK', 25] ==> Drug Abuse",4.930013,0.46883,0.005493,SIDEWALK,Drug Abuse,25.0
5,['DEPARTMENT STORE'] ==> Larceny,3.866943,0.819444,0.010368,DEPARTMENT STORE,Larceny,
3,['Prostitution'] ==> STREET,3.19793,0.819378,0.007665,STREET,Prostitution,
0,['SIDEWALK'] ==> Drug Abuse,3.182014,0.302601,0.03052,SIDEWALK,Drug Abuse,
1,['Robbery'] ==> SIDEWALK,3.149042,0.31761,0.011989,SIDEWALK,Robbery,
2,['Motor Vehicle Theft'] ==> STREET,3.074678,0.787798,0.035736,STREET,Motor Vehicle Theft,
4,['GROCERY FOOD STORE'] ==> Larceny,3.025541,0.641142,0.008058,GROCERY FOOD STORE,Larceny,


# SANDBOX!
Use this space to apply apriori method with different parameters to find interesting information

In [30]:
%%time
####### final formula for frequent itemset mining

# adjustable parameters:
dataframe_to_analyze = ca_ld_ccc
minimum_support = 0.002
minimum_confidence = 0.2
minimum_lift = 1.75

# convert dataframe to consumable list of lists, apply apriori to identify frequent itemsets, and print results:
records = convert_df_2_list(dataframe_to_analyze)
results = use_apriori(records, minimum_support, minimum_confidence, minimum_lift)
results = df_results(results)
print(results.shape[0], "rules found.")

#######

56 rules found.
CPU times: user 32.5 s, sys: 1.04 s, total: 33.5 s
Wall time: 34.2 s


In [28]:
####### display options

#display(results)
#results.sort_values(by='lift', ascending=False)
results.sort_values(by=['lift', 'confidence', 'support'], ascending=False)

#######

Unnamed: 0,rule,lift,confidence,support,location_description,crime_code_category,community_area
55,"['DEPARTMENT STORE', 'Larceny'] ==> 32",9.125945,0.222044,0.002302,DEPARTMENT STORE,Larceny,32.0
29,['RESIDENCE-GARAGE'] ==> Burglary,8.856286,0.505816,0.009495,RESIDENCE-GARAGE,Burglary,
27,['DEPARTMENT STORE'] ==> 32,8.661014,0.210732,0.002666,DEPARTMENT STORE,,32.0
44,"[26, 'SIDEWALK'] ==> Drug Abuse",5.632302,0.535616,0.002154,SIDEWALK,Drug Abuse,26.0
42,"[23, 'SIDEWALK'] ==> Drug Abuse",5.215205,0.495951,0.002993,SIDEWALK,Drug Abuse,23.0
45,"[29, 'SIDEWALK'] ==> Drug Abuse",5.034658,0.478782,0.00264,SIDEWALK,Drug Abuse,29.0
43,"['SIDEWALK', 25] ==> Drug Abuse",4.930013,0.46883,0.005493,SIDEWALK,Drug Abuse,25.0
26,['DEPARTMENT STORE'] ==> Larceny,3.866943,0.819444,0.010368,DEPARTMENT STORE,Larceny,
20,['CHA PARKING LOT/GROUNDS'] ==> Drug Abuse,3.424438,0.325654,0.002248,CHA PARKING LOT/GROUNDS,Drug Abuse,
25,['DRUG STORE'] ==> Larceny,3.320268,0.703598,0.00314,DRUG STORE,Larceny,


# SCRATCH BELOW HERE

In [None]:
# consider converting support, confidence results to percentages. 