### 2.1 Finding Co-occurrence Patterns of Points of Interest (POI)

Co-occurrence: Occurrence of different combinations of POI at the same time (here: in the same grid cell)

Idea: 

Given that we have grid cells (x, y), each of which denotes **what** and **how many** POI are in that grid, we can mine co-occurrence, which means: 

"What is the common occurrence of **different** POIs that would likely appear in one grid"


### Illustrating with an example: 

Grid cell (x1, y1) has categories: restaurant, park, gym

Grid cell (x2, y2) has categories: restaurant, park

Grid cell (x3, y3) has categories: restaurant, cafe


Running Apriori with a minimum support threshold of 2 (meaning the combination should appear in at least 2 grid cells), 

Frequent itemset = {restaurant, park} because restaurant and park appear together in two grid cells ((x1, y1) and (x2, y2)).


### Terminologies 

- Transaction / Baskets = An entry in the database

- Itemset I = {Set of items that appear together}

- Support of Itemset I = Number of transactions that contain Itemset I 

- Frequent Itemset = an itemset I with support(I) >= minsup 

From frequent itemsets {A,B,C,D}, we can generate {association rules}: 

- Confidence of an association rule: conf({current items} -> Item j) = P(Item j | {current items}) = support({current items} ∪ {Item j}) / support({current items}) 


### Determining Frequent Itemsets 
```pseudo
if support(I) >= min_sup: 
    I is a frequent itemset
else:
    I is not frequent

```
### Determining Association Rules  

```pseudo
for all elements in frequent itemset {A, B...}:
    generate A->B 

    conf(A->B) = P(AB)/P(A) 

    if conf(A->B) >= min_conf: 
        A->B = Association Rule 

    repeat for different implications (A->C, A->D... AB->C...) 
```


In [38]:
import sys
from itertools import chain, combinations
from collections import defaultdict
from optparse import OptionParser
import pandas as pd
import csv


### Reformatting CSV 

- Treat each unique grid as a transaction / basket 

- For POIs in that unique grid, string all of them together => This becomes a unique transaction 

In [39]:
def preprocess_data(input_file, output_file):
    """Pivot a long POI table into one row per grid cell.

    Reads ``input_file`` with pandas (the ``x``, ``y`` and ``category``
    columns are used), gathers the categories of every ``(x, y)`` cell, and
    writes ``output_file`` with the columns ``x``, ``y``, ``poi1`` ... ``poiN``,
    where N is the largest number of POIs found in a single cell.  Cells with
    fewer POIs are padded on the right with empty fields.
    """
    frame = pd.read_csv(input_file)

    # One row per grid cell, carrying the list of categories seen in it.
    cells = frame.groupby(['x', 'y'])['category'].apply(list).reset_index()
    widest = cells['category'].apply(len).max()

    def pad(pois):
        # Stringify every category, then fill the row up to the widest cell.
        labels = [str(poi) for poi in pois]
        return labels + [None] * (widest - len(pois))

    padded_rows = cells['category'].apply(pad).tolist()
    headers = [f'poi{n}' for n in range(1, widest + 1)]
    poi_table = pd.DataFrame(padded_rows, columns=headers)

    wide = pd.concat([cells[['x', 'y']], poi_table], axis=1)
    wide.to_csv(output_file, index=False, na_rep='')

In [40]:
def subsets(arr):
    """Return an iterator over every non-empty combination of ``arr``.

    Combinations are yielded as tuples, grouped by size from 1 up to the
    number of elements in ``arr``.
    """
    by_size = []
    for size, _ in enumerate(arr, start=1):
        by_size.append(combinations(arr, size))
    return chain.from_iterable(by_size)


def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet):
    """Filter candidate itemsets by relative support.

    For every candidate in ``itemSet`` the number of transactions that contain
    it is counted and added to ``freqSet`` (updated in place).  The returned
    set holds the candidates that occur at least once and whose share of all
    transactions is at least ``minSupport``.
    """
    tally = {}

    for candidate in itemSet:
        hits = sum(1 for transaction in transactionList if candidate.issubset(transaction))
        if hits:
            # Candidates that never occur are neither recorded nor returned.
            freqSet[candidate] += hits
            tally[candidate] = tally.get(candidate, 0) + hits

    total = len(transactionList)
    return {
        candidate
        for candidate, hits in tally.items()
        if float(hits) / total >= minSupport
    }


def joinSet(itemSet, length):
    """Build the itemsets of size ``length`` obtainable as a union of two members of ``itemSet``."""
    merged = set()
    for left in itemSet:
        for right in itemSet:
            candidate = left.union(right)
            if len(candidate) == length:
                merged.add(candidate)
    return merged


def getItemSetTransactionList(data_iterator):
    """Split the input records into 1-itemsets and a transaction list.

    Returns ``(itemSet, transactionList)``: ``transactionList`` holds one
    frozenset per record, in input order, and ``itemSet`` is the set of
    single-element frozensets for every distinct item seen.
    """
    transactions = [frozenset(record) for record in data_iterator]

    singletons = set()
    for transaction in transactions:
        # Each distinct item becomes its own 1-itemset.
        singletons.update(frozenset([item]) for item in transaction)

    return singletons, transactions


def runApriori(data_iter, minSupport, minConfidence):
    """
    Mine frequent itemsets and association rules with the Apriori algorithm.

    data_iter is an iterable of records (each record an iterable of items).
    minSupport and minConfidence are thresholds compared with ``>=``.
    Returns a pair of lists:
     - items: (tuple_of_items, support)
     - rules: ((antecedent_tuple, consequent_tuple), confidence)
    """
    itemSet, transactionList = getItemSetTransactionList(data_iter)

    # Absolute counts of every candidate found in at least one transaction.
    freqSet = defaultdict(int)
    # Maps itemset size k to the set of frequent k-itemsets.
    largeSet = {}

    survivors = returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet)
    size = 1
    while survivors:
        largeSet[size] = survivors
        size += 1
        candidates = joinSet(survivors, size)
        survivors = returnItemsWithMinSupport(
            candidates, transactionList, minSupport, freqSet
        )

    def getSupport(item):
        """Relative support of ``item`` based on the recorded counts."""
        return float(freqSet[item]) / len(transactionList)

    toRetItems = []
    for frequent in largeSet.values():
        for item in frequent:
            toRetItems.append((tuple(item), getSupport(item)))

    toRetRules = []
    # A rule needs at least two items, so the first (1-itemset) level is skipped.
    for frequent in list(largeSet.values())[1:]:
        for item in frequent:
            for combo in subsets(item):
                element = frozenset(combo)
                remain = item.difference(element)
                if not remain:
                    # The full itemset leaves nothing for the right-hand side.
                    continue
                confidence = getSupport(item) / getSupport(element)
                if confidence >= minConfidence:
                    toRetRules.append(((tuple(element), tuple(remain)), confidence))
    return toRetItems, toRetRules


def printResults(items, rules):
    """Print the itemsets in ascending order of support, then the rules in ascending order of confidence."""
    ranked_items = list(items)
    ranked_items.sort(key=lambda entry: entry[1])
    for itemset, support in ranked_items:
        print("item: %s , %.3f" % (str(itemset), support))

    print("\n------------------------ RULES:")

    ranked_rules = list(rules)
    ranked_rules.sort(key=lambda entry: entry[1])
    for (pre, post), confidence in ranked_rules:
        print("Rule: %s ==> %s , %.3f" % (str(pre), str(post), confidence))


def to_str_results(items, rules):
    """Format the itemsets and rules as strings instead of printing them.

    Returns ``(item_lines, rule_lines)``: the itemsets ordered by ascending
    support and the rules ordered by ascending confidence, each rendered in
    the same text format that ``printResults`` prints.
    """
    item_lines = [
        "item: %s , %.3f" % (str(itemset), support)
        for itemset, support in sorted(items, key=lambda entry: entry[1])
    ]

    rule_lines = [
        "Rule: %s ==> %s , %.3f" % (str(pre), str(post), confidence)
        for (pre, post), confidence in sorted(rules, key=lambda entry: entry[1])
    ]

    return item_lines, rule_lines


### Since the above code requires a csv in the form of 

Transaction 1 items

Transaction 2 items 
... 

Modify the csv to appear as such 

In [41]:
def prepare_data_for_apriori(input_file, output_file):
    """Turn the per-grid CSV into one transaction per line.

    ``input_file`` is read as CSV: the first row is treated as a header and
    skipped, the first two columns of every other row are dropped, and the
    remaining non-empty fields are written to ``output_file`` as one
    comma-separated line.

    Parsing goes through the ``csv`` module so that a quoted field containing
    a comma (e.g. ``"Bar, Pub"``) stays a single item, and an empty input
    file produces an empty output instead of raising ``StopIteration``.
    """
    # newline='' is what the csv module expects for files it reads or writes.
    with open(input_file, 'r', newline='') as infile, \
            open(output_file, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile, lineterminator='\n')

        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)

        for row in reader:
            # Everything after the first two columns is a POI slot; empty
            # slots are padding and are left out of the transaction.
            pois = [poi for poi in row[2:] if poi]
            writer.writerow(pois)


In [42]:
def dataFromFile(fname):
    """Yield one frozenset of items per line of the transaction file ``fname``.

    Each line is parsed as a CSV record, so a quoted field that contains a
    comma is kept as a single item.  Empty fields (trailing commas, doubled
    commas) are dropped rather than turned into an empty-string item; a blank
    line therefore yields an empty frozenset and still counts as one record.
    """
    with open(fname, "r") as file_iter:
        # Strip each physical line first so the newline and any surrounding
        # whitespace never end up inside an item.
        stripped_lines = (line.strip() for line in file_iter)
        for row in csv.reader(stripped_lines):
            yield frozenset(field for field in row if field)

def run_apriori_and_save_results(input_file, min_support, min_confidence, dataset_label):
    """Run Apriori on a transaction file and save the results as CSV.

    Reads ``input_file`` through ``dataFromFile``, mines it with
    ``runApriori`` and writes two files:

    - ``output/itemsets_<dataset_label>.csv`` with columns ``Itemset`` and
      ``Support``
    - ``output/rules_<dataset_label>.csv`` with columns ``Rule`` and
      ``Confidence``

    The ``output`` directory is created if it does not exist yet.
    """
    import os  # imported locally so this function does not rely on a file-level import

    inFile = dataFromFile(input_file)
    items, rules = runApriori(inFile, min_support, min_confidence)

    # open(..., 'w') does not create missing directories, so make sure the
    # target directory is there before writing.
    os.makedirs('output', exist_ok=True)

    with open(f'output/itemsets_{dataset_label}.csv', 'w', newline='') as items_file:
        writer = csv.writer(items_file)
        writer.writerow(['Itemset', 'Support'])
        for item, support in items:
            writer.writerow([', '.join(item), support])

    with open(f'output/rules_{dataset_label}.csv', 'w', newline='') as rules_file:
        writer = csv.writer(rules_file)
        writer.writerow(['Rule', 'Confidence'])
        for (pre, post), confidence in rules:
            # Render the rule as "antecedent items => consequent items".
            rule = f"{', '.join(pre)} => {', '.join(post)}"
            writer.writerow([rule, confidence])


In [43]:
def main(input_files, min_support, min_confidence):
    """Run the whole pipeline for every file in ``input_files``.

    Each file is pivoted to one row per grid cell, flattened to one
    transaction per line, and then mined with Apriori using the given
    ``min_support`` and ``min_confidence``.
    """
    for raw_path in input_files:
        # Label = text after the last underscore without '.csv',
        # e.g. 'data/POIdata_cityA.csv' -> 'cityA'.
        dataset_label = raw_path.rsplit('_', 1)[-1].replace('.csv', '')

        # Intermediate files produced along the way.
        transformed_file = f'data/transformed_{dataset_label}.csv'
        prepared_file = f'data/prepared_data_{dataset_label}.csv'

        preprocess_data(raw_path, transformed_file)
        prepare_data_for_apriori(transformed_file, prepared_file)
        run_apriori_and_save_results(prepared_file, min_support, min_confidence, dataset_label)

# Raw POI files to run through the pipeline (the names suggest one file per city).
input_files = [
    'data/POIdata_cityA.csv',
    'data/POIdata_cityB.csv',
    'data/POIdata_cityC.csv',
    'data/POIdata_cityD.csv'
]
# Minimum support, as a fraction of all transactions (grid cells) that must contain an itemset.
min_support = 0.10
# Minimum confidence a rule needs in order to be kept.
min_confidence = 0.3

# Runs the whole pipeline as soon as this cell/script is executed.
main(input_files, min_support, min_confidence)
