# Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive so the data files can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install required package

In [None]:
!pip3 install efficient-apriori

Collecting efficient-apriori
  Downloading https://files.pythonhosted.org/packages/5a/c6/ecdf3a32d23cada466634c649cf4f50fefe76f56eae53ecceff688b306be/efficient_apriori-1.1.1-py3-none-any.whl
Installing collected packages: efficient-apriori
Successfully installed efficient-apriori-1.1.1


# Get the file

In [None]:
# Locations of the input files inside the mounted Google Drive.
root_dir = '/content/drive/My Drive/Data Mining/'
base_dir = f'{root_dir}Data Mining/Project/'

# Cleaned Excel workbook (only used by the commented-out Excel loading code).
file = f'{base_dir}cleandata_sales.xlsx'

# CSV tables of the "Store Sales data" folder.
train_file = f'{base_dir}Store Sales data/train.csv'
stores_file = f'{base_dir}Store Sales data/stores.csv'
features_file = f'{base_dir}Store Sales data/features.csv'
test_file = f'{base_dir}Store Sales data/test.csv'

# Import

In [None]:
import numpy as np
import pandas as pd
from efficient_apriori import apriori 
import matplotlib.pyplot as plt

# Prepare the data

In [None]:
# Alternative input, kept for reference: the cleaned Excel workbook, one sheet per period.
# bef_nov_dataframe = pd.read_excel(file, sheet_name = 'before2011nov')
# aft_nov_dataframe = pd.read_excel(file, sheet_name = 'after2011nov')
# aft_nov_noCPI_dataframe = pd.read_excel(file, sheet_name = 'after2011nov_noCPI')
# aft_nov__CPI_dataframe = pd.read_excel(file, sheet_name = 'after2011nov_CPI')

# Load the lookup tables. IsHoliday is removed from the features table before the
# join (presumably so the merged frame does not end up with a second IsHoliday column).
stores_dataframe = pd.read_csv(stores_file)
features_dataframe = pd.read_csv(features_file).drop(columns='IsHoliday')
test_dataframe = pd.read_csv(test_file)

# Build the training frame: left-join store attributes on Store, then the
# features on (Store, Date), and finally replace every missing value with 0.
train_dataframe = (
    pd.read_csv(train_file)
    .merge(stores_dataframe, how='left', on='Store')
    .merge(features_dataframe, how='left', on=['Store', 'Date'])
    # Optional: drop MarkDown 1-5
    # .drop(columns=['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'])
    .fillna(0)
)


In [None]:
def splitByNumberClassesAndSetToMeanRange(dataframe, attribute, numberClasses):
  """Bin a numeric column into equal-width ranges, in place.

  The value range [min, max] of ``dataframe[attribute]`` is cut into
  ``numberClasses`` ranges of equal width and every value is replaced by the
  midpoint of its range, rounded to 2 decimals.

  Args:
    dataframe: pandas DataFrame; its ``attribute`` column is overwritten.
    attribute: name of the numeric column to bin.
    numberClasses: number of equal-width ranges (must be >= 1).

  Returns:
    None. The column becomes float unless there was nothing to bin (empty,
    all-NaN or constant column), in which case it is left untouched.

  Raises:
    ValueError: if ``numberClasses`` is smaller than 1.
  """
  if numberClasses < 1:
    raise ValueError('numberClasses must be at least 1')

  original = dataframe[attribute]
  lowest = original.min()
  highest = original.max()
  bin_width = (highest - lowest) / numberClasses

  # Zero width (constant column) or NaN width (empty / all-NaN): no ranges exist.
  if not bin_width > 0:
    return

  # Every mask is computed against the untouched original values, so a value
  # that was already replaced by a midpoint can never be picked up by a later bin.
  binned = original.astype(float)
  for i in range(numberClasses):
    lower_bound = lowest + bin_width * i
    in_bin = original >= lower_bound
    if i < numberClasses - 1:
      in_bin = in_bin & (original < lowest + bin_width * (i + 1))
    # The last bin has no upper limit so that the maximum value is binned too.
    binned[in_bin] = round(lowest + bin_width * i + bin_width / 2, 2)

  # Replace the whole column at once; this also avoids writing floats into an
  # integer column element by element.
  dataframe[attribute] = binned


def classifyNumericalAttribute(dataframe):
  """Turn every column of ``dataframe`` into string items for apriori, in place.

  The numeric attributes listed below are first binned into equal-width ranges
  with splitByNumberClassesAndSetToMeanRange. Afterwards every column is
  converted to strings of the form '<column name> <value>', because the
  apriori implementation only accepts strings and booleans as items.

  Args:
    dataframe: pandas DataFrame containing the columns listed below; it is
      modified in place.

  Returns:
    None.
  """
  # Number of equal-width classes per numeric attribute.
  classes_per_attribute = {
    'Temperature': 10,
    'Fuel_Price': 10,
    'MarkDown1': 1000,
    'MarkDown2': 1000,
    'MarkDown3': 1000,
    'MarkDown4': 1000,
    'MarkDown5': 1000,
    'CPI': 10,
    'Unemployment': 10,
    'Size': 10,
    'Weekly_Sales': 10,
  }
  for attribute, numberClasses in classes_per_attribute.items():
    splitByNumberClassesAndSetToMeanRange(dataframe, attribute, numberClasses)

  # Prefix each value with its column name. One vectorized concatenation per
  # column replaces a per-cell Python loop and does not depend on the frame
  # having a default 0..n-1 index.
  for attribute in dataframe.columns:
    dataframe[attribute] = attribute + ' ' + dataframe[attribute].astype(str)


def splitByFrequency(dataframe, attribute, num):
  """Replace a numeric column by equal-frequency bin labels, in place.

  The column is cut into (at most) ``num`` quantile bins with ``pd.qcut``;
  duplicate bin edges are dropped, so fewer bins may result. Each value becomes
  the string '<attribute>: <bin code>'.

  Args:
    dataframe: pandas DataFrame; its ``attribute`` column is overwritten.
    attribute: name of the numeric column to bin.
    num: requested number of quantile bins.

  Returns:
    The same ``dataframe`` object, with no helper columns left behind.
  """
  codes = pd.qcut(dataframe[attribute].tolist(), num, duplicates='drop').codes.astype(str)
  # Align the codes with the frame's own index before concatenating the label.
  dataframe[attribute] = attribute + ': ' + pd.Series(codes, index=dataframe.index)
  return dataframe

def classifyNumericalAttributeByFrequency(dataframe, dictionary):
  """Build a string-item version of ``dataframe`` using equal-frequency bins.

  Columns Store, Dept, IsHoliday, Type and Date are treated as categorical:
  their values are only converted to '<column>: <value>'. Every other column is
  binned with splitByFrequency using the bin count found in ``dictionary``.

  Args:
    dataframe: pandas DataFrame to convert. It is NOT modified; the work is
      done on a copy so the function can be called repeatedly on the same frame.
    dictionary: mapping from each non-categorical column name to its number of
      quantile bins.

  Returns:
    A new DataFrame with the same columns, all holding strings.

  Raises:
    KeyError: if a non-categorical column has no entry in ``dictionary``.
  """
  categorical = ('Store', 'Dept', 'IsHoliday', 'Type', 'Date')
  result = dataframe.copy()
  for attri in list(result.columns):
    if attri in categorical:
      result[attri] = attri + ': ' + result[attri].astype(str)
    else:
      result = splitByFrequency(result, attri, dictionary[attri])
  return result




# Implement Apriori

In [None]:
# Equal-width variant: numeric attributes are binned into value ranges of equal size.
def apriori_algorithm(dataframe, support_rate, confidence_rate):
  """Run apriori on ``dataframe`` after equal-width binning.

  ``dataframe`` is rewritten in place by classifyNumericalAttribute, so pass a
  copy if the original frame is still needed.

  Args:
    dataframe: pandas DataFrame with the raw attributes.
    support_rate: minimum support passed to apriori.
    confidence_rate: minimum confidence passed to apriori.

  Returns:
    The (itemsets, rules) pair produced by apriori.
  """
  classifyNumericalAttribute(dataframe)
  # apriori expects one tuple of items per transaction, i.e. one per row.
  transactions = list(map(tuple, dataframe.values.tolist()))
  found_itemsets, found_rules = apriori(
      transactions, min_support=support_rate, min_confidence=confidence_rate
  )
  return found_itemsets, found_rules

In [None]:
# Equal-frequency variant: numeric attributes are binned into quantile ranges.
def apriori_algorithm_quantity(dataframe, support_rate, confidence_rate):
  """Run apriori on ``dataframe`` after equal-frequency (quantile) binning.

  Args:
    dataframe: pandas DataFrame with the raw attributes. This frame (not any
      notebook global) is what gets binned and mined.
    support_rate: minimum support passed to apriori.
    confidence_rate: minimum confidence passed to apriori.

  Returns:
    The (itemsets, rules) pair produced by apriori.
  """
  # Every numeric attribute is split into 10 quantile bins.
  numeric_attributes = (
      'Temperature', 'Fuel_Price',
      'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
      'CPI', 'Unemployment', 'Size', 'Weekly_Sales',
  )
  numDict = dict.fromkeys(numeric_attributes, 10)

  # Use the frame that was passed in rather than the global train_dataframe.
  df = classifyNumericalAttributeByFrequency(dataframe, numDict)
  # Convert dataframe to list of tuples in order to run the apriori method.
  transactions_from_df = [tuple(row) for row in df.values.tolist()]
  itemsets, rules = apriori(transactions_from_df, min_support=support_rate, min_confidence=confidence_rate)
  return itemsets, rules

# Run Apriori

In [None]:
# apriori_algorithm rewrites the frame it receives, so always hand it a copy.
# The same pattern applies to the Excel-based frames:
# bef_nov_dataframe_copy = bef_nov_dataframe.copy()
# aft_nov_dataframe_copy = aft_nov_dataframe.copy()
# aft_nov_noCPI_dataframe_copy = aft_nov_noCPI_dataframe.copy()
# aft_nov__CPI_dataframe_copy = aft_nov__CPI_dataframe.copy()

train_dataframe_copy = train_dataframe.copy()

# The copied frame goes in as the first argument.
itemsets, rules = apriori_algorithm(
    train_dataframe_copy, support_rate=0.5, confidence_rate=0.3
)



In [None]:
# Equal-frequency run on a fresh copy of the training data
# (minimum support 0.01, minimum confidence 0.1).
train_dataframe_copy = train_dataframe.copy()

itemsets_q, rules_q = apriori_algorithm_quantity(
    train_dataframe_copy, support_rate=0.01, confidence_rate=0.1
)

# Result

In [None]:
# Equal-frequency rules whose right-hand side is a single Weekly_Sales item,
# printed from highest to lowest lift.
rules_rhs = (
    rule for rule in rules_q
    if len(rule.rhs) == 1 and 'Weekly_Sales' in rule.rhs[0]
)
for rule in sorted(rules_rhs, key=lambda candidate: candidate.lift, reverse=True):
  print(rule)

{Dept: 54} -> {Weekly_Sales: 0} (conf: 0.916, supp: 0.010, lift: 9.163, conv: 10.753)
{Dept: 38, IsHoliday: False} -> {Weekly_Sales: 9} (conf: 0.773, supp: 0.011, lift: 7.728, conv: 3.961)
{Dept: 38} -> {Weekly_Sales: 9} (conf: 0.768, supp: 0.012, lift: 7.681, conv: 3.882)
{Dept: 95, IsHoliday: False} -> {Weekly_Sales: 9} (conf: 0.718, supp: 0.010, lift: 7.181, conv: 3.193)
{Dept: 95} -> {Weekly_Sales: 9} (conf: 0.714, supp: 0.011, lift: 7.144, conv: 3.151)
{Dept: 92} -> {Weekly_Sales: 9} (conf: 0.687, supp: 0.010, lift: 6.866, conv: 2.871)
{Size: 0, Type: A} -> {Weekly_Sales: 0} (conf: 0.354, supp: 0.011, lift: 3.538, conv: 1.393)
{CPI: 1, Size: 0} -> {Weekly_Sales: 0} (conf: 0.353, supp: 0.010, lift: 3.531, conv: 1.391)
{IsHoliday: False, Size: 1, Type: C} -> {Weekly_Sales: 0} (conf: 0.326, supp: 0.010, lift: 3.264, conv: 1.336)
{Size: 1, Type: C} -> {Weekly_Sales: 0} (conf: 0.326, supp: 0.011, lift: 3.257, conv: 1.335)
{IsHoliday: False, Type: C} -> {Weekly_Sales: 0} (conf: 0.320, s

In [None]:
# Equal-width rules with exactly one item on each side, printed by ascending lift.
rules_rhs = (
    rule for rule in rules
    if len(rule.lhs) == 1 and len(rule.rhs) == 1
)
for rule in sorted(rules_rhs, key=lambda candidate: candidate.lift):
  print(rule)


{MarkDown2 nan} -> {store type A} (conf: 0.359, supp: 0.103, lift: 0.734, conv: 0.797)
{Fuel_Price 3.53} -> {MarkDown3 41.73} (conf: 0.635, supp: 0.107, lift: 0.917, conv: 0.842)
{store type A} -> {MarkDown3 41.73} (conf: 0.639, supp: 0.312, lift: 0.923, conv: 0.852)
{MarkDown3 41.73} -> {store type A} (conf: 0.451, supp: 0.312, lift: 0.923, conv: 0.931)
{store size 210384.65} -> {MarkDown3 41.73} (conf: 0.651, supp: 0.159, lift: 0.940, conv: 0.880)
{Temperature 49.13} -> {MarkDown3 41.73} (conf: 0.656, supp: 0.106, lift: 0.947, conv: 0.894)
{Temperature 49.13} -> {IsHoliday False} (conf: 0.879, supp: 0.142, lift: 0.954, conv: 0.648)
{Unemployment 7.93} -> {MarkDown3 41.73} (conf: 0.663, supp: 0.126, lift: 0.957, conv: 0.912)
{Unemployment 7.03} -> {MarkDown3 41.73} (conf: 0.664, supp: 0.177, lift: 0.960, conv: 0.917)
{MarkDown2 -3.8} -> {MarkDown3 41.73} (conf: 0.670, supp: 0.133, lift: 0.968, conv: 0.933)
{store type A} -> {CPI 134.69} (conf: 0.409, supp: 0.200, lift: 0.969, conv: 0.

In [None]:
# One-to-one rules (single item on both sides) from the equal-width run,
# listed from lowest to highest lift.
rules_rhs = (
    rule for rule in rules
    if len(rule.lhs) == 1 and len(rule.rhs) == 1
)
for rule in sorted(rules_rhs, key=lambda candidate: candidate.lift):
  print(rule)

In [None]:
# Equal-width rules whose right-hand side is a single sales item, printed from
# highest to lowest lift. The match is case-insensitive: items built from the CSV
# data start with 'Weekly_Sales', which a case-sensitive search for 'sale' would
# never find, while the older Excel-based data used 'weekly sales'.
rules_rhs = filter(lambda rule: len(rule.rhs) == 1 and 'sale' in rule.rhs[0].lower(), rules)
for rule in sorted(rules_rhs, key=lambda rule: rule.lift, reverse=True):
  print(rule)

{CPI 222.36, store size 44112.35} -> {weekly sales 388410.7} (conf: 0.992, supp: 0.107, lift: 4.036, conv: 93.150)
{store size 44112.35} -> {weekly sales 388410.7} (conf: 0.835, supp: 0.186, lift: 3.399, conv: 4.579)
{IsHoliday False, store size 44112.35} -> {weekly sales 388410.7} (conf: 0.834, supp: 0.171, lift: 3.394, conv: 4.545)
{MarkDown2 nan, store size 44112.35} -> {weekly sales 388410.7} (conf: 0.807, supp: 0.104, lift: 3.283, conv: 3.904)
{IsHoliday False, MarkDown3 41.73, store size 44112.35} -> {weekly sales 388410.7} (conf: 0.791, supp: 0.112, lift: 3.218, conv: 3.605)
{MarkDown3 41.73, store size 44112.35} -> {weekly sales 388410.7} (conf: 0.790, supp: 0.120, lift: 3.216, conv: 3.596)
{MarkDown4 nan, store size 44112.35} -> {weekly sales 388410.7} (conf: 0.771, supp: 0.108, lift: 3.137, conv: 3.292)
{IsHoliday False, MarkDown4 nan, store size 44112.35} -> {weekly sales 388410.7} (conf: 0.770, supp: 0.102, lift: 3.132, conv: 3.276)
{IsHoliday False, MarkDown4 nan} -> {week