In [1]:
# import pandas and mlxtend libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# configure pandas output: show every column, allow 1000-character lines,
# and never wrap a wide frame across several printed blocks
pd.options.display.max_columns = None
pd.options.display.width = 1000
pd.options.display.expand_frame_repr = False

In [4]:
# load the raw request log from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='armut_data.csv')

In [5]:
# inspect the basic properties of the loaded dataset with check_df()
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    The report contains, in order: the shape, the column dtypes, the first
    and last ``head`` rows, the number of missing values per column, and the
    transposed ``describe()`` summary at the 0, 5, 50, 95, 99 and 100
    percentiles.

    Args:
        dataframe: the pandas DataFrame to summarise.
        head: number of rows to show from the top and from the bottom.

    Returns:
        None; everything is written to standard output.
    """
    print('##################### Shape #####################')
    print(dataframe.shape)
    print('##################### Types #####################')
    print(dataframe.dtypes)
    print('##################### Head #####################')
    print(dataframe.head(head))
    print('##################### Tail #####################')
    print(dataframe.tail(head))
    print('##################### NA #####################')
    print(dataframe.isnull().sum())
    print('##################### Quantiles #####################')
    # transpose so each column of the input becomes one row of statistics
    print(dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

# print the overview report for the freshly loaded dataset
check_df(dataframe=df)

##################### Shape #####################
(162523, 4)
##################### Types #####################
UserId         int64
ServiceId      int64
CategoryId     int64
CreateDate    object
dtype: object
##################### Head #####################
   UserId  ServiceId  CategoryId           CreateDate
0   25446          4           5  2017-08-06 16:11:00
1   22948         48           5  2017-08-06 16:12:00
2   10618          0           8  2017-08-06 16:13:00
3    7256          9           4  2017-08-06 16:14:00
4   25446         48           5  2017-08-06 16:16:00
##################### Tail #####################
        UserId  ServiceId  CategoryId           CreateDate
162518   10591         25           0  2018-08-06 14:40:00
162519   10591          2           0  2018-08-06 14:43:00
162520   10591         31           6  2018-08-06 14:47:00
162521   12666         38           4  2018-08-06 16:01:00
162522   17497         47           7  2018-08-06 16:04:00
##############

In [6]:
# build the item label '<ServiceId>_<CategoryId>' that identifies a service within its category
df['ServiceCategory'] = (
    df['ServiceId']
    .astype(str)
    .str.cat(df['CategoryId'].astype(str), sep='_')
)

In [7]:
# parse CreateDate as datetimes and keep only the monthly period (year-month)
df['New_Date'] = df['CreateDate'].pipe(pd.to_datetime).dt.to_period('M')

In [10]:
# a basket is everything one user requested within one calendar month:
# label it '<UserId>_<year-month>'
df['basketID'] = (
    df['UserId']
    .astype(str)
    .str.cat(df['New_Date'].astype(str), sep='_')
)

In [11]:
# preview the first five rows, including the derived columns
df.head(n=5)

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,ServiceCategory,New_Date,basketID
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08,25446_2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08,22948_2017-08
2,10618,0,8,2017-08-06 16:13:00,0_8,2017-08,10618_2017-08
3,7256,9,4,2017-08-06 16:14:00,9_4,2017-08,7256_2017-08
4,25446,48,5,2017-08-06 16:16:00,48_5,2017-08,25446_2017-08


In [12]:
# build the basket x item matrix: count the requests per (basketID, ServiceCategory) pair,
# pivot the items into columns, and flag presence as 1 / absence as 0.
# Missing pairs are filled with 0 during the pivot, and the flag is computed with a
# vectorised comparison rather than a per-cell Python lambda
# (DataFrame.applymap is deprecated since pandas 2.1).
basket = (
    df.groupby(['basketID', 'ServiceCategory'])['ServiceCategory']
    .count()
    .unstack(fill_value=0)
    .gt(0)
    .astype(int)
)

In [18]:
# peek at the top-left corner of the matrix: first 5 baskets, first 10 items
basket.iloc[:5, :10]

ServiceCategory,0_8,10_9,11_11,12_7,13_11,14_7,15_1,16_8,17_5,18_4
basketID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0_2017-08,0,0,0,0,0,0,0,0,0,0
0_2017-09,0,0,0,0,0,0,0,0,0,0
0_2018-01,0,0,0,0,0,0,0,0,0,0
0_2018-04,0,0,0,0,0,1,0,0,0,0
10000_2017-08,0,0,0,0,0,0,0,0,0,0


In [19]:
# run apriori on the basket matrix, keeping itemsets whose support is at least 0.01;
# use_colnames=True is passed so itemsets are reported with the column names
frequent_itemsets = apriori(
    basket,
    use_colnames=True,
    min_support=0.01,
)



In [21]:
# derive association rules from the frequent itemsets,
# filtering on the 'support' metric with a threshold of 0.01
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.01,
    metric='support',
)

In [22]:
# recommend consequent itemsets for a given service-category item
def arl_recommender(rules_df, ServiceCategory, rec_count=1):
    """Return the consequents of the highest-lift rules that involve an item.

    The rules are ordered by descending ``lift``. Every rule whose antecedent
    itemset contains ``ServiceCategory`` contributes its consequent itemset,
    converted to a list, to the result.

    Args:
        rules_df: DataFrame with 'antecedents', 'consequents' and 'lift'
            columns; each antecedents/consequents cell is a collection of
            item labels (presumably the frozensets produced by
            association_rules -- confirm against the caller).
        ServiceCategory: item label to look up, e.g. '2_0'.
        rec_count: number of consequent lists to keep from the top of the
            ranking.

    Returns:
        A list of lists of item labels, at most ``rec_count`` long when
        ``rec_count`` is non-negative; empty when no rule matches.
    """
    sorted_rules = rules_df.sort_values('lift', ascending=False)
    # walk both columns together instead of looking rows up by position
    matches = [
        list(consequents)
        for antecedents, consequents in zip(
            sorted_rules['antecedents'], sorted_rules['consequents']
        )
        if ServiceCategory in antecedents
    ]
    return matches[0:rec_count]

In [23]:
# example: fetch the single best recommendation for the item '2_0'
arl_recommender(rules, ServiceCategory='2_0', rec_count=1)

[['22_0']]

In [28]:
# ask for a service-category label (e.g. '25_0') and show its top recommendation;
# surrounding whitespace is stripped because the lookup is an exact match and a
# stray space would otherwise silently return an empty list
service = input('Write down a service category').strip()
arl_recommender(rules, service, 1)

Write down a service category25_0


[['22_0']]