In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from efficient_apriori import apriori

%matplotlib inline 
  

In [2]:
# Read the raw posts CSV into a DataFrame; the sections below select their
# columns from this frame.
posts_csv_path = "./divar_dataset/divar_posts_dataset.csv"
dataset_raw = pd.read_csv(posts_csv_path)

# PART 4

# PART 4: solving using cat2 (each product is its cat2 value)

In [3]:
# Keep only the two columns the cat2-level analysis works with.
cat2_columns = ["platform", "cat2"]
dataset = dataset_raw[cat2_columns]

In [4]:
# Build the two-column (product, platform) table used for the cat2-level mining.
#
# The table is derived from dataset_raw instead of from `dataset` itself, so
# re-running this cell no longer fails with a KeyError on the already-removed
# "cat2" column.
#
# Rows with a missing platform or cat2 are dropped before the string cast:
# astype(str) would otherwise turn NaN into the literal text "nan", which would
# then be treated as a real category. The cat3 section already drops missing
# rows the same way.
dataset = (
    dataset_raw[["platform", "cat2"]]
    .dropna()
    .astype(str)
    .rename(columns={"cat2": "product"})
    .loc[:, ["product", "platform"]]
)
dataset

Unnamed: 0,product,platform
0,furniture-and-home-decore,mobile
1,furniture-and-home-decore,mobile
2,cars,mobile
3,furniture-and-home-decore,mobile
4,baby-and-toys,mobile
...,...,...
947630,building-and-garden,web
947631,audio-video,mobile
947632,cars,mobile
947633,mobile-tablet,mobile


In [5]:
# Count the rows per platform; the smallest count is used by the next cell as
# the number of rows to draw from every platform.
platform_values_counts = dataset["platform"].value_counts()
print("platform_values_counts:\n"+str(platform_values_counts))
# .min() replaces platform_values_counts[-1]: the Series is indexed by platform
# name, so an integer key only worked through the deprecated positional
# fallback of Series.__getitem__. .min() also does not depend on the sort order
# of value_counts().
min_platform_count = platform_values_counts.min()
print("min value count : "+str(min_platform_count))

platform_values_counts:
mobile    874237
web        73398
Name: platform, dtype: int64
min value count : 73398


In [None]:
# Balance the platforms: draw min_platform_count rows from every platform
# group, so each platform contributes the same number of transactions.
# Each transaction is one row as a [product, platform] list.
grouped = dataset.groupby("platform")
transactions = []
for name, group in grouped:
    # sample(n=...) draws the rows directly instead of shuffling the whole
    # group and slicing; random_state pins the draw so a fresh
    # Restart & Run All reproduces the same itemsets and rules.
    sampled = group.sample(n=min_platform_count, random_state=42)
    transactions += sampled.values.tolist()
# Preview a few transactions instead of echoing the entire list.
transactions[:5]

In [7]:
# Total number of sampled transactions over all platform groups.
transaction_count = len(transactions)
transaction_count

146796

In [8]:
# Run apriori on the balanced transactions with the support / confidence
# thresholds below; it returns the itemsets and the rules used by later cells.
apriori_thresholds = {"min_support": 0.02, "min_confidence": 0.1}
itemsets, rules = apriori(transactions, **apriori_thresholds)

In [9]:
# Size-2 itemsets and their counts. Note the ordering: entries are sorted by
# the itemset tuple itself in descending order, not by count.
pair_counts = itemsets[2]
frequent_itemsets = {
    pair: pair_counts[pair] for pair in sorted(pair_counts, reverse=True)
}
frequent_itemsets

{('utensils-and-appliances', 'web'): 4863,
 ('parts-accessories', 'web'): 3481,
 ('mobile-tablet', 'web'): 5746,
 ('mobile', 'utensils-and-appliances'): 4518,
 ('mobile', 'parts-accessories'): 4305,
 ('mobile', 'mobile-tablet'): 6012,
 ('furniture-and-home-decore', 'web'): 15608,
 ('furniture-and-home-decore', 'mobile'): 15730,
 ('equipments-and-machinery', 'web'): 4681,
 ('equipments-and-machinery', 'mobile'): 3832,
 ('computers', 'web'): 4687,
 ('clothing-and-shoes', 'web'): 4875,
 ('clothing-and-shoes', 'mobile'): 6879,
 ('cars', 'web'): 9948,
 ('cars', 'mobile'): 10233,
 ('animals', 'mobile'): 4034}

In [10]:
# Print every mined rule, highest lift first.
rules_by_lift = sorted(rules, key=lambda r: r.lift, reverse=True)
for rule in rules_by_lift:
    print(rule)

[{clothing-and-shoes} -> {mobile},
 {mobile-tablet} -> {mobile},
 {utensils-and-appliances} -> {mobile},
 {mobile} -> {cars},
 {cars} -> {mobile},
 {mobile} -> {furniture-and-home-decore},
 {furniture-and-home-decore} -> {mobile},
 {animals} -> {mobile},
 {equipments-and-machinery} -> {mobile},
 {parts-accessories} -> {mobile},
 {mobile-tablet} -> {web},
 {clothing-and-shoes} -> {web},
 {web} -> {furniture-and-home-decore},
 {furniture-and-home-decore} -> {web},
 {utensils-and-appliances} -> {web},
 {equipments-and-machinery} -> {web},
 {parts-accessories} -> {web},
 {computers} -> {web},
 {web} -> {cars},
 {cars} -> {web}]

# PART 4: solving using cat3 (each product is labelled cat2__cat3)

In [11]:
# Keep only the three columns the cat3-level analysis works with.
cat3_columns = ["platform", "cat2", "cat3"]
dataset = dataset_raw[cat3_columns]

In [12]:
# Build the two-column (product, platform) table used for the cat3-level mining.
#
# The table is derived from dataset_raw instead of from `dataset` itself, so
# re-running this cell no longer fails with a KeyError on the already-removed
# "cat2" / "cat3" columns.
#
# Rows missing platform, cat2 or cat3 are dropped before the string cast, then
# each product is labelled "<cat2>__<cat3>".
dataset = (
    dataset_raw[["platform", "cat2", "cat3"]]
    .dropna()
    .astype(str)
    .assign(product=lambda frame: frame["cat2"] + "__" + frame["cat3"])
    .loc[:, ["product", "platform"]]
)
dataset

Unnamed: 0,product,platform
0,furniture-and-home-decore__sofa-armchair,mobile
1,furniture-and-home-decore__antiques-and-art,mobile
2,cars__heavy,mobile
3,furniture-and-home-decore__sofa-armchair,mobile
4,baby-and-toys__personal-toys,mobile
...,...,...
947630,building-and-garden__stove-and-heating,web
947631,audio-video__camera-camcoders,mobile
947632,cars__light,mobile
947633,mobile-tablet__mobile-tablet-accessories,mobile


In [13]:
# Count the rows per platform; the smallest count is used by the next cell as
# the number of rows to draw from every platform.
platform_values_counts = dataset["platform"].value_counts()
print("platform_values_counts:\n"+str(platform_values_counts))
# .min() replaces platform_values_counts[-1]: the Series is indexed by platform
# name, so an integer key only worked through the deprecated positional
# fallback of Series.__getitem__. .min() also does not depend on the sort order
# of value_counts().
min_platform_count = platform_values_counts.min()
print("min value count : "+str(min_platform_count))

platform_values_counts:
mobile    713766
web        61535
Name: platform, dtype: int64
min value count : 61535


In [None]:
# Balance the platforms: draw min_platform_count rows from every platform
# group, so each platform contributes the same number of transactions.
# Each transaction is one row as a [product, platform] list.
grouped = dataset.groupby("platform")
transactions = []
for name, group in grouped:
    # sample(n=...) draws the rows directly instead of shuffling the whole
    # group and slicing; random_state pins the draw so a fresh
    # Restart & Run All reproduces the same itemsets and rules.
    sampled = group.sample(n=min_platform_count, random_state=42)
    transactions += sampled.values.tolist()
# Preview a few transactions instead of echoing the entire list.
transactions[:5]

In [15]:
# Total number of sampled transactions over all platform groups.
transaction_count = len(transactions)
transaction_count

123070

In [16]:
# Run apriori on the balanced transactions with the support / confidence
# thresholds below; it returns the itemsets and the rules used by later cells.
apriori_thresholds = {"min_support": 0.02, "min_confidence": 0.1}
itemsets, rules = apriori(transactions, **apriori_thresholds)

In [17]:
# Size-2 itemsets and their counts. Note the ordering: entries are sorted by
# the itemset tuple itself in descending order, not by count.
pair_counts = itemsets[2]
frequent_itemsets = {
    pair: pair_counts[pair] for pair in sorted(pair_counts, reverse=True)
}
frequent_itemsets

{('mobile-tablet__mobile-phones', 'web'): 4530,
 ('mobile', 'mobile-tablet__mobile-phones'): 5002,
 ('furniture-and-home-decore__tables-and-chairs', 'web'): 2609,
 ('furniture-and-home-decore__sofa-armchair', 'web'): 3428,
 ('furniture-and-home-decore__sofa-armchair', 'mobile'): 3956,
 ('clothing-and-shoes__shoes-belt-bag', 'mobile'): 2750,
 ('clothing-and-shoes__clothing', 'web'): 2907,
 ('clothing-and-shoes__clothing', 'mobile'): 4366,
 ('cars__light', 'web'): 9597,
 ('cars__light', 'mobile'): 9451,
 ('animals__birds', 'mobile'): 3141}

In [19]:
# Print every mined rule, highest lift first.
rules_by_lift = sorted(rules, key=lambda r: r.lift, reverse=True)
for rule in rules_by_lift:
    print(rule)

{animals__birds} -> {mobile} (conf: 0.701, supp: 0.026, lift: 1.402, conv: 1.673)
{clothing-and-shoes__clothing} -> {mobile} (conf: 0.600, supp: 0.035, lift: 1.201, conv: 1.251)
{clothing-and-shoes__shoes-belt-bag} -> {mobile} (conf: 0.583, supp: 0.022, lift: 1.166, conv: 1.199)
{furniture-and-home-decore__sofa-armchair} -> {mobile} (conf: 0.536, supp: 0.032, lift: 1.072, conv: 1.077)
{mobile-tablet__mobile-phones} -> {mobile} (conf: 0.525, supp: 0.041, lift: 1.050, conv: 1.052)
{furniture-and-home-decore__tables-and-chairs} -> {web} (conf: 0.524, supp: 0.021, lift: 1.047, conv: 1.049)
{web} -> {cars__light} (conf: 0.156, supp: 0.078, lift: 1.008, conv: 1.001)
{cars__light} -> {web} (conf: 0.504, supp: 0.078, lift: 1.008, conv: 1.008)
{mobile} -> {cars__light} (conf: 0.154, supp: 0.077, lift: 0.992, conv: 0.999)
{cars__light} -> {mobile} (conf: 0.496, supp: 0.077, lift: 0.992, conv: 0.992)
{mobile-tablet__mobile-phones} -> {web} (conf: 0.475, supp: 0.037, lift: 0.950, conv: 0.953)
{fur