In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from efficient_apriori import apriori

%matplotlib inline 

In [2]:
#import dataset
dataset_raw = pd.read_csv("./divar_dataset/divar_posts_dataset.csv")

In [3]:
 dataset_raw["cat2"].value_counts()

furniture-and-home-decore        204445
cars                             130443
clothing-and-shoes                87096
mobile-tablet                     76307
utensils-and-appliances           58714
parts-accessories                 55986
animals                           50694
equipments-and-machinery          50101
game-consoles-and-video-games     31964
audio-video                       29176
computers                         26536
baby-and-toys                     20502
motorcycles                       19831
building-and-garden               18915
batch                             11777
childrens-clothing-and-shoe       11433
bicycle                           11283
sport-leisure                     10374
jewelry-and-watches               10316
health-beauty                      9390
utility                            8239
musical-instruments                3854
book-student-literature            3203
hobby-collectibles                 2431
phone                              1193


In [4]:
 dataset_raw["cat3"].value_counts()

light               120451
mobile-phones        62176
clothing             53758
sofa-armchair        49363
birds                38511
                     ...  
cat                    472
traditional            472
coin-stamp             388
drums-percussion       354
repair-tool            346
Name: cat3, Length: 66, dtype: int64

In [5]:
city_values_counts = dataset_raw["city"].value_counts()
print("city_values_counts:\n"+str(city_values_counts))
min_city_count = city_values_counts[-1]
print("min value count : "+str(min_city_count))

city_values_counts:
Tehran        442938
Mashhad       138879
Shiraz         83970
Karaj          83736
Isfahan        61803
Ahvaz          47163
Tabriz         34014
Qom            32002
Kermanshah     23130
Name: city, dtype: int64
min value count : 23130


# PART 1 : solving using cat2

In [6]:
dataset = dataset_raw[["city","cat1","cat2","cat3"]]
dataset.isnull().sum()

city         0
cat1         0
cat2      1758
cat3    172334
dtype: int64

In [7]:

dataset = dataset.dropna()
dataset = dataset.astype(str)
dataset["product"] = dataset['cat2'] 
dataset = dataset[["city","product"]]

dataset

Unnamed: 0,city,product
0,Tehran,furniture-and-home-decore
1,Mashhad,furniture-and-home-decore
2,Mashhad,cars
3,Tehran,furniture-and-home-decore
4,Karaj,baby-and-toys
...,...,...
947630,Shiraz,building-and-garden
947631,Tehran,audio-video
947632,Shiraz,cars
947633,Mashhad,mobile-tablet


In [8]:
dataset["product"].value_counts()

furniture-and-home-decore    203471
cars                         130443
clothing-and-shoes            87096
mobile-tablet                 76307
animals                       50694
equipments-and-machinery      49580
utensils-and-appliances       48057
audio-video                   28755
computers                     26536
baby-and-toys                 20502
building-and-garden           17635
sport-leisure                 10374
jewelry-and-watches            9539
utility                        7733
musical-instruments            3292
book-student-literature        3203
hobby-collectibles             2084
Name: product, dtype: int64

In [9]:
grouped = dataset.groupby("city")
cities_items = []
for name, group in grouped:
    cities_items += group.sample(frac=1)[0:min_city_count].values.tolist()


In [10]:
len(cities_items)

203107

In [11]:

transactions = cities_items
itemsets, rules = apriori(transactions, min_support=0.012, min_confidence=0.2)

In [12]:
itemsets[2]

{('Ahvaz', 'mobile-tablet'): 2925,
 ('Ahvaz', 'furniture-and-home-decore'): 6925,
 ('Ahvaz', 'cars'): 3388,
 ('Isfahan', 'furniture-and-home-decore'): 4975,
 ('Isfahan', 'cars'): 4573,
 ('Isfahan', 'animals'): 3008,
 ('Isfahan', 'mobile-tablet'): 2863,
 ('Karaj', 'cars'): 3791,
 ('Karaj', 'furniture-and-home-decore'): 6755,
 ('Kermanshah', 'mobile-tablet'): 2956,
 ('Kermanshah', 'furniture-and-home-decore'): 5323,
 ('Kermanshah', 'cars'): 3003,
 ('Kermanshah', 'clothing-and-shoes'): 3655,
 ('Mashhad', 'animals'): 2487,
 ('Mashhad', 'furniture-and-home-decore'): 5395,
 ('Mashhad', 'cars'): 3258,
 ('Mashhad', 'clothing-and-shoes'): 3233,
 ('Qom', 'furniture-and-home-decore'): 5861,
 ('Qom', 'clothing-and-shoes'): 4335,
 ('Qom', 'mobile-tablet'): 3207,
 ('Qom', 'animals'): 3429,
 ('Shiraz', 'cars'): 6579,
 ('Shiraz', 'furniture-and-home-decore'): 4200,
 ('Tabriz', 'clothing-and-shoes'): 3692,
 ('Tabriz', 'furniture-and-home-decore'): 5882,
 ('Tabriz', 'cars'): 4368,
 ('Tabriz', 'mobile-ta

In [13]:

for rule in sorted(rules, key=lambda rule: rule.lift, reverse=True):
  print(rule)

{animals} -> {Qom} (conf: 0.212, supp: 0.017, lift: 1.857, conv: 1.124)
{Shiraz} -> {cars} (conf: 0.284, supp: 0.032, lift: 1.690, conv: 1.162)
{Kermanshah} -> {clothing-and-shoes} (conf: 0.202, supp: 0.018, lift: 1.657, conv: 1.101)
{Ahvaz} -> {furniture-and-home-decore} (conf: 0.299, supp: 0.034, lift: 1.169, conv: 1.062)
{Kermanshah} -> {furniture-and-home-decore} (conf: 0.295, supp: 0.026, lift: 1.150, conv: 1.055)
{Karaj} -> {furniture-and-home-decore} (conf: 0.292, supp: 0.033, lift: 1.140, conv: 1.051)
{Tehran} -> {furniture-and-home-decore} (conf: 0.290, supp: 0.033, lift: 1.131, conv: 1.047)
{Tabriz} -> {furniture-and-home-decore} (conf: 0.254, supp: 0.029, lift: 0.993, conv: 0.998)
{Qom} -> {furniture-and-home-decore} (conf: 0.253, supp: 0.029, lift: 0.989, conv: 0.996)
{Mashhad} -> {furniture-and-home-decore} (conf: 0.233, supp: 0.027, lift: 0.911, conv: 0.970)
{Isfahan} -> {furniture-and-home-decore} (conf: 0.215, supp: 0.024, lift: 0.840, conv: 0.948)


# PART 1 : solving using cat3

In [14]:
dataset = dataset_raw[["city","cat3","cat2"]]
dataset = dataset.dropna()
dataset = dataset.astype(str)
dataset["product"] = dataset['cat2'] +"__"+ dataset['cat3']
dataset = dataset[["city","product"]]

dataset

Unnamed: 0,city,product
0,Tehran,furniture-and-home-decore__sofa-armchair
1,Mashhad,furniture-and-home-decore__antiques-and-art
2,Mashhad,cars__heavy
3,Tehran,furniture-and-home-decore__sofa-armchair
4,Karaj,baby-and-toys__personal-toys
...,...,...
947630,Shiraz,building-and-garden__stove-and-heating
947631,Tehran,audio-video__camera-camcoders
947632,Shiraz,cars__light
947633,Mashhad,mobile-tablet__mobile-tablet-accessories


In [15]:
grouped = dataset.groupby("city")
cities_items = []
for name, group in grouped:
    cities_items += group.sample(frac=1)[0:min_city_count].values.tolist()
len(cities_items)

203107

In [16]:

transactions = cities_items
itemsets, rules = apriori(transactions, min_support=0.010, min_confidence=0.15)

In [17]:
itemsets[2:]

TypeError: unhashable type: 'slice'

In [None]:

for rule in sorted(rules, key=lambda rule: rule.lift, reverse=True):
  print(rule)
