In [12]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [13]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)

## Load Data

In [14]:
dataset = pd.read_csv('../datasets/BestBooksEverClean_train_dataset.csv', index_col=None)
dataset

Unnamed: 0,title,series,author,genres,pages,publishYear,rating,likedPercent,price
0,"Lean Hospitals: Improving Quality, Patient Saf...",standalone,Mark Graban,['Nonfiction'],252,2008,4.01,97.0,4.95
1,To Have and Have Not,standalone,Ernest Hemingway,"['Fiction', 'Classics', 'Literature', 'Novels'...",176,1999,3.55,87.0,8.49
2,Acacia: The War with the Mein,Acacia,David Anthony Durham,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",576,2007,3.56,85.0,4.36
3,Dismantled,standalone,Jennifer McMahon,"['Mystery', 'Fiction', 'Thriller', 'Suspense',...",422,2009,3.62,87.0,3.93
4,The Rabbit Factory,standalone,Larry Brown,"['Fiction', 'Southern', 'Audiobook', 'Contempo...",352,2003,3.57,86.0,6.09
...,...,...,...,...,...,...,...,...,...
31802,The Complete Collected Poems,standalone,Maya Angelou,"['Poetry', 'Classics', 'Feminism', 'Nonfiction...",273,1994,4.41,97.0,7.79
31803,The Same Old Story,standalone,Ivan Goncharov,"['Russia', 'Classics', 'Russian Literature', '...",392,2001,4.18,96.0,36.26
31804,Love Walked In,Love Walked In,Marisa de los Santos,"['Fiction', 'Chick Lit', 'Romance', 'Contempor...",307,2006,3.72,90.0,0.85
31805,Geek Love,standalone,Katherine Dunn,"['Fiction', 'Horror', 'Fantasy', 'Contemporary...",348,2002,3.95,90.0,5.38


## Categorize Numeric Cols

In [15]:
dataset['rating_cat'] = pd.cut(dataset["rating"], bins=[0, 2, 2.5, 3, 4, 4.5, 5], labels=['Muy Malo', 'Malo', 'Mediocre', 'Bueno', 'Muy Bueno', 'Excelente'])
dataset['pages_cat'] = pd.cut(dataset["pages"], bins=[0, 150, 250, 500, 1000, float('Inf')], labels=['Corto', 'Medio', 'Moderado', 'Largo', 'Muy Largo'])
dataset['price_cat'] = pd.cut(dataset["price"], bins=[0, 5, 10, 20, 30, float('Inf')], labels=['Muy Barato', 'Barato', 'Buen Precio', 'Caro', 'Muy Caro'])
dataset['publishDecade'] = (dataset['publishYear'].astype(int) // 10) * 10
# dataset

## From String list, return the list of genres

In [16]:
def genres_to_list(genres: str) -> list:
  list_of_genres = genres.removeprefix('[').removesuffix(']').split(', ')
  return list(map(lambda s: s[1:][:-1], list_of_genres))

In [17]:
dataset["genres"] = dataset["genres"].apply(genres_to_list)

## Make the transactions that would like to compare

In [18]:
transactions = []
for _, row in dataset.iterrows():
    # transaction = set([row['author']] + row['genres'] + [str(row['publishDecade']), str(row['pages_cat']), str(row['price_cat']), str(row['rating_cat'])])
    # transactions.append(list(transaction))
    transactions.append(row['genres'])
transactions

[['Nonfiction'],
 ['Fiction',
  'Classics',
  'Literature',
  'Novels',
  'American',
  '20th Century',
  'Literary Fiction',
  'Classic Literature',
  'The United States Of America',
  'Modern Classics'],
 ['Fantasy',
  'Fiction',
  'Epic Fantasy',
  'High Fantasy',
  'Science Fiction Fantasy',
  'Science Fiction',
  'Epic',
  'Speculative Fiction',
  'War',
  'Audiobook'],
 ['Mystery',
  'Fiction',
  'Thriller',
  'Suspense',
  'Mystery Thriller',
  'Crime',
  'Adult',
  'Horror',
  'Adult Fiction',
  'Ghosts'],
 ['Fiction',
  'Southern',
  'Audiobook',
  'Contemporary',
  'Humor',
  'Literature',
  'Dogs',
  'Southern Gothic'],
 ['Africa', 'Botswana', 'Mystery', 'Fiction', 'The World', 'Crime'],
 ['Young Adult',
  'Science Fiction',
  'Dystopia',
  'Fantasy',
  'Fiction',
  'Adventure',
  'Romance',
  'Action',
  'Futuristic',
  'War'],
 ['Art',
  'Nonfiction',
  'Philosophy',
  'France',
  'Essays',
  'Theory',
  'Poetry',
  'French Literature',
  'Art History',
  'Classics'],
 ['F

## Run Association Rules with the current Data

In [19]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [20]:
# Codificación de las transacciones
encoder = TransactionEncoder()
encoder_array = encoder.fit_transform(transactions)
df = pd.DataFrame(encoder_array, columns=encoder.columns_)

# Aplicar Apriori
frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)
frequent_itemsets

# Reglas de asociación
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)
rules.sort_values("confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(Fantasy),(Fiction),0.336341,0.721822,0.303235,0.90157,1.24902,0.060457,2.826159,0.300414
0,(Fiction),(Fantasy),0.721822,0.336341,0.303235,0.420097,1.24902,0.060457,1.14443,0.716708


In [21]:
df.sum().sort_values(ascending=False)

Fiction         22959
Fantasy         10698
Romance          9237
Young Adult      8330
Contemporary     6342
                ...  
Museology           1
Muslimah            1
NSFW                1
Namibia             1
漫画                  1
Length: 944, dtype: int64

In [22]:
# rules[(rules['confidence'] < 1.0)].sort_values("confidence", ascending=False)
rules['antecedent_len'] = rules['antecedents'].apply(lambda x: len(x))
filtered_rules = rules[rules['antecedent_len'] >= 3]
filtered_rules[filtered_rules['confidence'] < 1].sort_values("confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
