In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Compact display defaults: show at most 10 rows and 10 columns of a frame,
# and allow lines up to 1000 characters wide before wrapping.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10
pd.options.display.width = 1000

## Load Data

In [3]:
# Read the books CSV from the working directory. index_col=None means no CSV
# column is used as the index, so the frame gets a default integer index.
dataset = pd.read_csv(
    './BestBooksEverClean_train_dataset.csv',
    index_col=None,
)
dataset

Unnamed: 0,title,series,author,genres,pages,publishYear,rating,likedPercent,price
0,Catch-22,Catch-22,Joseph Heller,"['Classics', 'Fiction', 'War', 'Historical Fic...",453,2004,3.98,90.0,3.32
1,The Catcher in the Rye,self-concluding,J.D. Salinger,"['Classics', 'Fiction', 'Young Adult', 'Litera...",277,2001,3.81,86.0,2.60
2,City of Bones,The Mortal Instruments,Cassandra Clare,"['Fantasy', 'Young Adult', 'Paranormal', 'Roma...",485,2007,4.10,92.0,6.29
3,The Stand,self-concluding,Stephen King,"['Horror', 'Fiction', 'Fantasy', 'Science Fict...",1153,1990,4.34,96.0,8.38
4,The Alchemist,self-concluding,Paulo Coelho,"['Fiction', 'Classics', 'Fantasy', 'Philosophy...",182,2014,3.88,87.0,13.22
...,...,...,...,...,...,...,...,...,...
45,Ender's Game,Ender's Saga,Orson Scott Card,"['Science Fiction', 'Fiction', 'Young Adult', ...",324,2004,4.30,95.0,4.60
46,The Giver,The Giver,Lois Lowry,"['Young Adult', 'Fiction', 'Classics', 'Dystop...",208,2006,4.13,94.0,7.15
47,Great Expectations,self-concluding,Charles Dickens,"['Classics', 'Fiction', 'Literature', 'Histori...",505,1998,3.78,87.0,0.85
48,Harry Potter and the Deathly Hallows,Harry Potter,J.K. Rowling,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",759,2007,4.62,98.0,2.85


## Categorize Numeric Columns

In [4]:
# Bucket the numeric columns into labelled categories.
# pd.cut intervals are closed on the right only, so without include_lowest=True
# a value equal to the first edge (0) would match no bin and become NaN.
dataset['rating_cat'] = pd.cut(
    dataset["rating"],
    bins=[0, 2, 2.5, 3, 4, 4.5, 5],
    labels=['Muy Malo', 'Malo', 'Mediocre', 'Bueno', 'Muy Bueno', 'Excelente'],
    include_lowest=True,
)
dataset['pages_cat'] = pd.cut(
    dataset["pages"],
    bins=[0, 150, 250, 500, 1000, float('Inf')],
    labels=['Corto', 'Medio', 'Moderado', 'Largo', 'Muy Largo'],
    include_lowest=True,
)
dataset['price_cat'] = pd.cut(
    dataset["price"],
    bins=[0, 5, 10, 20, 30, float('Inf')],
    labels=['Muy Barato', 'Barato', 'Buen Precio', 'Caro', 'Muy Caro'],
    include_lowest=True,
)
# Publication decade as an integer, e.g. 2004 -> 2000.
dataset['publishDecade'] = (dataset['publishYear'].astype(int) // 10) * 10

In [5]:
def genres_to_list(genres: str) -> list:
  """Parse the string form of a list of genre names into a real list.

  The input is expected to look like ``"['Classics', 'Fiction']"``. It is
  parsed as a Python literal, so an empty list yields ``[]``, genre names may
  contain commas or apostrophes, and spacing between items does not matter.

  Args:
    genres: String representation of a list of genre names.

  Returns:
    The genre names as a list of strings, in their original order.
  """
  import ast  # local import keeps this cell runnable on its own

  text = genres.strip()
  try:
    parsed = ast.literal_eval(text)
  except (ValueError, SyntaxError):
    # Not a valid Python literal (e.g. unquoted names): split on commas,
    # strip brackets/quotes/whitespace and drop empty pieces.
    inner = text.removeprefix('[').removesuffix(']')
    pieces = (piece.strip().strip('\'"') for piece in inner.split(','))
    return [piece for piece in pieces if piece]

  if isinstance(parsed, (list, tuple, set)):
    return [str(genre) for genre in parsed]
  # A single literal (e.g. one quoted name) is treated as a one-item list.
  return [str(parsed)]

In [6]:
# Replace each stringified genre list with a real Python list.
# Only values that are still strings are parsed, so re-running this cell after
# the column has already been converted leaves it unchanged instead of raising.
dataset["genres"] = dataset["genres"].apply(
    lambda raw: genres_to_list(raw) if isinstance(raw, str) else raw
)

In [7]:
# One transaction per book: its genre list plus the publication decade as a
# string item. Author and the *_cat bucket columns are not part of the basket.
transactions = [
    genre_list + [str(decade)]
    for genre_list, decade in zip(dataset['genres'], dataset['publishDecade'])
]
transactions

[['Classics',
  'Fiction',
  'War',
  'Historical Fiction',
  'Humor',
  'Literature',
  'Novels',
  'Unfinished',
  'American',
  'Historical',
  '2000'],
 ['Classics',
  'Fiction',
  'Young Adult',
  'Literature',
  'School',
  'Novels',
  'Coming Of Age',
  'American',
  'High School',
  'Read For School',
  '2000'],
 ['Fantasy',
  'Young Adult',
  'Paranormal',
  'Romance',
  'Urban Fantasy',
  'Fiction',
  'Vampires',
  'Supernatural',
  'Angels',
  'Magic',
  '2000'],
 ['Horror',
  'Fiction',
  'Fantasy',
  'Science Fiction',
  'Post Apocalyptic',
  'Thriller',
  'Dystopia',
  'Apocalyptic',
  'Audiobook',
  'Classics',
  '1990'],
 ['Fiction',
  'Classics',
  'Fantasy',
  'Philosophy',
  'Novels',
  'Spirituality',
  'Literature',
  'Self Help',
  'Inspirational',
  'Adventure',
  '2010'],
 ['Fantasy',
  'Young Adult',
  'Mythology',
  'Fiction',
  'Middle Grade',
  'Adventure',
  'Childrens',
  'Urban Fantasy',
  'Greek Mythology',
  'Magic',
  '2000'],
 ['Childrens',
  'Picture

In [8]:
# Turn the per-book genre lists into an indicator frame with one column per genre.
# Step 1: explode the lists so every genre gets its own row (the book's index is repeated).
genres_expanded = dataset['genres'].explode()
# Step 2: build dummy columns, then sum the rows sharing an index to fold them
# back into a single row per book.
genres_dummies = (
    pd.get_dummies(genres_expanded)
    .groupby(level=0)
    .sum()
)
# Note: the result is not joined back onto `dataset` in this cell.

In [9]:
# Widen the display limits: no cap on rows (None), up to 500 columns,
# and lines up to 1000 characters wide.
pd.options.display.max_rows = None
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [10]:
# Encode the transactions as a frame with one column per distinct item
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_trans = pd.DataFrame(te_ary, columns=te.columns_)

# Run Apriori: keep itemsets with support of at least 0.3
frequent_itemsets = apriori(
    df_trans,
    min_support=0.3,
    use_colnames=True,
)

# Association rules with confidence of at least 0.1, most confident first
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.1,
)
rules.sort_values(by="confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
36,(Novels),(Fiction),0.48,1.0,0.48,1.0,1.0,0.0,inf,0.0
98,"(School, Fiction)",(Classics),0.3,0.8,0.3,1.0,1.25,0.06,inf,0.285714
116,"(Fantasy, Young Adult)",(Fiction),0.4,1.0,0.4,1.0,1.0,0.0,inf,0.0
33,(Historical Fiction),(Fiction),0.36,1.0,0.36,1.0,1.0,0.0,inf,0.0
38,(School),(Fiction),0.3,1.0,0.3,1.0,1.0,0.0,inf,0.0
41,(Young Adult),(Fiction),0.52,1.0,0.52,1.0,1.0,0.0,inf,0.0
123,"(Novels, Literature)",(Fiction),0.4,1.0,0.4,1.0,1.0,0.0,inf,0.0
28,(Fantasy),(Fiction),0.64,1.0,0.64,1.0,1.0,0.0,inf,0.0
46,"(2000, Classics)",(Fiction),0.46,1.0,0.46,1.0,1.0,0.0,inf,0.0
106,"(Young Adult, Classics)",(Fiction),0.4,1.0,0.4,1.0,1.0,0.0,inf,0.0


In [11]:
# Column-wise totals of the encoded transactions (how many transactions
# contain each item), largest first.
item_counts = df_trans.sum(axis=0)
item_counts.sort_values(ascending=False)

Fiction                    50
Classics                   40
Fantasy                    32
2000                       32
Literature                 26
Young Adult                26
Novels                     24
Adventure                  19
Historical Fiction         18
School                     15
Childrens                  14
Romance                    13
Historical                 13
Audiobook                  12
Science Fiction            11
Science Fiction Fantasy     9
1990                        8
Middle Grade                8
Dystopia                    8
Read For School             8
19th Century                7
Adult                       7
Classic Literature          6
2010                        6
War                         6
American                    5
Magic                       5
High School                 4
Gothic                      4
Philosophy                  4
Poetry                      4
Animals                     4
Contemporary                4
Adult Fict

In [12]:
# Record how many items each rule's antecedent contains, keep the rules with
# at least three, and show those with confidence below 1, most confident first.
rules['antecedent_len'] = rules['antecedents'].apply(len)
has_long_antecedent = rules['antecedent_len'] >= 3
filtered_rules = rules[has_long_antecedent]
below_full_confidence = filtered_rules['confidence'] < 1
filtered_rules[below_full_confidence].sort_values(by="confidence", ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len
128,"(Novels, Fiction, Literature)",(Classics),0.4,0.8,0.38,0.95,1.1875,0.06,4.0,0.263158,3
129,"(Novels, Fiction, Classics)",(Literature),0.42,0.52,0.38,0.904762,1.739927,0.1616,5.04,0.733212,3
131,"(Fiction, Literature, Classics)",(Novels),0.5,0.48,0.38,0.76,1.583333,0.14,2.166667,0.736842,3
