In [141]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

## Load Data

In [142]:
dataset = pd.read_csv('./BestBooksEverClean_train_dataset.csv', index_col=None)
dataset

Unnamed: 0,title,series,author,genres,pages,publishYear,rating,likedPercent,price
0,Cyrano de Bergerac,self-concluding,Edmond Rostand,"['Classics', 'Plays', 'Fiction', 'Drama', 'Fra...",240,2003,4.07,94.0,3.16
1,Winnie-the-Pooh,Winnie-the-Pooh,A.A. Milne,"['Classics', 'Childrens', 'Fiction', 'Fantasy'...",145,2001,4.34,96.0,5.30
2,Walks Away Woman,self-concluding,Ki Longfellow,"['Survival', 'Adventure', 'Fiction', 'Literatu...",254,2013,4.33,98.0,6.08
3,Harriet the Spy,Harriet the Spy,Louise Fitzhugh,"['Childrens', 'Fiction', 'Young Adult', 'Middl...",300,2002,3.95,91.0,1.91
4,Bared to You,Crossfire,Sylvia Day,"['Romance', 'Erotica', 'Contemporary', 'Adult'...",334,2014,4.18,92.0,5.15
...,...,...,...,...,...,...,...,...,...
995,Fablehaven,Fablehaven,Brandon Mull,"['Fantasy', 'Young Adult', 'Middle Grade', 'Fi...",351,2006,4.09,93.0,6.04
996,Divine Secrets of the Ya-Ya Sisterhood,Ya Yas,Rebecca Wells,"['Fiction', 'Chick Lit', 'Contemporary', 'Adul...",383,2004,3.83,90.0,1.71
997,The Last Olympian,Percy Jackson and the Olympians,Rick Riordan,"['Fantasy', 'Young Adult', 'Mythology', 'Ficti...",381,2009,4.51,98.0,7.44
998,"I, Claudius",Claudius,Robert Graves,"['Historical Fiction', 'Fiction', 'Classics', ...",468,1989,4.23,95.0,3.38


## Categorize Numeric Cols

In [143]:
dataset['rating_cat'] = pd.cut(dataset["rating"], bins=[0, 2, 2.5, 3, 4, 4.5, 5], labels=['Muy Malo', 'Malo', 'Mediocre', 'Bueno', 'Muy Bueno', 'Excelente'])
dataset['pages_cat'] = pd.cut(dataset["pages"], bins=[0, 150, 250, 500, 1000, float('Inf')], labels=['Corto', 'Medio', 'Moderado', 'Largo', 'Muy Largo'])
dataset['price_cat'] = pd.cut(dataset["price"], bins=[0, 5, 10, 20, 30, float('Inf')], labels=['Barato', 'Buen Precio', 'Moderado', 'Caro', 'Muy Caro'])
dataset

Unnamed: 0,title,series,author,genres,pages,...,likedPercent,price,rating_cat,pages_cat,price_cat
0,Cyrano de Bergerac,self-concluding,Edmond Rostand,"['Classics', 'Plays', 'Fiction', 'Drama', 'Fra...",240,...,94.0,3.16,Muy Bueno,Medio,Barato
1,Winnie-the-Pooh,Winnie-the-Pooh,A.A. Milne,"['Classics', 'Childrens', 'Fiction', 'Fantasy'...",145,...,96.0,5.30,Muy Bueno,Corto,Buen Precio
2,Walks Away Woman,self-concluding,Ki Longfellow,"['Survival', 'Adventure', 'Fiction', 'Literatu...",254,...,98.0,6.08,Muy Bueno,Moderado,Buen Precio
3,Harriet the Spy,Harriet the Spy,Louise Fitzhugh,"['Childrens', 'Fiction', 'Young Adult', 'Middl...",300,...,91.0,1.91,Bueno,Moderado,Barato
4,Bared to You,Crossfire,Sylvia Day,"['Romance', 'Erotica', 'Contemporary', 'Adult'...",334,...,92.0,5.15,Muy Bueno,Moderado,Buen Precio
...,...,...,...,...,...,...,...,...,...,...,...
995,Fablehaven,Fablehaven,Brandon Mull,"['Fantasy', 'Young Adult', 'Middle Grade', 'Fi...",351,...,93.0,6.04,Muy Bueno,Moderado,Buen Precio
996,Divine Secrets of the Ya-Ya Sisterhood,Ya Yas,Rebecca Wells,"['Fiction', 'Chick Lit', 'Contemporary', 'Adul...",383,...,90.0,1.71,Bueno,Moderado,Barato
997,The Last Olympian,Percy Jackson and the Olympians,Rick Riordan,"['Fantasy', 'Young Adult', 'Mythology', 'Ficti...",381,...,98.0,7.44,Excelente,Moderado,Buen Precio
998,"I, Claudius",Claudius,Robert Graves,"['Historical Fiction', 'Fiction', 'Classics', ...",468,...,95.0,3.38,Muy Bueno,Moderado,Barato


In [144]:
dataset['author'] = 'Author_' + dataset['author']
dataset['series'] = 'Series_' + dataset['series']
dataset['publishYear'] = 'PublishYear_' + dataset['publishYear'].astype(str)
dataset

Unnamed: 0,title,series,author,genres,pages,...,likedPercent,price,rating_cat,pages_cat,price_cat
0,Cyrano de Bergerac,Series_self-concluding,Author_Edmond Rostand,"['Classics', 'Plays', 'Fiction', 'Drama', 'Fra...",240,...,94.0,3.16,Muy Bueno,Medio,Barato
1,Winnie-the-Pooh,Series_Winnie-the-Pooh,Author_A.A. Milne,"['Classics', 'Childrens', 'Fiction', 'Fantasy'...",145,...,96.0,5.30,Muy Bueno,Corto,Buen Precio
2,Walks Away Woman,Series_self-concluding,Author_Ki Longfellow,"['Survival', 'Adventure', 'Fiction', 'Literatu...",254,...,98.0,6.08,Muy Bueno,Moderado,Buen Precio
3,Harriet the Spy,Series_Harriet the Spy,Author_Louise Fitzhugh,"['Childrens', 'Fiction', 'Young Adult', 'Middl...",300,...,91.0,1.91,Bueno,Moderado,Barato
4,Bared to You,Series_Crossfire,Author_Sylvia Day,"['Romance', 'Erotica', 'Contemporary', 'Adult'...",334,...,92.0,5.15,Muy Bueno,Moderado,Buen Precio
...,...,...,...,...,...,...,...,...,...,...,...
995,Fablehaven,Series_Fablehaven,Author_Brandon Mull,"['Fantasy', 'Young Adult', 'Middle Grade', 'Fi...",351,...,93.0,6.04,Muy Bueno,Moderado,Buen Precio
996,Divine Secrets of the Ya-Ya Sisterhood,Series_Ya Yas,Author_Rebecca Wells,"['Fiction', 'Chick Lit', 'Contemporary', 'Adul...",383,...,90.0,1.71,Bueno,Moderado,Barato
997,The Last Olympian,Series_Percy Jackson and the Olympians,Author_Rick Riordan,"['Fantasy', 'Young Adult', 'Mythology', 'Ficti...",381,...,98.0,7.44,Excelente,Moderado,Buen Precio
998,"I, Claudius",Series_Claudius,Author_Robert Graves,"['Historical Fiction', 'Fiction', 'Classics', ...",468,...,95.0,3.38,Muy Bueno,Moderado,Barato


In [145]:
def genres_to_list(genres: str) -> list:
  list_of_genres = genres.removeprefix('[').removesuffix(']').split(', ')
  return list(map(lambda s: s[1:][:-1], list_of_genres))

In [146]:
dataset["genres"] = dataset["genres"].apply(genres_to_list)
dataset

Unnamed: 0,title,series,author,genres,pages,...,likedPercent,price,rating_cat,pages_cat,price_cat
0,Cyrano de Bergerac,Series_self-concluding,Author_Edmond Rostand,"[Classics, Plays, Fiction, Drama, France, Roma...",240,...,94.0,3.16,Muy Bueno,Medio,Barato
1,Winnie-the-Pooh,Series_Winnie-the-Pooh,Author_A.A. Milne,"[Classics, Childrens, Fiction, Fantasy, Animal...",145,...,96.0,5.30,Muy Bueno,Corto,Buen Precio
2,Walks Away Woman,Series_self-concluding,Author_Ki Longfellow,"[Survival, Adventure, Fiction, Literature, Hum...",254,...,98.0,6.08,Muy Bueno,Moderado,Buen Precio
3,Harriet the Spy,Series_Harriet the Spy,Author_Louise Fitzhugh,"[Childrens, Fiction, Young Adult, Middle Grade...",300,...,91.0,1.91,Bueno,Moderado,Barato
4,Bared to You,Series_Crossfire,Author_Sylvia Day,"[Romance, Erotica, Contemporary, Adult, Contem...",334,...,92.0,5.15,Muy Bueno,Moderado,Buen Precio
...,...,...,...,...,...,...,...,...,...,...,...
995,Fablehaven,Series_Fablehaven,Author_Brandon Mull,"[Fantasy, Young Adult, Middle Grade, Fiction, ...",351,...,93.0,6.04,Muy Bueno,Moderado,Buen Precio
996,Divine Secrets of the Ya-Ya Sisterhood,Series_Ya Yas,Author_Rebecca Wells,"[Fiction, Chick Lit, Contemporary, Adult Ficti...",383,...,90.0,1.71,Bueno,Moderado,Barato
997,The Last Olympian,Series_Percy Jackson and the Olympians,Author_Rick Riordan,"[Fantasy, Young Adult, Mythology, Fiction, Mid...",381,...,98.0,7.44,Excelente,Moderado,Buen Precio
998,"I, Claudius",Series_Claudius,Author_Robert Graves,"[Historical Fiction, Fiction, Classics, Histor...",468,...,95.0,3.38,Muy Bueno,Moderado,Barato


In [147]:
# Convertir la lista de géneros en un DataFrame binario
genres_expanded = dataset['genres'].explode()  # Convertir listas a filas individuales
genres_dummies = pd.get_dummies(genres_expanded).groupby(level=0).sum()  # Crear dummies y sumar para obtener binarios
# genres_expanded.head(1000)
# Unir de nuevo con el DataFrame original
dataset = dataset.join(genres_dummies)
dataset

Unnamed: 0,title,series,author,genres,pages,...,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,Zombies
0,Cyrano de Bergerac,Series_self-concluding,Author_Edmond Rostand,"[Classics, Plays, Fiction, Drama, France, Roma...",240,...,0,0,0,0,0
1,Winnie-the-Pooh,Series_Winnie-the-Pooh,Author_A.A. Milne,"[Classics, Childrens, Fiction, Fantasy, Animal...",145,...,1,0,0,0,0
2,Walks Away Woman,Series_self-concluding,Author_Ki Longfellow,"[Survival, Adventure, Fiction, Literature, Hum...",254,...,0,0,0,0,0
3,Harriet the Spy,Series_Harriet the Spy,Author_Louise Fitzhugh,"[Childrens, Fiction, Young Adult, Middle Grade...",300,...,1,0,0,0,0
4,Bared to You,Series_Crossfire,Author_Sylvia Day,"[Romance, Erotica, Contemporary, Adult, Contem...",334,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,Fablehaven,Series_Fablehaven,Author_Brandon Mull,"[Fantasy, Young Adult, Middle Grade, Fiction, ...",351,...,1,0,1,0,0
996,Divine Secrets of the Ya-Ya Sisterhood,Series_Ya Yas,Author_Rebecca Wells,"[Fiction, Chick Lit, Contemporary, Adult Ficti...",383,...,0,0,0,0,0
997,The Last Olympian,Series_Percy Jackson and the Olympians,Author_Rick Riordan,"[Fantasy, Young Adult, Mythology, Fiction, Mid...",381,...,1,0,0,0,0
998,"I, Claudius",Series_Claudius,Author_Robert Graves,"[Historical Fiction, Fiction, Classics, Histor...",468,...,0,0,0,0,0


In [148]:
dataset.columns

Index(['title', 'series', 'author', 'genres', 'pages', 'publishYear', 'rating',
       'likedPercent', 'price', 'rating_cat',
       ...
       'Womens', 'Womens Fiction', 'World War I', 'World War II', 'Writing',
       'Young Adult', 'Young Adult Contemporary', 'Young Adult Fantasy',
       'Young Adult Romance', 'Zombies'],
      dtype='object', length=338)

In [149]:
# Crear transacciones como listas de atributos presentes en cada libro
transacciones = dataset.drop("genres", axis=1).drop("price", axis=1).drop("rating", axis=1).drop("pages", axis=1).drop("likedPercent", axis=1)
transacciones

Unnamed: 0,title,series,author,publishYear,rating_cat,...,Young Adult,Young Adult Contemporary,Young Adult Fantasy,Young Adult Romance,Zombies
0,Cyrano de Bergerac,Series_self-concluding,Author_Edmond Rostand,PublishYear_2003,Muy Bueno,...,0,0,0,0,0
1,Winnie-the-Pooh,Series_Winnie-the-Pooh,Author_A.A. Milne,PublishYear_2001,Muy Bueno,...,1,0,0,0,0
2,Walks Away Woman,Series_self-concluding,Author_Ki Longfellow,PublishYear_2013,Muy Bueno,...,0,0,0,0,0
3,Harriet the Spy,Series_Harriet the Spy,Author_Louise Fitzhugh,PublishYear_2002,Bueno,...,1,0,0,0,0
4,Bared to You,Series_Crossfire,Author_Sylvia Day,PublishYear_2014,Muy Bueno,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,Fablehaven,Series_Fablehaven,Author_Brandon Mull,PublishYear_2006,Muy Bueno,...,1,0,1,0,0
996,Divine Secrets of the Ya-Ya Sisterhood,Series_Ya Yas,Author_Rebecca Wells,PublishYear_2004,Bueno,...,0,0,0,0,0
997,The Last Olympian,Series_Percy Jackson and the Olympians,Author_Rick Riordan,PublishYear_2009,Excelente,...,1,0,0,0,0
998,"I, Claudius",Series_Claudius,Author_Robert Graves,PublishYear_1989,Muy Bueno,...,0,0,0,0,0


In [150]:
# Codificación de las transacciones
te = TransactionEncoder()
te_ary = te.fit(transacciones).transform(transacciones)
df_trans = pd.DataFrame(te_ary, columns=te.columns_)

# Aplicar Apriori
frequent_itemsets = apriori(df_trans, min_support=0.1, use_colnames=True)

# Reglas de asociación
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

   antecedents consequents  support  confidence      lift
0          (e)         (a)    0.109    0.564767  2.956894
1          (a)         (e)    0.109    0.570681  2.956894
2          (i)         (a)    0.127    0.619512  3.243519
3          (a)         (i)    0.127    0.664921  3.243519
4          (n)         (a)    0.102    0.641509  3.358688
..         ...         ...      ...         ...       ...
23         (s)         (i)    0.100    0.709220  3.459609
24         (i)         (t)    0.115    0.560976  3.261486
25         (t)         (i)    0.115    0.668605  3.261486
26         (r)         (t)    0.105    0.589888  3.429579
27         (t)         (r)    0.105    0.610465  3.429579

[28 rows x 5 columns]
