In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/processed/books_dedup.csv')

In [3]:
df.head()

Unnamed: 0,title,author,rating,rating_count,description,isbn,image,book_url,pages,published_year,language,genres,source
0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,4.35,9953205.0,Winning means fame and fortune. Losing means c...,9780439023481.0,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/2767052-th...,374.0,2008,English,"Young Adult, Dystopia, Fiction, Fantasy, Scien...",web_scraping
1,Pride and Prejudice,Jane Austen,4.3,4827168.0,"Since its immediate success in 1813, Pride and...",9781441341709.0,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/1885.Pride...,279.0,1813,English,"Classics, Romance, Fiction, Historical Fiction...",web_scraping
2,To Kill a Mockingbird,Harper Lee,4.26,6911906.0,"""Shoot all the bluejays you want, if you can h...",9780060935467.0,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/2657.To_Ki...,323.0,1960,English,"Classics, Fiction, Historical Fiction, School,...",web_scraping
3,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.5,3808735.0,It's official: the evil Lord Voldemort has ret...,9780439358064.0,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/58613451-h...,896.0,2003,English,"Fantasy, Fiction, Young Adult, Harry Potter, M...",web_scraping
4,The Book Thief,Markus Zusak,4.39,2890214.0,Librarian's note: An alternate cover edition c...,,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/19063.The_...,592.0,2005,English,"Historical Fiction, Fiction, Young Adult, Clas...",web_scraping


In [4]:
df['genres'].nunique()

1275

In [5]:
df['genres'].value_counts()

genres
Fiction, Hercule Poirot (Fictitious character), Private investigators    8
Fantasy, Fiction, Young Adult, Harry Potter, Magic                       6
Young Adult, Fantasy, Vampires, Paranormal, Romance                      6
Young Adult, Dystopia, Fiction, Fantasy, Science Fiction                 4
Fantasy, Classics, Fiction, Young Adult, Childrens                       4
                                                                        ..
Early works to 1800, Aesthetics, Poetry                                  1
Expositions, Catalogs, Exhibitions                                       1
Ethics, Philosophy, Translations into English                            1
Constitutions, united states, Constitutions, United States               1
Economics, Capital, Communism                                            1
Name: count, Length: 1275, dtype: int64

In [6]:
df["genres_list"] = (
    df["genres"]
    .fillna("")
    .str.split(",")
    .apply(lambda xs: [x.strip().lower() for x in xs if x.strip()])
)

In [7]:
df["genres_list"].value_counts()

genres_list
[fiction, hercule poirot (fictitious character), private investigators]    8
[fantasy, fiction, young adult, harry potter, magic]                       6
[young adult, fantasy, vampires, paranormal, romance]                      6
[young adult, dystopia, fiction, fantasy, science fiction]                 4
[fantasy, classics, fiction, young adult, childrens]                       4
                                                                          ..
[early works to 1800, aesthetics, poetry]                                  1
[expositions, catalogs, exhibitions]                                       1
[ethics, philosophy, translations into english]                            1
[constitutions, united states, constitutions, united states]               1
[economics, capital, communism]                                            1
Name: count, Length: 1276, dtype: int64

In [8]:
STOP_GENRES = {
    # Generic terms with no discriminative value
    "fiction",
    "novel",
    "novels",
    "books",
    "general",

    # Character names / IP-based labels
    "harry potter",
    "hercule poirot",

    # Platform-specific or editorial labels
    "book club",
    "open library staff picks",

    # Format or edition information (not content-related)
    "audiobook",
    "translations into english",

    # Geographic, educational, or classification noise
    "united states",
    "school",

    # Temporal or meta-descriptive labels
    "early works to 1800",
    "description and travel",
}

In [9]:
GENRE_TO_TAG = {
    # Reading level / accessibility
    "young adult": "easy-read",
    "children": "easy-read",
    "juvenile fiction": "easy-read",
    "middle grade": "easy-read",
    "short stories": "short",

    # Mood / emotional tone
    "romance": "romantic",
    "drama": "emotional",
    "humor": "light",
    "poetry": "poetic",

    # Speculative / imaginative elements
    "fantasy": "fantasy",
    "magic": "fantasy",
    "paranormal": "fantasy",
    "vampires": "fantasy",
    "mythology": "fantasy",
    "science fiction": "sci-fi",
    "dystopia": "dark",
    "horror": "dark",

    # Narrative pace / tension
    "thriller": "fast-paced",
    "adventure": "fast-paced",
    "mystery": "suspenseful",
    "crime": "suspenseful",
    "mystery thriller": "suspenseful",
    "private investigators": "suspenseful",

    # Intellectual depth / themes
    "classics": "classic",
    "classic literature": "classic",
    "literature": "literary",
    "philosophy": "deep",
    "ethics": "deep",
    "politics": "deep",
    "economics": "deep",
    "psychology": "psychological",

    # Time period / historical context
    "historical fiction": "historical",
    "history": "historical",
    "historical": "historical",

    # Real-world topics
    "biography": "real-life",
    "nonfiction": "real-life",
    "social life and customs": "real-life",

    # Nature and travel
    "animals": "nature",
    "travel": "travel",
}

In [10]:
def genres_to_tags(genres_list, max_tags=5):
    tags = []
    for g in genres_list:
        if g in STOP_GENRES:
            continue
        if g in GENRE_TO_TAG:
            tags.append(GENRE_TO_TAG[g])

    seen = set()
    uniq = []
    for t in tags:
        if t not in seen:
            seen.add(t)
            uniq.append(t)

    return uniq[:max_tags]

In [11]:
df["tags"] = df["genres_list"].apply(genres_to_tags)
df["tags_str"] = df["tags"].apply(lambda xs: ", ".join(xs))

In [12]:
df["tag_count"] = df["tags"].apply(len)
df["tag_count"].value_counts().sort_index()

tag_count
0    493
1    385
2    229
3    202
4     58
5      3
Name: count, dtype: int64

In [13]:
def enrich_tags(row, min_tags=3, max_tags=5):
    tags = list(row["tags"]) 
    tags = list(dict.fromkeys(tags))  

    # ---------- pages ----------
    pages = row.get("pages")
    if pages and pages <= 250:
        tags.append("short")
    if pages and pages <= 350:
        tags.append("easy-read")

    # ---------- popular ----------
    rating = row.get("rating")
    rating_count = row.get("rating_count")
    if rating and rating >= 4.0 and rating_count and rating_count >= 1000:
        tags.append("popular")

    # ---------- mood ----------
    if not any(t in tags for t in ["dark", "deep"]):
        tags.append("light")

    # ---------- clean ----------3
    tags = list(dict.fromkeys(tags))

    if len(tags) < min_tags:
        if "easy-read" not in tags:
            tags.append("easy-read")
        if "light" not in tags:
            tags.append("light")

    return tags[:max_tags]

In [14]:
df["tags"] = df.apply(enrich_tags, axis=1)
df["tag_count"] = df["tags"].apply(len)
df["tag_count"].value_counts().sort_index()

tag_count
2    290
3    499
4    316
5    265
Name: count, dtype: int64

In [15]:
def final_touch(row):
    tags = list(row["tags"])

    if len(tags) == 2:
        if row.get("pages") and row["pages"] <= 300:
            tags.append("easy-read")
        elif not any(t in tags for t in ["dark", "deep"]):
            tags.append("light")

    return tags[:5]

In [16]:
df["tags"] = df.apply(final_touch, axis=1)
df["tag_count"] = df["tags"].apply(len)
df["tag_count"].value_counts().sort_index()

tag_count
3    789
4    316
5    265
Name: count, dtype: int64

In [17]:
df["tags"] = df["genres_list"].apply(genres_to_tags)
df["tags_str"] = df["tags"].apply(lambda xs: ", ".join(xs))

In [18]:
df["tag_count"].describe()

count    1370.000000
mean        3.617518
std         0.789624
min         3.000000
25%         3.000000
50%         3.000000
75%         4.000000
max         5.000000
Name: tag_count, dtype: float64

In [19]:
df.head()

Unnamed: 0,title,author,rating,rating_count,description,isbn,image,book_url,pages,published_year,language,genres,source,genres_list,tags,tags_str,tag_count
0,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,4.35,9943135.0,,9780439023481.0,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/2767052-th...,374.0,,,"Young Adult, Dystopia, Fiction, Fantasy, Scien...",web_scraping,"[young adult, dystopia, fiction, fantasy, scie...","[easy-read, dark, fantasy, sci-fi]","easy-read, dark, fantasy, sci-fi",5
1,Pride and Prejudice,Jane Austen,4.3,4821169.0,,9781441341709.0,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/1885.Pride...,279.0,,,"Classics, Romance, Fiction, Historical Fiction...",web_scraping,"[classics, romance, fiction, historical fictio...","[classic, romantic, historical]","classic, romantic, historical",5
2,To Kill a Mockingbird,Harper Lee,4.26,6905680.0,,9780060935467.0,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/2657.To_Ki...,323.0,,,"Classics, Fiction, Historical Fiction, School,...",web_scraping,"[classics, fiction, historical fiction, school...","[classic, historical, literary]","classic, historical, literary",5
3,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.5,3805184.0,,9780439358064.0,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/58613451-h...,896.0,,,"Fantasy, Fiction, Young Adult, Harry Potter, M...",web_scraping,"[fantasy, fiction, young adult, harry potter, ...","[fantasy, easy-read]","fantasy, easy-read",4
4,The Book Thief,Markus Zusak,4.39,2887607.0,,,https://m.media-amazon.com/images/S/compressed...,https://www.goodreads.com/book/show/19063.The_...,592.0,,,"Historical Fiction, Fiction, Young Adult, Clas...",web_scraping,"[historical fiction, fiction, young adult, cla...","[historical, easy-read, classic]","historical, easy-read, classic",5


In [20]:
df["tags_str"] = df["tags"].map(lambda xs: ", ".join(xs) if isinstance(xs, list) else "")
df = df.drop(columns=["tags"], errors="ignore") 

In [21]:
df["tags_str"].isna().sum()

np.int64(0)

In [22]:
df.to_csv(
    "../data/processed/final_data.csv",
    index=False,
    encoding="utf-8-sig"
)