#### Imports

In [2]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from tqdm import tqdm
import json
import time
import re
import random
import pandas as pd
from secret_key import google_key

##### LLM instance initialization

In [3]:
# One Gemini chat model shared by every chain in this notebook.
# temperature=0 asks for the least random sampling the model offers.
llm = ChatGoogleGenerativeAI(
    google_api_key=google_key,
    model="gemini-2.5-flash-lite",
    temperature=0,
)

##### An example of what the language chain will do

In [4]:
# Single-book example: ask for a book's original language as one English word.
# {Title} and {Author} are filled in from the dict passed to `chain1.invoke`.
prompt1 = ChatPromptTemplate.from_template(
    "What is the original language of {Title} written by {Author}? Reply with a single word in English."
)

# Pipe the rendered prompt straight into the model; no output parser is attached,
# so the chain returns the model's message object rather than a plain string.
chain1 = prompt1 | llm

In [5]:
# Try the example chain on one Arabic-titled book; `.content` is the text of the reply.
example_book = {"Title": "أهل الكهف", "Author": "توفيق الحكيم"}
response = chain1.invoke(example_book)
response.content

'Arabic'

#### Data Load & Preprocessing

In [6]:
# Read the book list from disk and preview a single random row.
goodreads_csv = "goodreads_data.csv"
df = pd.read_csv(goodreads_csv)
df.sample()

Unnamed: 0,Book Id,Title,Author,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,Number of Pages,Year Published,Original Publication Year,Date Read,Date Added,Bookshelves,My Review
198,16429619,"The Kiss of Deception (The Remnant Chronicles,...",Mary E. Pearson,,,,3,3.92,Henry Holt,492.0,2014.0,2014.0,22/01/2021,7/01/2021,read,


In [7]:
# Columns that are not used by the labelling steps below (only Title and Author are sent to the model).
unused_columns = ["ISBN", "ISBN13", "My Review", "Additional Authors", "Publisher"]
df = df.drop(columns=unused_columns)
df.sample()

Unnamed: 0,Book Id,Title,Author,My Rating,Average Rating,Number of Pages,Year Published,Original Publication Year,Date Read,Date Added,Bookshelves
305,968,"The Da Vinci Code (Robert Langdon, #2)",Dan Brown,0,3.93,489.0,2006.0,2003.0,,31/12/2020,to-read


#### A function to invoke the chain while respecting the rate limit

In [8]:
def invoke_chain(chain, input_data, max_retries=3, base_delay=1, max_delay=60):
    """Invoke ``chain`` and retry with backoff when the API reports a rate limit.

    Args:
        chain: Any object exposing ``invoke(input_data)``.
        input_data: Payload forwarded unchanged to ``chain.invoke``.
        max_retries: Total number of attempts. Values below 1 are treated as 1
            so the chain is always invoked at least once (previously the
            function silently returned ``None`` without calling it).
        base_delay: Base of the exponential backoff, in seconds.
        max_delay: Upper bound, in seconds, for any single wait.

    Returns:
        Whatever ``chain.invoke`` returns on the first successful attempt.

    Raises:
        Exception: The original exception, re-raised unchanged, when it is not
            a rate-limit error or when the last attempt is also rate limited.
    """
    attempts = max(1, max_retries)
    rate_limit_markers = ("rate_limit", "rate limit", "resourceexhausted", "resource_exhausted")

    for attempt in range(attempts):
        try:
            return chain.invoke(input_data)
        except Exception as e:
            message = str(e)
            lowered = message.lower()
            rate_limited = "429" in message or any(
                marker in lowered for marker in rate_limit_markers
            )

            if not rate_limited:
                # Bare `raise` re-raises the active exception with its traceback intact.
                raise

            if attempt >= attempts - 1:
                print(f"Max retries reached. Error: {e}")
                raise

            # Exponential backoff plus jitter so parallel callers do not retry in lockstep.
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)

            # The API often states how long to wait ("Please retry in 31.18s").
            # Retrying sooner than that is guaranteed to fail, so honour the hint.
            hint = re.search(r"retry in\s+(\d+(?:\.\d+)?)\s*s", lowered)
            if hint:
                delay = max(delay, float(hint.group(1)))

            delay = min(delay, max_delay)
            print(f"Rate limited. Retrying in {delay:.1f} seconds...")
            time.sleep(delay)

#### A function to clean the parsed LLM response

In [9]:
def parse_llm_response(response, num_items, categories):
    """Turn a raw LLM reply into a list of exactly ``num_items`` labels.

    Parsing strategy, in order:
      1. Strip Markdown code fences and parse the reply as a JSON array.
      2. If the reply wraps the array in extra prose, parse the first
         ``[...]`` span found in the text.
      3. Otherwise scan the text for the known ``categories``
         (case-insensitive) and keep them in the order they appear.

    Positions that cannot be filled are set to ``"Unknown"`` rather than being
    padded with a repeated label, so missing answers are never invented.

    Args:
        response: Text returned by the model.
        num_items: Number of labels expected (one per book in the batch).
        categories: Known label names, used only by the keyword fallback.

    Returns:
        A list of length ``num_items``.
    """
    if not isinstance(response, str):
        response = "" if response is None else str(response)

    cleaned = re.sub(r"```json|```", "", response.strip()).strip()

    # Candidate JSON documents: the whole reply first, then an embedded array.
    candidates = [cleaned]
    embedded = re.search(r"\[.*\]", cleaned, re.DOTALL)
    if embedded and embedded.group(0) != cleaned:
        candidates.append(embedded.group(0))

    for candidate in candidates:
        try:
            result = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(result, list) and result:
            labels = result[:num_items]
            return labels + ["Unknown"] * (num_items - len(labels))

    # Keyword fallback: map lower-cased names back to their canonical spelling.
    canonical = {name.lower(): name for name in categories if name}
    found = []
    if canonical:
        # Longest names first so a category that contains another one wins.
        pattern = "|".join(
            re.escape(key) for key in sorted(canonical, key=len, reverse=True)
        )
        found = [
            canonical[match.group(0)]
            for match in re.finditer(pattern, cleaned.lower())
        ]

    found = found[:num_items]
    return found + ["Unknown"] * (num_items - len(found))

#### A function to add a new column (in batches) to the existing data frame

In [10]:
def add_column(df, classes, category, chain, batch_size=10, delay_between_batches=2):
    """Label every row of ``df`` with ``chain`` and store the labels in ``df[category]``.

    Rows are sent to the chain in batches as a numbered "Title by Author"
    list. A batch whose request or parsing fails is labelled ``"Unknown"`` so
    that one bad batch does not abort the whole run.

    Args:
        df: Frame with ``Title`` and ``Author`` columns. It is modified in
            place (the new column is assigned on it), so pass a copy to keep
            the original untouched.
        classes: Allowed label names; sent to the prompt as ``genres`` and
            used by ``parse_llm_response`` as fallback keywords.
        category: Name of the column to create or overwrite.
        chain: Chain passed to ``invoke_chain``; called with the keys
            ``books`` and ``genres``.
        batch_size: Number of rows per request; must be at least 1.
        delay_between_batches: Seconds to sleep between consecutive requests.

    Returns:
        The same ``df`` object, with ``category`` added.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    all_classes = []
    genres_str = ", ".join(classes)

    for start in tqdm(range(0, len(df), batch_size)):
        batch = df.iloc[start:start + batch_size]

        # Number the books 1..len(batch) by position. Using the index label
        # instead makes later batches start at 11, 21, ... and fails outright
        # on a non-integer index.
        book_text = "\n".join(
            f"{position}. {title} by {author}"
            for position, (title, author) in enumerate(
                zip(batch["Title"], batch["Author"]), start=1
            )
        )

        try:
            response = invoke_chain(
                chain,
                {"books": book_text, "genres": genres_str}
            )
            all_classes.extend(parse_llm_response(response, len(batch), classes))

        except Exception as e:
            # Deliberate best effort: record the failure and keep going.
            print(f"Error in batch {start // batch_size + 1}: {e}")
            all_classes.extend(["Unknown"] * len(batch))

        # No need to wait after the final batch.
        if start + batch_size < len(df):
            time.sleep(delay_between_batches)

    df[category] = all_classes
    return df


##### Testing the language chain by extracting the language of each book

In [11]:

# Known language names. The prompt below has no placeholder for this list, so it
# does not constrain the model; it only serves as fallback keywords when the
# reply cannot be parsed as JSON. Names must therefore be spelled the way the
# model writes them ("Portuguese", not "Portugese").
languages_list = ["English", "German", "Russian", "Arabic", "Swedish", "Japanese", "Portuguese"]

language_prompt = ChatPromptTemplate.from_template("""
Determine the language of these books based on their titles and authors.

Books:
{books}

For each book, return only the language name in English.
Return as a JSON array with one language per book in the same order.
Example: ["English", "French", "Spanish"]

Only return the array, no other text.
""")

# StrOutputParser turns the model's message into the plain string that
# parse_llm_response expects.
language_chain = language_prompt | llm | StrOutputParser()



In [12]:
# Label a copy so the previous frame object is not modified, then rebind `df`.
df = add_column(
    df.copy(),
    classes=languages_list,
    category="Language",
    chain=language_chain,
)

 47%|████▋     | 15/32 [00:40<00:45,  2.66s/it]Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15, model: gemini-2.5-flash-lite
Please retry in 31.179498861s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash-lite"
  }
  quota_dimensions {
   

In [13]:
# Spot-check ten random rows of the newly labelled frame.
df.sample(n=10)

Unnamed: 0,Book Id,Title,Author,My Rating,Average Rating,Number of Pages,Year Published,Original Publication Year,Date Read,Date Added,Bookshelves,Language
268,3777732,"City of Glass (The Mortal Instruments, #3)",Cassandra Clare,5,4.27,541.0,2009.0,2009.0,1/07/2020,19/08/2020,read,English
2,59651555,"Stuck with You (The STEMinist Novellas, #2)",Ali Hazelwood,2,3.56,127.0,2022.0,2022.0,20/09/2025,20/09/2025,read,English
58,3335314,التفكير فريضة إسلامية,عباس محمود العقاد,4,3.94,152.0,2008.0,1962.0,15/04/2024,8/04/2024,read,Arabic
288,35702241,The Shadows Between Us (The Shadows Between Us...,Tricia Levenseller,3,3.82,326.0,2020.0,2020.0,2/04/2021,31/03/2021,read,English
0,5935144,خان الخليلي,Naguib Mahfouz,4,3.92,280.0,2006.0,1945.0,2/11/2025,2/01/2024,read,Arabic
235,28686840,Holding Up the Universe,Jennifer Niven,3,3.67,391.0,2016.0,2016.0,1/01/2019,23/06/2020,read,English
220,27883214,"Caraval (Caraval, #1)",Stephanie Garber,4,3.97,407.0,2017.0,2016.0,21/08/2021,9/04/2021,read,English
147,58064046,Gallant,V.E. Schwab,3,3.7,338.0,2022.0,2022.0,1/08/2022,21/12/2021,read,English
309,1582996,"City of Ashes (The Mortal Instruments, #2)",Cassandra Clare,4,4.1,453.0,2008.0,2008.0,19/08/2020,27/07/2020,read,English
40,198281740,The Life Impossible,Matt Haig,2,3.48,324.0,2024.0,2024.0,1/11/2024,23/10/2024,read,English


In [14]:
# How many books were assigned to each language.
df["Language"].value_counts()

Language
English       246
Arabic         49
Russian         8
Japanese        4
Persian         2
Swedish         2
French          1
German          1
Portuguese      1
Name: count, dtype: int64

#### Applying the same chain to extract book genres

In [15]:

# Allowed genres: injected into the prompt as {genres} and reused as fallback
# keywords when the reply cannot be parsed as JSON.
genre_list = [
    "Non-Fiction", "Fantasy", "Science Fiction", "Romance", "Mystery",
    "Thriller", "Young Adult", "Poetry", "Classic"
]

# The example array uses only labels from `genre_list`; an example containing a
# label outside the list contradicts the "EXACTLY one of these genres" rule.
genre_prompt = ChatPromptTemplate.from_template("""
You are a book classification model.

Given the following books (title and author), classify each one into EXACTLY one of these genres:
{genres}

Books:
{books}

Return ONLY a JSON array containing the genre for each book in the same order.
Example: ["Non-Fiction", "Romance", "Fantasy"]

Only return the JSON array. No explanation.
""")

# StrOutputParser turns the model's message into the plain string that
# parse_llm_response expects.
genre_chain = genre_prompt | llm | StrOutputParser()



In [16]:
# Same batching helper as for languages, now with the genre list and genre chain.
df = add_column(
    df.copy(),
    classes=genre_list,
    category="Genre",
    chain=genre_chain,
)

 47%|████▋     | 15/32 [00:39<00:44,  2.61s/it]Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15, model: gemini-2.5-flash-lite
Please retry in 25.277116696s. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash-lite"
  }
  quota_dimensions {
   

In [17]:
# Spot-check ten random rows, now including the Genre column.
df.sample(n=10)

Unnamed: 0,Book Id,Title,Author,My Rating,Average Rating,Number of Pages,Year Published,Original Publication Year,Date Read,Date Added,Bookshelves,Language,Genre
219,36329818,"Legendary (Caraval, #2)",Stephanie Garber,5,4.1,451.0,2018.0,2018.0,28/08/2021,16/07/2021,read,English,Fantasy
83,7651652,E.S.P.,أحمد خالد توفيق,0,3.6,329.0,2009.0,2009.0,,21/12/2023,read,Arabic,Science Fiction
303,17699853,"Chain of Gold (The Last Hours, #1)",Cassandra Clare,3,4.37,582.0,2020.0,2020.0,24/12/2020,21/12/2020,read,English,Fantasy
180,16130537,The Humans,Matt Haig,0,4.08,285.0,2013.0,2013.0,,5/12/2021,to-read,English,Science Fiction
126,23272028,"A Study in Charlotte (Charlotte Holmes, #1)",Brittany Cavallaro,0,3.75,336.0,2016.0,2016.0,,22/02/2023,to-read,English,Young Adult
250,22540125,"After We Collided (After, #2)",Anna Todd,1,3.72,674.0,2014.0,2013.0,1/01/2018,29/04/2021,read,English,Romance
156,59009928,"There Are No Saints (Sinners, #1)",Sophie Lark,1,3.79,258.0,2021.0,2021.0,12/06/2022,12/02/2022,read,English,Romance
167,15717943,"Hopeless (Hopeless, #1)",Colleen Hoover,3,4.22,410.0,2012.0,2012.0,20/02/2022,6/02/2022,read,English,Romance
12,58784475,"Tomorrow, and Tomorrow, and Tomorrow",Gabrielle Zevin,3,4.12,401.0,2022.0,2022.0,6/07/2025,11/06/2025,read,English,Romance
287,43263680,"Ninth House (Alex Stern, #1)",Leigh Bardugo,0,4.0,461.0,2019.0,2019.0,,14/04/2021,to-read,English,Young Adult


In [18]:
# How many books were assigned to each genre.
df["Genre"].value_counts()

Genre
Classic               72
Fantasy               68
Young Adult           60
Romance               45
Non-Fiction           31
Mystery               20
Thriller               8
Science Fiction        8
Poetry                 1
Historical Fiction     1
Name: count, dtype: int64

In [19]:
# Save the labelled frame; with pandas' defaults the row index is written as the first column.
labelled_csv = "goodreads.csv"
df.to_csv(labelled_csv)