In [1]:
import random
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Topic registry for the crawl: topic name -> {"keyword", "category"}.
# "keyword" is the search token substituted into the search URL and "category"
# is the label later attached to every book found under that topic.
# NOTE: the spelling "Probality" is kept as-is on purpose; it is used both as a
# lookup key and as an emitted category value, so renaming it changes behavior.
libgen_topics = {
    name: {"keyword": keyword, "category": category}
    for name, keyword, category in (
        # --- Math ---
        ("Math", "topicid307", "Math"),
        ("Optimization", "topicid136", "Math, Optimization"),
        ("Probality", "topicid119", "Math, Probality"),
        ("Algebra", "topicid114", "Math, Algebra"),
        # --- Literature (part 1) ---
        ("Children", "topicid106", "Literature, Children"),
        ("Comic", "topicid10", "Literature, Comic"),
        # --- Mixed, in the original insertion order ---
        ("Computer", "topicid6", "Computer"),
        ("CreativeThinking", "topicid204", "Psychology, CreativeThinking"),
        ("Databases", "topicid76", "Computer, Databases"),
        ("Diabetes", "topicid155", "Medicine, Diabetes"),
        ("Fantasy", "topicid112", "Literature, Fantasy"),
        ("Literature", "topicid164", "Literature"),
        ("LoveErotic", "topicid201", "Psychology, LoveErotic"),
        ("Medicine", "topicid147", "Medicine"),
        ("OperatingSystem", "topicid85", "Computer, OperatingSystem"),
        ("Pharmacology", "topicid173", "Medicine, Pharmacology"),
        ("Programming", "topicid87", "Computer, Programming"),
        ("Psychology", "topicid198", "Psychology"),
        ("Security", "topicid77", "Computer, Security"),
        ("Therapy", "topicid172", "Medicine, Therapy"),
    )
}

# Display the number of configured topics as the cell output.
len(libgen_topics)

20

In [3]:
# URL templates for libgen.is. The bracketed tokens ([SEARCH_VALUE], ...) are
# literal placeholders meant to be filled with str.replace(); they are plain
# strings (not f-strings) because nothing is interpolated at definition time.

# Search results page: 100 results per page, newest year first.
# get_book_by_topic() substitutes [SEARCH_VALUE] with a topic keyword.
search_url = (
    "https://libgen.is/search.php?&req=[SEARCH_VALUE]"
    "&res=100&phrase=1&view=simple"
    "&sort=year&sortmode=DESC"
)

# Author results page: 50 results per page, newest year first.
# Not referenced by the visible cells of this notebook.
search_by_author_url = (
    "https://libgen.is/[AUTHOR_URL]"
    "&res=50&phrase=1&view=simple"
    "&sort=year&sortmode=DESC"
)

# Detail-page templates keyed by MD5 hash.
# Not referenced by the visible cells of this notebook.
non_fiction_book_url = (
    "https://libgen.is/book/index.php?md5=[MD5_HASH]"
)

fiction_book_url = (
    "https://libgen.is/fiction/[MD5_HASH]"
)

# Publication-year cutoff compared against the "Year" column of search results.
min_year = 2004

def get_book_by_topic(topic_name, max_books=100, timeout=30):
    """Scrape one page of LibGen search results for a configured topic.

    Args:
        topic_name: Key into ``libgen_topics``; an unknown key raises ``KeyError``.
        max_books: Upper bound on the number of books collected from the page.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the notebook forever.

    Returns:
        dict mapping each book's relative detail URL (the ``href`` of the title
        link) to ``{"title": str, "author(s)": list[dict], "category": str}``.
        Each author entry is ``{"author_name", "author_url"}``. An empty dict is
        returned when the request does not answer HTTP 200 or when the results
        table is missing, so the result is always safe for ``dict.update``.

    Rows whose "Year" cell parses to an integer below ``min_year`` are skipped.
    Rows whose year cannot be parsed are kept, because their age is unknown.
    """
    topic = libgen_topics[topic_name]
    topic_category = topic["category"]
    url = search_url.replace("[SEARCH_VALUE]", topic["keyword"])

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')

    # The results live in the table with CSS class "c"; bail out if the page
    # layout is different (e.g. an error or maintenance page).
    table = soup.find('table', {'class': 'c'})
    if table is None:
        return {}

    rows = table.find_all('tr')
    if not rows:
        return {}

    # The first row holds the column captions used to interpret the other rows.
    headers = [cell.text.strip() for cell in rows[0].find_all('td')]

    data = {}
    for row in rows[1:]:
        if len(data) >= max_books:
            break

        authors = []
        book_url = None
        book_title = None
        too_old = False

        # zip() stops at the shorter sequence, so rows with more cells than
        # captions cannot index past the header list.
        for header, cell in zip(headers, row.find_all('td')):
            if header == "Title":
                a = cell.find('a')
                if a is not None and a.has_attr('href') and "book/index.php" in a['href']:
                    book_url = a['href']
                    book_title = a.text.strip()
            elif header == "Author(s)":
                for link in cell.find_all('a'):
                    if link.has_attr('href'):
                        authors.append({
                            "author_name": link.text.strip(),
                            "author_url": link['href']
                        })
            elif header == "Year":
                try:
                    too_old = int(cell.text.strip()) < min_year
                except ValueError:
                    # Blank or non-numeric year: cannot apply the cutoff.
                    too_old = False
                if too_old:
                    break

        # The title cell may precede the year cell, so the age check has to be
        # enforced here rather than relying on the inner loop's break.
        if book_url and not too_old:
            data[book_url] = {
                "title": book_title,
                "author(s)": authors,
                "category": topic_category
            }

    return data


# Names of the metadata fields collected per book. They mirror the keys that
# get_book_info() writes, plus "hasCategory", which is added by the caller.
columns = [
    "hasID",
    "hasTitle",
    "hasDescription",
    "hasNumberPage",
    "hasPublicYear",
    "hasExtension",
    "hasSize",
    "hasAuthor",
    "hasPublisher",
    "hasLanguage",
    "hasRate",
    "hasTags",
    "hasCategory",
]

def get_book_info(book_url, timeout=30):
    """Fetch a LibGen book detail page and extract its metadata.

    Args:
        book_url: Detail-page path relative to ``https://libgen.is/``.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the notebook forever.

    Returns:
        dict with the keys ``hasID``, ``hasTitle``, ``hasDescription``,
        ``hasNumberPage`` (int), ``hasPublicYear`` (int), ``hasExtension``,
        ``hasSize``, ``hasAuthor`` (list of str), ``hasPublisher``,
        ``hasLanguage``, ``hasRate`` and ``hasTags`` (list of str).
        An empty dict is returned when the page does not answer HTTP 200.
        ``hasCategory`` is not set here; the caller attaches it.

    Raises:
        AttributeError, TypeError or ValueError when the pages, year, size,
        author or topic field is missing from the page, or when pages/year is
        not an integer. Callers doing a best-effort crawl are expected to
        catch these and skip the book.
    """
    url = f"https://libgen.is/{book_url}"
    book_info = {}

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return book_info

    # Parse first so the helpers below close over an already-bound ``soup``.
    soup = BeautifulSoup(response.content, 'html.parser')

    def extract_field(field_name):
        """Return the stripped text of the cell following the label cell, or None."""
        # The pages label is matched by its "Pages" prefix only; every other
        # label is matched with its trailing colon.
        label = "Pages" if "Pages" in field_name else field_name + ":"
        label_cell = soup.find('td', string=lambda text: text and label in text)
        if label_cell is None:
            return None
        value_cell = label_cell.find_next_sibling('td')
        if value_cell is None:
            return None
        return value_cell.text.strip()

    def extract_description():
        """Return the description text with whitespace collapsed, or ""."""
        # The description sits in a <td colspan="4"> whose inline style
        # contains "padding: 25px".
        description_tag = soup.find(
            'td',
            attrs={'colspan': '4', 'style': lambda x: x and 'padding: 25px' in x}
        )
        return ' '.join(description_tag.text.split()) if description_tag else ""

    def get_list_author(author_text):
        """Split an author string on ';' and ',' into non-empty, stripped names."""
        return [
            name.strip()
            for group in author_text.split(";")
            for name in group.split(",")
            if name.strip()
        ]

    book_info['hasID'] = extract_field("ID")
    book_info['hasTitle'] = extract_field("Title")
    book_info['hasDescription'] = extract_description()
    # The pages value may hold two backslash-separated counts; keep the first.
    book_info['hasNumberPage'] = int(extract_field("Pages").split("\\")[0].strip())
    book_info['hasPublicYear'] = int(extract_field("Year"))
    book_info['hasExtension'] = extract_field("Extension")
    # Keep only the text before any parenthesised suffix of the size value.
    book_info['hasSize'] = extract_field("Size").split("(")[0].strip()
    book_info['hasAuthor'] = get_list_author(extract_field("Author(s)"))
    book_info['hasPublisher'] = extract_field("Publisher")
    book_info['hasLanguage'] = extract_field("Language")
    book_info['hasRate'] = round(random.uniform(3, 5), 2) # Fake rating
    # The topic value is a backslash-separated path; each non-empty part is a tag.
    topic = extract_field("Topic")
    book_info['hasTags'] = [e for e in topic.split("\\") if e]

    return book_info


In [4]:
# Relative book URL -> {"title", "author(s)", "category"} across all topics.
# A book listed under several topics keeps the entry of the last topic crawled,
# because dict.update() overwrites existing keys.
book_data = {}

for topic in tqdm(libgen_topics):
    # Guard against a None/empty result (e.g. a failed search request) so one
    # bad topic does not abort the whole crawl with a TypeError in update().
    topic_data = get_book_by_topic(topic) or {}
    book_data.update(topic_data)


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:34<00:00,  1.73s/it]


In [5]:
# One metadata dict per successfully scraped book.
book_info_list = []
# Detail URLs that could not be fetched or parsed; inspect after the run.
failed_book_urls = []

for book_url, data in tqdm(book_data.items()):
    try:
        book_info = get_book_info(book_url)
        book_info['hasCategory'] = data["category"]
        # Books whose language is Tamil are excluded from the dataset.
        if book_info["hasLanguage"].lower() == "tamil":
            continue
        book_info_list.append(book_info)
    except Exception:
        # Best-effort crawl: skip books whose page is unavailable or lacks a
        # required field. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt working, so this long loop can still be stopped.
        failed_book_urls.append(book_url)

100%|██████████| 1351/1351 [41:30<00:00,  1.84s/it]


In [6]:

# Build the frame from the scraped records and, for books sharing a title,
# keep only the last occurrence.
book_info_df = (
    pd.DataFrame(book_info_list)
    .drop_duplicates(subset='hasTitle', keep="last")
)

# Persist the de-duplicated dataset (the frame's index is written as well).
book_info_df.to_csv("book_data_enrich.csv")

In [7]:
# Preview the first rows of the de-duplicated frame as a quick sanity check.
book_info_df.head()

Unnamed: 0,hasID,hasTitle,hasDescription,hasNumberPage,hasPublicYear,hasExtension,hasSize,hasAuthor,hasPublisher,hasLanguage,hasRate,hasTags,hasCategory
0,4222673,Absolute Essentials of Ethereum,Absolute Essentials of Ethereum is a concise t...,108,2024,pdf,7 MB,[Paul Dylan-Ennis],Routledge,English,4.36,"[Economy, Mathematical Economics]",Math
1,3603445,Analysis and Linear Algebra: An Introduction f...,This elementary introduction was developed fro...,289,2023,pdf,10 MB,"[Thomas Holey, Armin Wiedemann]",Springer,English,3.14,"[Economy, Mathematical Economics]",Math
2,3642738,The Essentials of Financial Modeling in Excel:...,A concise and practical guide to financial mod...,286,2023,pdf,14 MB,[Michael Rees],Wiley,English,3.43,"[Economy, Mathematical Economics]",Math
3,3302220,Microeconomía heterodoxa: Modelos sraffianos y...,Reseña:Esta obra está pensada para servir de c...,184,2022,pdf,1006 kB,[Gabriel Montes Rojas],EUDEBA,Spanish,3.3,"[Economy, Mathematical Economics]",Math
4,3355539,Statistics for Business and Economics: Compend...,This 2nd edition compendium contains and expla...,324,2022,pdf,3 MB,[Franz W. Peren],Springer,English,3.11,"[Economy, Mathematical Economics]",Math
