In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import os

# Shared Faker instance used by every data-generating cell below.
fake = Faker()

# Seed all three random sources (Faker, stdlib random, numpy) with the same
# value so the pseudo-random draws are repeatable from a fresh kernel.
for seed_fn in (Faker.seed, random.seed, np.random.seed):
    seed_fn(40)

# Every CSV produced by this notebook is written into this directory;
# exist_ok=True makes re-running the cell harmless.
output_dir = "data_dummy"
os.makedirs(output_dir, exist_ok=True)


In [2]:
# --- users.csv: 1000 synthetic users -------------------------------------
domains = ["gmail.com", "yahoo.com", "rocket.com", "outlook.com", "starting.com", "hotmail.com"]
names = [fake.name() for _ in range(1000)]

# Email local part = lower-cased first token of the name plus a running
# number (starting at 10) so that addresses stay unique; the domain is
# picked at random for each user.
emails = []
for suffix, full_name in enumerate(names, start=10):
    first_token = full_name.split()[0].lower()
    emails.append(f"{first_token}_{suffix}@{random.choice(domains)}")

# Registration timestamps fall somewhere in the last two years.
registered_at = [
    fake.date_time_between(start_date='-2y', end_date='now')
    for _ in range(1000)
]

users = pd.DataFrame({
    "user_id": range(1, 1001),
    "name": names,
    "email": emails,
    "registered_at": registered_at,
})
users.to_csv(f"{output_dir}/users.csv", index=False)

In [3]:
# --- libraries.csv: 10 library branches ----------------------------------
# Ids are "lib-01" .. "lib-10"; names and addresses come from Faker.
library_ids = ["lib-" + str(n).zfill(2) for n in range(1, 11)]
library_names = [fake.city() + " Library" for _ in range(10)]
# Faker addresses are multi-line; flatten them to one comma-separated line.
library_addresses = [", ".join(fake.address().split("\n")) for _ in range(10)]

libraries = pd.DataFrame({
    "library_id": library_ids,
    "name": library_names,
    "address": library_addresses,
})
libraries.to_csv(f"{output_dir}/libraries.csv", index=False)


In [4]:
# --- categories.csv: book genres -----------------------------------------
# Each genre must appear exactly once: the list previously repeated
# 'History' and 'Biography', which produced two different category ids
# for the same name.
category_names = ['Self-Improvement', 'Biography', 'Fantasy', 'Romance',
                  'Science Fiction', 'Crime', 'Horror', 'Arcade', 'Suspense',
                  'History', 'True Crime', 'Psychology', 'Travel',
                  'Self-Help', 'Memoir', 'Dystopian']

# Guard against duplicates creeping back in when the list is edited.
assert len(category_names) == len(set(category_names)), "duplicate category name"

# Ids are "gnr-01", "gnr-02", ... in list order.
categories = pd.DataFrame({
    "category_id": [f"gnr-{str(i).zfill(2)}" for i in range(1, len(category_names) + 1)],
    "name": category_names
})
categories.to_csv(f"{output_dir}/categories.csv", index=False)


In [5]:
# --- books.csv: 500 synthetic books --------------------------------------
n_books = 500
# Pad ids to the width of the largest number so every key has the same
# length ("bk-001" .. "bk-500"). A fixed width of 2 gave mixed-width ids
# ("bk-01" .. "bk-500") that do not sort correctly as strings.
book_id_width = len(str(n_books))

books = pd.DataFrame({
    "book_id": [f"bk-{str(i).zfill(book_id_width)}" for i in range(1, n_books + 1)],
    # Faker sentences end with a period; strip it so the title reads cleanly.
    "title": [fake.sentence(nb_words=5).rstrip('.') for _ in range(n_books)],
    "author": [fake.name() for _ in range(n_books)],
    # Each book is assigned one existing category id at random.
    "category_id": np.random.choice(categories['category_id'], size=n_books)
})
books.to_csv(f"{output_dir}/books.csv", index=False)


In [6]:
# --- library_books.csv: which library stocks which book -------------------
# Every book is stocked by 1-3 distinct libraries, each holding 1-10 copies.
inventory_rows = []
for book_id in books['book_id']:
    branch_count = random.randint(1, 3)
    # replace=False ensures a book is not assigned to the same library twice.
    stocking_libraries = np.random.choice(
        libraries['library_id'], size=branch_count, replace=False
    )
    inventory_rows.extend(
        {
            "library_id": library_id,
            "book_id": book_id,
            "quantity": random.randint(1, 10),
        }
        for library_id in stocking_libraries
    )

library_books = pd.DataFrame(inventory_rows)
library_books.to_csv(f"{output_dir}/library_books.csv", index=False)


In [None]:
# --- loans.csv: 1000 synthetic loans --------------------------------------
# Only (library_id, book_id) combinations present in library_books may be
# loaned. valid_pairs is also reused by the holds cell further down.
valid_pairs = library_books[['library_id', 'book_id']].values.tolist()

# Reference date used to reject return dates that have not happened yet.
today = datetime.now().date()

loan_records = []
for i in range(1, 1001):
    user_id = random.randint(1, 1000)
    library_id, book_id = random.choice(valid_pairs)
    loan_date = fake.date_between(start_date='-1y', end_date='today')
    due_date = loan_date + timedelta(days=14)  # fixed two-week loan period

    # Default: the book is still out.
    return_date = None
    status = 'active'

    # Roughly 70% of loans are returned 1-14 days after being borrowed.
    return_chance = random.random()
    if return_chance < 0.7:
        candidate_return = loan_date + timedelta(days=random.randint(1, 14))
        # A loan taken out within the last two weeks could otherwise be
        # marked 'returned' with a return date in the future; such a loan
        # stays 'active' instead.
        if candidate_return <= today:
            return_date = candidate_return
            status = 'returned'

    loan_records.append({
        "loan_id": i,
        "user_id": user_id,
        "book_id": book_id,
        "library_id": library_id,
        "loan_date": loan_date,
        "due_date": due_date,
        "return_date": return_date,
        "status": status
    })

loans = pd.DataFrame(loan_records)
loans.to_csv(f"{output_dir}/loans.csv", index=False)


In [8]:
# --- holds.csv: 1000 synthetic hold requests ------------------------------
# Relies on valid_pairs from the loans cell, so a hold is only placed on a
# book at a library that stocks it.
hold_rows = []
for hold_id in range(1, 1001):
    requester_id = random.randint(1, 1000)
    library_id, book_id = random.choice(valid_pairs)
    placed_on = fake.date_between(start_date='-6mo', end_date='today')

    hold_rows.append({
        "hold_id": hold_id,
        "user_id": requester_id,
        "book_id": book_id,
        "library_id": library_id,
        "hold_date": placed_on,
        # A hold lapses one week after it was placed.
        "expire_date": placed_on + timedelta(days=7),
        "queue_position": random.randint(1, 5),
        "fulfilled": random.choice([True, False]),
    })

holds = pd.DataFrame(hold_rows)
holds.to_csv(f"{output_dir}/holds.csv", index=False)
