### <font color='#4285f4'>Cymbal Consumer Finance synthetic data generation</font>

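This notebook generates synthetic data for the Cymbal Consumer Finance dataset: it creates the BigQuery tables, generates customers, marketing costs, loan applications (with Gemini-written descriptions) and loan repayments, loads them into BigQuery, and renders each loan application as a PDF that is uploaded to Cloud Storage.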

### <font color='#4285f4'>package installation</font>

In [None]:
# PIP Installs (if necessary)
# No packages are installed by this cell as written; it only loads the BigQuery
# IPython extension, which provides the %%bigquery cell magic used by the
# table-creation cell later in the notebook.
%load_ext google.cloud.bigquery




In [1]:
# Standard library imports
import os
import random
import sys
import uuid
from datetime import datetime, timedelta

# Third party imports
import numpy as np
import pandas as pd
from faker import Faker
from tqdm import tqdm

# Google Cloud imports
from google.cloud import bigquery
from google.cloud import storage
import google.cloud.bigquery as bigquery
import vertexai
from vertexai.generative_models import GenerativeModel

# PDF generation imports
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas

### <font color='#4285f4'>Initialize</font>

In [None]:
# Deployment settings: replace every "<TODO_DEVELOPER>" placeholder before running.
# In the code visible in this notebook, `location` is passed to vertexai.init and
# `gcs_bucket` is the bucket the generated PDFs are uploaded to; `bigquery_location`,
# `region` and `bigquery_dataset` are defined here but not referenced by the cells shown.
bigquery_location = "<TODO_DEVELOPER>"
region = "<TODO_DEVELOPER>"
location = "<TODO_DEVELOPER>"
gcs_bucket = "<TODO_DEVELOPER>"
bigquery_dataset = "<TODO_DEVELOPER>"

# Capture the current local date and time
now = datetime.now()

# Minute-resolution timestamp string (YYYY-MM-DD-HH-MM); not used by the cells shown here
formatted_date = now.strftime("%Y-%m-%d-%H-%M")

# Ask gcloud for the configured project and the active account.
# The IPython `!` syntax captures the command output as a list of lines.
project_id = !(gcloud config get-value project)
user = !(gcloud auth list --filter=status:ACTIVE --format="value(account)")

# Exactly one output line is expected; anything else (no output, or extra
# lines) is treated as "not set" and aborts the cell.
if len(project_id) != 1:
  raise RuntimeError(f"project_id is not set: {project_id}")
project_id = project_id[0]

# Same single-line check for the active account.
if len(user) != 1:
  raise RuntimeError(f"user is not set: {user}")
user = user[0]

print(f"project_id = {project_id}")
print(f"user = {user}")

In [3]:
# Initialize Vertex AI
def init_vertex_ai(model_name="gemini-1.5-pro-002"):
    """Initialise the Vertex AI SDK and return a generative model handle.

    Uses the notebook-level ``project_id`` and ``location`` globals set in the
    Initialize cell.

    Args:
        model_name: Identifier of the Gemini model to instantiate. Defaults to
            the version this notebook was written against.
            NOTE(review): model versions are retired over time - confirm the
            default is still available in your project/location.

    Returns:
        A ``GenerativeModel`` bound to ``model_name``.
    """
    vertexai.init(project=project_id, location=location)
    return GenerativeModel(model_name)

In [None]:
%%bigquery

-- Schema setup for the Cymbal Consumer Finance demo dataset.
-- Every table is dropped and recreated, so running this cell discards any
-- previously loaded rows.
-- NOTE(review): the dataset name is hard-coded here even though the Initialize
-- cell defines a `bigquery_dataset` setting.

-- Customers: one row per customer; life_event is nullable.
DROP TABLE IF EXISTS cymbal_consumer_finance_ds.customers;
CREATE TABLE IF NOT EXISTS cymbal_consumer_finance_ds.customers (
    customer_id STRING NOT NULL,
    first_name STRING,
    last_name STRING,
    date_of_birth DATE,
    email STRING,
    phone_number STRING,
    creation_date DATE NOT NULL,
    life_event STRING
);

-- Loan applications: approval/disbursement dates and description are nullable.
DROP TABLE IF EXISTS cymbal_consumer_finance_ds.loan_applications;
CREATE TABLE IF NOT EXISTS cymbal_consumer_finance_ds.loan_applications (
    application_id STRING NOT NULL,
    customer_id STRING NOT NULL,
    application_date DATE NOT NULL,
    product_type STRING NOT NULL,
    sub_product STRING NOT NULL,  
    loan_amount NUMERIC NOT NULL,
    description STRING,  
    application_status STRING NOT NULL,
    approval_date DATE,
    disbursement_date DATE,
    application_channel STRING,
    marketing_cost NUMERIC NOT NULL
);

-- Loan repayments: loan_id refers to loan_applications.application_id (see fk_loan below).
DROP TABLE IF EXISTS cymbal_consumer_finance_ds.loan_repayments;
CREATE TABLE IF NOT EXISTS cymbal_consumer_finance_ds.loan_repayments (
    repayment_id STRING NOT NULL,
    loan_id STRING NOT NULL,
    repayment_date DATE NOT NULL,
    amount_due NUMERIC NOT NULL,
    amount_paid NUMERIC NOT NULL,
    payment_status STRING NOT NULL,
    days_past_due INT64
);



-- Marketing costs: one cost_per_lead per (channel, product_type) combination.
DROP TABLE IF EXISTS cymbal_consumer_finance_ds.marketing_costs;
CREATE TABLE IF NOT EXISTS cymbal_consumer_finance_ds.marketing_costs (
    cost_id STRING NOT NULL,
    channel STRING NOT NULL,
    product_type STRING NOT NULL,
    cost_per_lead NUMERIC NOT NULL
);

-- Key constraints are declared NOT ENFORCED: they describe the relationships
-- between tables but are not validated when data is loaded.
ALTER TABLE cymbal_consumer_finance_ds.loan_applications ADD PRIMARY KEY(application_id) NOT ENFORCED;
ALTER TABLE cymbal_consumer_finance_ds.loan_repayments ADD PRIMARY KEY(repayment_id) NOT ENFORCED;
ALTER TABLE cymbal_consumer_finance_ds.customers ADD PRIMARY KEY(customer_id) NOT ENFORCED;
ALTER TABLE cymbal_consumer_finance_ds.marketing_costs ADD PRIMARY KEY(cost_id) NOT ENFORCED;

ALTER TABLE cymbal_consumer_finance_ds.loan_applications ADD CONSTRAINT fk_customer
    FOREIGN KEY(customer_id) REFERENCES cymbal_consumer_finance_ds.customers(customer_id) NOT ENFORCED;

ALTER TABLE cymbal_consumer_finance_ds.loan_repayments ADD CONSTRAINT fk_loan
    FOREIGN KEY(loan_id) REFERENCES cymbal_consumer_finance_ds.loan_applications(application_id) NOT ENFORCED;

-- Object table over the generated loan-application PDFs in Cloud Storage.
-- The URI ends in /* so that every object under the loan_applications_documents/
-- prefix is included (the PDFs are uploaded as
-- loan_applications_documents/loan_application_<id>.pdf).
-- NOTE(review): the connection and the bucket below are hard-coded to one
-- specific project. Replace them with your own connection and with the bucket
-- configured as `gcs_bucket` in the Initialize cell before running this cell.
DROP TABLE IF EXISTS cymbal_consumer_finance_ds.loan_applications_documents;
CREATE OR REPLACE EXTERNAL TABLE cymbal_consumer_finance_ds.loan_applications_documents
WITH CONNECTION `velascoluis-dev-sandbox.us-central1.data_access`
OPTIONS(
  object_metadata = 'SIMPLE',
  uris = ['gs://velascoluis-dev-sandbox-bucket/loan_applications_documents/*']);

In [None]:
# Initialize Faker
fake = Faker()

# Seed every pseudo-random source the generators draw from, not just NumPy:
# Faker produces names/dates/emails/phones and the stdlib `random` module picks
# the handwriting font in the PDFs. UUIDs (uuid4) and the model-written loan
# descriptions are not affected by these seeds and still differ between runs.
Faker.seed(42)
random.seed(42)
np.random.seed(42)


# Generate customers data
def generate_customers(num_records=10000):
    """Build a DataFrame of synthetic customers.

    Each row gets a fresh UUID, Faker-generated personal details (age between
    18 and 90, account created within the last five years) and a life event
    drawn from a fixed list in which ``None`` means "no life event".

    Args:
        num_records: Number of customer rows to generate.

    Returns:
        A pandas DataFrame with one row per customer.
    """
    named_events = [
        "Marriage", "New Job", "Relocation", "Retirement", "New Child",
        "Graduation", "Home Purchase", "Starting Business", "Divorce",
        "Career Change", "Empty Nest", "Medical Event", "Inheritance",
    ]
    life_events = named_events + [None]
    # 13 named events at 0.07 each plus 0.09 for "no event" sums to 1.0
    probabilities = [0.07] * 13 + [0.09]

    def build_customer():
        # Field order matters: it fixes the order in which random values are drawn.
        return {
            "customer_id": str(uuid.uuid4()),
            "first_name": fake.first_name(),
            "last_name": fake.last_name(),
            "date_of_birth": fake.date_of_birth(minimum_age=18, maximum_age=90),
            "email": fake.email(),
            "phone_number": fake.phone_number(),
            "creation_date": fake.date_between(start_date="-5y"),
            "life_event": np.random.choice(life_events, p=probabilities),
        }

    rows = [
        build_customer()
        for _ in tqdm(range(num_records), desc="Generating customers")
    ]
    return pd.DataFrame(rows)


# Generate marketing costs data
def generate_marketing_costs():
    """Build one marketing-cost row for every (channel, product type) pair.

    Returns:
        A pandas DataFrame with columns ``cost_id``, ``channel``,
        ``product_type`` and ``cost_per_lead`` (uniform in [50, 500), rounded
        to two decimals).
    """
    channels = ["Social Media", "Email", "Search", "Display", "Partner"]
    product_types = [
        "Personal Loan",
        "Auto Loan",
        "Home Loan",
        "Credit Card",
        "Business Loan",
        "Student Loan",
        "Debt Consolidation Loan",
        "Home Equity Loan",
    ]

    # The progress bar tracks channels; products are the inner iteration.
    rows = [
        {
            "cost_id": str(uuid.uuid4()),
            "channel": channel,
            "product_type": product,
            "cost_per_lead": round(np.random.uniform(50, 500), 2),
        }
        for channel in tqdm(channels, desc="Generating marketing costs")
        for product in product_types
    ]
    return pd.DataFrame(rows)


def generate_loan_description(model, product_type, sub_product, amount):
    """Ask the generative model for a short customer-style loan description.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.
        product_type: Loan product name inserted into the prompt.
        sub_product: Sub-product name inserted into the prompt.
        amount: Requested amount; rendered as dollars with two decimals.

    Returns:
        The stripped model response, or a fixed one-sentence fallback when the
        model call (or reading its text) raises.
    """
    formatted_amount = f"${amount:,.2f}"
    prompt = f"""
    Generate a brief, natural description (2-3 sentences) for a loan application with these details:
    - Product Type: {product_type}
    - Sub-product: {sub_product}
    - Amount: {formatted_amount}
    
    The description should explain the purpose and context of the loan request from the customer's perspective.
    Keep it professional but conversational.
    """

    try:
        description = model.generate_content(prompt).text.strip()
    except Exception as exc:
        # Best effort: report the failure and fall back to a templated sentence.
        print(f"Error generating description: {exc}")
        description = (
            f"Application for {sub_product} {product_type} of {formatted_amount}."
        )
    return description


# Generate loan applications
def generate_loan_applications(customers_df, marketing_costs_df, num_applications=500):
    """Build a DataFrame of synthetic loan applications.

    Product types and channels are taken from ``marketing_costs_df``; each
    application is assigned to a random customer from ``customers_df`` and
    gets a model-written description (one model call per application).

    Args:
        customers_df: Frame with a ``customer_id`` column to sample from.
        marketing_costs_df: Frame with ``product_type`` and ``channel`` columns.
        num_applications: Number of applications to generate.

    Returns:
        A DataFrame with the loan_applications columns in schema order. The
        columns are present even when ``num_applications`` is 0.

    Raises:
        KeyError: If ``marketing_costs_df`` contains a product type that has
            no entry in the sub-product mapping below.
    """
    columns = [
        "application_id",
        "customer_id",
        "application_date",
        "product_type",
        "sub_product",
        "loan_amount",
        "description",
        "application_status",
        "approval_date",
        "disbursement_date",
        "application_channel",
        "marketing_cost",
    ]

    product_types = marketing_costs_df["product_type"].unique()
    channels = marketing_costs_df["channel"].unique()
    statuses = ["Approved", "Rejected", "Pending"]
    # Convert once instead of re-converting the whole Series on every iteration.
    customer_ids = customers_df["customer_id"].to_numpy()

    # Define sub-product mapping
    sub_products = {
        "Personal Loan": [
            "Debt Consolidation",
            "Home Improvement",
            "Medical Expenses",
            "Wedding",
            "Vacation",
        ],
        "Auto Loan": ["New Vehicle", "Used Vehicle", "Refinance", "Lease Buyout"],
        "Home Loan": ["Purchase", "Refinance", "Construction", "FHA", "VA"],
        "Credit Card": ["Rewards", "Cash Back", "Travel", "Business", "Student"],
        "Business Loan": [
            "Working Capital",
            "Equipment",
            "Expansion",
            "Inventory",
            "Startup",
        ],
        "Student Loan": ["Undergraduate", "Graduate", "Professional", "Refinance"],
        "Debt Consolidation Loan": [
            "Credit Card",
            "Personal Loans",
            "Medical Debt",
            "Mixed Debt",
        ],
        "Home Equity Loan": ["Fixed Rate", "HELOC", "Cash Out", "Home Improvement"],
    }

    model = init_vertex_ai()
    applications = []
    for _ in tqdm(range(num_applications), desc="Generating loan applications"):
        product_type = np.random.choice(product_types)
        sub_product = np.random.choice(sub_products[product_type])
        status = np.random.choice(statuses, p=[0.7, 0.2, 0.1])
        application_date = fake.date_between(start_date="-1y")
        loan_amount = round(np.random.uniform(1000, 100000), 2)
        description = generate_loan_description(
            model, product_type, sub_product, loan_amount
        )
        is_approved = status == "Approved"

        # Key order below also fixes the order in which random values are drawn.
        applications.append(
            {
                "application_id": str(uuid.uuid4()),
                "customer_id": np.random.choice(customer_ids),
                "application_date": application_date,
                "product_type": product_type,
                "sub_product": sub_product,
                "loan_amount": loan_amount,
                "description": description,
                "application_status": status,
                # Only approved loans get an approval date (1-29 days after
                # applying) and a disbursement date (31-44 days after applying).
                "approval_date": (
                    application_date + timedelta(days=np.random.randint(1, 30))
                    if is_approved
                    else None
                ),
                "disbursement_date": (
                    application_date + timedelta(days=np.random.randint(31, 45))
                    if is_approved
                    else None
                ),
                "application_channel": np.random.choice(channels),
                # NOTE(review): drawn independently of the cost_per_lead values
                # in marketing_costs_df - confirm that is intended.
                "marketing_cost": round(np.random.uniform(50, 500), 2),
            }
        )

    # Passing `columns` keeps the schema even for an empty result; indexing an
    # empty frame with the column list would raise KeyError.
    return pd.DataFrame(applications, columns=columns)


# Generate loan repayments
def generate_loan_repayments(loan_applications_df):
    """Build a 12-instalment repayment schedule for every approved loan.

    Each instalment is one twelfth of the loan amount, due at 30-day intervals
    after the disbursement date, and is randomly marked Paid (80%), Late (15%)
    or Defaulted (5%).

    Args:
        loan_applications_df: Frame with ``application_status``,
            ``application_id``, ``loan_amount`` and ``disbursement_date``.

    Returns:
        A DataFrame with the loan_repayments columns. The columns are present
        even when no loan is approved, so downstream column access keeps working.
    """
    columns = [
        "repayment_id",
        "loan_id",
        "repayment_date",
        "amount_due",
        "amount_paid",
        "payment_status",
        "days_past_due",
    ]
    repayments = []

    # Filter only approved loans
    approved_loans = loan_applications_df[
        loan_applications_df["application_status"] == "Approved"
    ]

    for _, loan in tqdm(
        approved_loans.iterrows(),
        desc="Generating loan repayments",
        total=len(approved_loans),
    ):
        # The instalment amount is the same for all 12 payments of a loan.
        amount_due = round(loan["loan_amount"] / 12, 2)

        # Generate 12 monthly payments for each loan
        for i in range(12):
            payment_date = loan["disbursement_date"] + timedelta(days=(i + 1) * 30)

            # Simulate different payment behaviors
            payment_status = np.random.choice(
                ["Paid", "Late", "Defaulted"], p=[0.8, 0.15, 0.05]
            )
            if payment_status == "Paid":
                amount_paid = amount_due
                days_past_due = 0
            elif payment_status == "Late":
                # Late payers settle either in full or half of the instalment.
                amount_paid = (
                    amount_due
                    if np.random.random() > 0.5
                    else round(amount_due * 0.5, 2)
                )
                days_past_due = np.random.randint(1, 30)
            else:  # Defaulted
                amount_paid = 0
                days_past_due = np.random.randint(30, 90)

            repayments.append(
                {
                    "repayment_id": str(uuid.uuid4()),
                    "loan_id": loan["application_id"],
                    "repayment_date": payment_date,
                    "amount_due": amount_due,
                    "amount_paid": amount_paid,
                    "payment_status": payment_status,
                    "days_past_due": days_past_due,
                }
            )

    return pd.DataFrame(repayments, columns=columns)


print("Generating data...")
customers_df = generate_customers()
marketing_costs_df = generate_marketing_costs()
loan_applications_df = generate_loan_applications(customers_df, marketing_costs_df)
loan_repayments_df = generate_loan_repayments(loan_applications_df)

print("Converting dates...")
# Before saving, convert date columns to datetime (missing dates become NaT,
# which is written to the CSV as an empty field).
date_columns_by_frame = [
    (customers_df, ["date_of_birth", "creation_date"]),
    (
        loan_applications_df,
        ["application_date", "approval_date", "disbursement_date"],
    ),
    (loan_repayments_df, ["repayment_date"]),
]
for frame, date_columns in date_columns_by_frame:
    for date_column in date_columns:
        frame[date_column] = pd.to_datetime(frame[date_column])

# (label used in progress messages, BigQuery table / CSV base name, data)
tables_to_load = [
    ("customers", "customers", customers_df),
    ("marketing costs", "marketing_costs", marketing_costs_df),
    ("loan applications", "loan_applications", loan_applications_df),
    ("loan repayments", "loan_repayments", loan_repayments_df),
]

# Save dataframes to CSV
print("Saving data to CSV...")
for label, table_name, frame in tables_to_load:
    frame.to_csv(f"{table_name}.csv", index=False)

# Initialize BigQuery client
client = bigquery.Client(project=project_id)

# Load CSV files to BigQuery
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    # Model-written descriptions can contain line breaks inside a quoted CSV
    # field; without this option such rows fail to parse during the load.
    allow_quoted_newlines=True,
)

print("Loading data to BigQuery...")
for label, table_name, frame in tables_to_load:
    print(f"Loading {label}...")
    with open(f"{table_name}.csv", "rb") as source_file:
        job = client.load_table_from_file(
            source_file,
            f"{project_id}.cymbal_consumer_finance_ds.{table_name}",
            job_config=job_config,
        )
        # Block until the load finishes so failures surface in this cell.
        job.result()

In [None]:
def _get_data():
    """Fetch every loan application joined with its customer's contact details.

    Reads from the ``loan_applications`` and ``customers`` tables of the
    ``cymbal_consumer_finance_ds`` dataset in ``project_id``.

    Returns:
        A pandas DataFrame with one row per application that has a matching
        customer (inner join on ``customer_id``).
    """
    # Run the query in the same project the tables were loaded into, matching
    # the client used for the load jobs.
    client = bigquery.Client(project=project_id)
    sql = f"""
           SELECT 
             a.application_id,
             a.customer_id,
             c.first_name,
             c.last_name,
             c.email,
             c.phone_number,
             a.application_date,
             a.product_type,
             a.sub_product,
             a.loan_amount,
             a.description,
             a.application_status,
             a.approval_date,
             a.disbursement_date,
             a.application_channel,
             a.marketing_cost
           FROM `{project_id}.cymbal_consumer_finance_ds.loan_applications` a
           JOIN `{project_id}.cymbal_consumer_finance_ds.customers` c
           ON a.customer_id = c.customer_id
        """
    df = client.query_and_wait(sql).to_dataframe()
    return df


def download_fonts():
    """Download the handwriting fonts used by the PDF forms into ``fonts/``.

    Fonts that already exist on disk are skipped. The files are fetched from
    the Google Fonts GitHub repositories.

    Raises:
        requests.HTTPError: If a download returns an error status. Nothing is
            written in that case, so a failed download is retried on the next
            call instead of leaving an invalid ``.ttf`` file behind.
    """
    import requests
    import os

    # Create fonts directory if it doesn't exist
    os.makedirs("fonts", exist_ok=True)

    # Font URLs from Google Fonts
    font_urls = {
        "Caveat-Regular.ttf": "https://github.com/googlefonts/caveat/raw/main/fonts/ttf/Caveat-Regular.ttf",
        "HomemadeApple-Regular.ttf": "https://github.com/google/fonts/raw/main/apache/homemadeapple/HomemadeApple-Regular.ttf",
        "Kalam-Regular.ttf": "https://github.com/google/fonts/raw/main/ofl/kalam/Kalam-Regular.ttf",
        "Satisfy-Regular.ttf": "https://github.com/google/fonts/raw/main/apache/satisfy/Satisfy-Regular.ttf",
    }

    for font_name, url in font_urls.items():
        font_path = f"fonts/{font_name}"
        if os.path.exists(font_path):
            continue

        print(f"Downloading {font_name}...")
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        # Check the status before touching the file system so an error page is
        # never saved under the font's name.
        response.raise_for_status()
        with open(font_path, "wb") as f:
            f.write(response.content)
        print(f"Downloaded {font_name}")


def gen_pdfs(keep_local=True):
    """Render one PDF application form per loan application and upload it to GCS.

    Downloads the handwriting fonts if needed, reads the applications joined
    with customer details from BigQuery, draws a single-page form for each row
    and uploads it to ``gcs_bucket`` under ``loan_applications_documents/``.

    Args:
        keep_local: When True (the default, matching the previous behavior)
            the generated PDF files stay in the current working directory
            after upload. Pass False to delete each file once it is uploaded.
    """
    download_fonts()
    df = _get_data()
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcs_bucket)

    # Register handwriting fonts
    handwriting_fonts = [
        "Caveat-Regular",
        "HomemadeApple-Regular",
        "Kalam-Regular",
        "Satisfy-Regular",
    ]
    for font_name in handwriting_fonts:
        font_path = f"fonts/{font_name}.ttf"
        pdfmetrics.registerFont(TTFont(font_name, font_path))

    # Layout constants, in PDF points.
    left_margin = 72  # x of labels, headings and input boxes
    text_indent = 82  # x of the "handwritten" values
    max_text_width = 400  # input box width and wrap width for free text
    line_height = 20  # vertical step between wrapped description lines

    def use_handwriting_font(c):
        # Random font and size per value so the form looks filled in by hand.
        c.setFont(random.choice(handwriting_fonts), random.uniform(11, 13))

    def draw_section_heading(c, title, y_pos):
        # Bold section title; returns the y position of the first field below it.
        c.setFont("Helvetica-Bold", 14)
        c.drawString(left_margin, y_pos, title)
        c.setFont("Helvetica", 10)
        return y_pos - 30

    def draw_field(c, label, value, y_pos):
        # Label, an empty input box under it, and the value written in the box.
        c.setFont("Helvetica", 10)
        c.drawString(left_margin, y_pos, label)

        # Draw input box
        c.rect(left_margin, y_pos - 25, max_text_width, 20)

        # Draw value in handwriting font
        use_handwriting_font(c)
        c.drawString(text_indent, y_pos - 20, str(value))

        return y_pos - 40

    def draw_wrapped_text(c, text, y_pos):
        # Greedy word wrap using the canvas's current font; returns the y
        # position of the last line drawn (or y_pos when there is no text).
        line = []
        for word in text.split():
            candidate = line + [word]
            # `line` must be non-empty before breaking, so a single word wider
            # than the limit is drawn on its own line instead of being
            # preceded by an empty line.
            if line and c.stringWidth(" ".join(candidate)) > max_text_width:
                c.drawString(text_indent, y_pos, " ".join(line))
                y_pos -= line_height
                line = [word]
            else:
                line = candidate
        if line:
            c.drawString(text_indent, y_pos, " ".join(line))
        return y_pos

    for _, row in tqdm(df.iterrows(), desc="Generating PDFs", total=len(df)):
        file_name = f"loan_application_{row['application_id']}.pdf"
        c = canvas.Canvas(file_name, pagesize=letter)

        # Header
        c.setFont("Helvetica", 12)
        c.drawString(left_margin, 770, "Cymbal Consumer Finance")
        c.setFont("Helvetica-Bold", 24)
        c.drawString(left_margin, 750, "LOAN APPLICATION FORM")

        y = 700

        # Personal Information Section
        y = draw_section_heading(c, "Personal Information", y)
        y = draw_field(c, "Full Name:", f"{row['first_name']} {row['last_name']}", y)
        y = draw_field(c, "Email:", row["email"], y)
        y = draw_field(c, "Phone Number:", row["phone_number"], y)

        # Application Details Section
        y -= 10
        y = draw_section_heading(c, "Application Details", y)
        y = draw_field(c, "Application Date:", str(row["application_date"]), y)
        y = draw_field(c, "Product Type:", row["product_type"], y)
        y = draw_field(c, "Sub-product:", row["sub_product"], y)
        y = draw_field(c, "Requested Amount:", f"${row['loan_amount']:,.2f}", y)

        # Description Section
        y -= 10
        c.setFont("Helvetica-Bold", 12)
        c.drawString(left_margin, y, "Loan Description:")
        y -= 20

        # The description column is nullable; treat a missing value as empty
        # text instead of failing on .split().
        description = row["description"]
        description_text = "" if pd.isna(description) else str(description)
        use_handwriting_font(c)
        last_line_y = draw_wrapped_text(c, description_text, y)

        y = last_line_y - 30  # Add padding after description

        y = draw_field(c, "Application Channel:", row["application_channel"], y)

        # Draw border
        c.rect(50, 50, 512, 742)

        # Footer, centered horizontally inside the border
        c.setFont("Helvetica", 8)
        text = "INTERNAL USE ONLY - CONFIDENTIAL"
        text_width = c.stringWidth(text)
        border_width = 512
        x = 50 + (border_width - text_width) / 2
        c.drawString(x, 70, text)

        c.save()

        # Upload to GCS
        blob = bucket.blob(f"loan_applications_documents/{file_name}")
        blob.upload_from_filename(file_name)
        if not keep_local:
            os.remove(file_name)


gen_pdfs()