I am using this code to generate a synthetic dataset of 1,000 candidate records for my project.
(Dated: 23/07/2025)

In [9]:
import random
from datetime import date, timedelta

import pandas as pd
from faker import Faker

# Create the Faker instance, then seed both Faker and the stdlib `random`
# module with the same fixed value so repeated runs draw the same sequence.
fake = Faker()
Faker.seed(42)
random.seed(42)

# --- Dataset parameters ---------------------------------------------------
# Number of candidate rows to generate.
n_candidates = 1000

# Categorical pools; one value per candidate is picked from each.
roles = [
    'Analyst',
    'Engineer',
    'Designer',
    'Manager',
]
locations = [
    'Hyderabad',
    'New York',
    'London',
    'Singapore',
]
sources = [
    'Referral',
    'Job Portal',
    'Campus',
    'Internal',
]

# Recruitment pipeline stages, listed in the order a candidate moves
# through them.
statuses = [
    'Applied',
    'Shortlisted',
    'Interviewed',
    'Offered',
    'Accepted',
    'Joined',
]

# Gender categories and their sampling weights (paired by position).
genders = ['Male', 'Female', 'Non-Binary']
gender_weights = [0.45, 0.45, 0.10]

# Helper: pick how far along the pipeline a simulated candidate got.
def simulate_status():
    """Return the leading pipeline stages a simulated candidate went through.

    A stage count is drawn from 3, 4, 5 or 6 with weights 0.2, 0.3, 0.3 and
    0.2, and that many leading entries of the module-level ``statuses`` list
    are returned. The last element of the result is therefore the stage the
    candidate ended on; because the smallest count is 3, the result always
    contains at least the first three stages.

    Returns:
        A new list holding the first 3 to 6 entries of ``statuses``.
    """
    stage_counts = [3, 4, 5, 6]
    dropout_weights = [0.2, 0.3, 0.3, 0.2]  # dropout distribution
    # A single weighted draw; unpacking the one-element result list.
    (n_stages,) = random.choices(stage_counts, weights=dropout_weights, k=1)
    return statuses[:n_stages]

# Generate data
#
# Faker resolves relative offsets such as '-90d' against the current date, so
# seeding alone does not make Applied_Date (or the offer/join dates derived
# from it) repeatable from one day to the next. The application window is
# therefore anchored to a fixed reference date -- the date noted in the file
# header -- and passed to Faker as explicit date objects.
REFERENCE_DATE = date(2025, 7, 23)
applied_window_start = REFERENCE_DATE - timedelta(days=90)
applied_window_end = REFERENCE_DATE - timedelta(days=60)

data = []
for _ in range(n_candidates):
    # NOTE: the order of the random draws below is part of the seeded
    # sequence; reordering them changes every generated value.
    candidate_id = fake.uuid4()
    role = random.choice(roles)
    location = random.choice(locations)
    source = random.choice(sources)
    gender = random.choices(genders, weights=gender_weights, k=1)[0]
    age = random.randint(22, 45)

    # Application date: uniform within the 30-day window ending 60 days
    # before the reference date.
    applied_date = fake.date_between(
        start_date=applied_window_start,
        end_date=applied_window_end,
    )

    # Stages the candidate passed through; the last one is where they stand.
    status_flow = simulate_status()
    current_status = status_flow[-1]

    # Offer and join dates stay empty unless the candidate reached the
    # corresponding stage.
    offer_date = None
    join_date = None

    if 'Offered' in status_flow:
        # Offer follows the application by 10-20 days.
        offer_date = applied_date + timedelta(days=random.randint(10, 20))
    if 'Joined' in status_flow and offer_date is not None:
        # Joining follows the offer by 7-15 days.
        join_date = offer_date + timedelta(days=random.randint(7, 15))

    data.append({
        'Candidate_ID': candidate_id,
        'Role': role,
        'Location': location,
        'Source': source,
        'Gender': gender,
        'Age': age,
        'Current_Status': current_status,
        'Applied_Date': applied_date,
        'Offer_Date': offer_date,
        'Join_Date': join_date,
    })

# Turn the list of per-candidate dicts into a DataFrame (one row each).
df_recruitment = pd.DataFrame(data)

# Write the table to disk; index=False leaves the row index out of the file.
output_file = "recruitment_data.csv"
df_recruitment.to_csv(output_file, index=False)

# Confirmation message for the notebook output.
print("Dataset generated and saved as recruitment_data.csv")


Dataset generated and saved as recruitment_data.csv
