In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import os

In [2]:
# Fixed seed so that Restart Kernel -> Run All regenerates the same random
# draws. Values derived from the current date/time (e.g. "this year" ranges)
# can still vary between runs on different days.
SEED = 42

fake = Faker()
# Faker.seed() seeds the shared random generator used by Faker instances.
Faker.seed(SEED)

# Number of records
num_users = 200
num_courses = 30
num_assignments = 100
num_modules = 200

In [3]:
def generate_users():
    """Create a DataFrame of synthetic user records.

    The number of rows is taken from the module-level ``num_users`` and all
    values come from the module-level ``fake`` Faker instance.

    Returns:
        pandas.DataFrame with columns ``user_id``, ``name``, ``email``,
        ``designation`` and ``created_at``.
    """
    designations = [
        'Web Developer', 'Data Engineer', 'Data Scientist',
        'AI Specialist', 'DevOps Engineer', 'Cybersecurity Specialist',
        'Mobile Developer', 'UI/UX Designer', 'Software Tester',
    ]
    # One dict per user; key order defines the column order of the frame.
    records = [
        {
            'user_id': fake.uuid4(),
            'name': fake.name(),
            'email': fake.email(),
            'designation': fake.random_element(designations),
            'created_at': fake.date_time_this_year(),
        }
        for _ in range(num_users)
    ]
    return pd.DataFrame(records)

In [4]:
def generate_courses():
    """Create a DataFrame of synthetic course records.

    The number of rows is taken from the module-level ``num_courses``.

    Returns:
        pandas.DataFrame with columns ``course_id``, ``title``,
        ``description``, ``duration`` (random int in 50..180), ``modules``
        (random int in 1..10), ``created_at`` and ``tag``.
    """
    tags = [
        'Web Development', 'Data Engineering', 'Data Science',
        'Generative AI', 'DevOps', 'Cybersecurity',
        'Mobile Development', 'UI/UX Design', 'Software Testing',
    ]
    records = [
        {
            'course_id': fake.uuid4(),
            'title': fake.catch_phrase(),
            'description': fake.text(max_nb_chars=200),
            'duration': fake.random_int(min=50, max=180),
            'modules': fake.random_int(min=1, max=10),
            'created_at': fake.date_time_this_year(),
            'tag': fake.random_element(tags),
        }
        for _ in range(num_courses)
    ]
    return pd.DataFrame(records)

In [5]:
def generate_course_assignments(users, courses):
    """Create random user-to-course assignments.

    Args:
        users: DataFrame providing a ``user_id`` column to pick employees from.
        courses: DataFrame providing a ``course_id`` column to pick courses from.

    Returns:
        pandas.DataFrame with ``num_assignments`` rows and columns
        ``assignment_id``, ``employee_id``, ``course_id`` and ``assigned_at``.
        Each row picks its employee and course independently, so the same
        (employee, course) pair may appear more than once.
    """
    user_ids = users['user_id']
    course_ids = courses['course_id']
    records = [
        {
            'assignment_id': fake.uuid4(),
            'employee_id': fake.random_element(user_ids),
            'course_id': fake.random_element(course_ids),
            'assigned_at': fake.date_time_this_year(),
        }
        for _ in range(num_assignments)
    ]
    return pd.DataFrame(records)


In [6]:
def generate_course_modules(courses):
    """Create synthetic module records, each attached to a random course.

    Args:
        courses: DataFrame providing a ``course_id`` column to pick from.

    Returns:
        pandas.DataFrame with ``num_modules`` rows and columns ``module_id``,
        ``course_id``, ``module_title``, ``module_content``,
        ``module_duration`` (random int in 10..30) and ``created_at``.

    Note:
        The ``modules`` column of ``courses`` is not read here; the number of
        modules a course receives is determined only by the random picks.
    """
    course_ids = courses['course_id']
    records = [
        {
            'module_id': fake.uuid4(),
            'course_id': fake.random_element(course_ids),
            'module_title': fake.catch_phrase(),
            'module_content': fake.text(max_nb_chars=300),
            'module_duration': fake.random_int(min=10, max=30),
            'created_at': fake.date_time_this_year(),
        }
        for _ in range(num_modules)
    ]
    return pd.DataFrame(records)

In [7]:
def generate_course_progress(users, assignments):
    """Create one progress record per course assignment.

    Args:
        users: Unused; kept so existing callers keep working.
        assignments: DataFrame with ``employee_id`` and ``course_id`` columns.
            If it also has an ``assigned_at`` column, completion timestamps
            are drawn between that moment and now.

    Returns:
        pandas.DataFrame with columns ``progress_id``, ``employee_id``,
        ``course_id``, ``completion_percentage`` (0..100), ``score``
        (20..100) and ``completed_at`` (set only when the completion
        percentage is exactly 100, otherwise ``None``).
    """
    progress = []
    for _, assignment in assignments.iterrows():
        completion_percentage = fake.random_int(min=0, max=100)
        score = fake.random_int(min=20, max=100)
        progress.append({
            'progress_id': fake.uuid4(),
            'employee_id': assignment['employee_id'],
            'course_id': assignment['course_id'],
            'completion_percentage': completion_percentage,
            'score': score,
            'completed_at': (
                _completion_time(assignment.get('assigned_at'))
                if completion_percentage == 100 else None
            ),
        })
    return pd.DataFrame(progress)


def _completion_time(assigned_at):
    """Return a random completion time that is not earlier than ``assigned_at``."""
    if assigned_at is None or pd.isna(assigned_at):
        # No usable assignment time: keep the original unconstrained draw.
        return fake.date_time_this_year()
    # Normalise pandas Timestamps / parseable strings to a plain datetime
    # before handing the value to Faker.
    start = pd.Timestamp(assigned_at).to_pydatetime()
    # A course cannot be completed before it was assigned.
    return fake.date_time_between(start_date=start, end_date='now')

In [8]:
def generate_module_progress(users, courses, modules):
    """Create one module-progress record for every (user, module) pair.

    Args:
        users: DataFrame providing a ``user_id`` column.
        courses: DataFrame providing a ``course_id`` column. Only used as a
            fallback source of course ids when ``modules`` has no
            ``course_id`` column.
        modules: DataFrame providing a ``module_id`` column and, normally, the
            ``course_id`` each module belongs to.

    Returns:
        pandas.DataFrame with ``len(users) * len(modules)`` rows and columns
        ``module_progress_id``, ``employee_id``, ``course_id``, ``module_id``,
        ``is_completed`` and ``completed_at`` (``None`` when not completed).
    """
    # Use the module's own course so that (course_id, module_id) agrees with
    # the modules table instead of pairing each module with an arbitrary course.
    has_module_course = 'course_id' in modules.columns
    if has_module_course:
        module_pairs = list(zip(modules['module_id'], modules['course_id']))
    else:
        module_pairs = [(module_id, None) for module_id in modules['module_id']]

    module_progress = []
    for user_id in users['user_id']:
        for module_id, module_course_id in module_pairs:
            is_completed = fake.boolean()
            module_progress.append({
                'module_progress_id': fake.uuid4(),
                'employee_id': user_id,
                'course_id': (
                    module_course_id if has_module_course
                    else fake.random_element(courses['course_id'])
                ),
                'module_id': module_id,
                'is_completed': is_completed,
                'completed_at': fake.date_time_this_year() if is_completed else None,
            })
    return pd.DataFrame(module_progress)

In [9]:
# Create the three warehouse directories; existing ones are left untouched.
for layer_dir in (
    "data_warehouse/raw",
    "data_warehouse/staging",
    "data_warehouse/report",
):
    os.makedirs(layer_dir, exist_ok=True)

In [10]:
# Generate every table. Users and courses come first because the remaining
# generators draw their ids from these two frames.
users_df = generate_users()
courses_df = generate_courses()
# Assignments reference ids from both users_df and courses_df.
assignments_df = generate_course_assignments(users_df, courses_df)
# Modules reference ids from courses_df.
modules_df = generate_course_modules(courses_df)
# One progress row is produced per row of assignments_df.
progress_df = generate_course_progress(users_df, assignments_df)
# One row per (user, module) pair, i.e. len(users_df) * len(modules_df) rows.
module_progress_df = generate_module_progress(users_df, courses_df, modules_df)

In [11]:
# Write each generated table to the raw layer as CSV, without the index column.
raw_exports = [
    (users_df, "data_warehouse/raw/users.csv"),
    (courses_df, "data_warehouse/raw/courses.csv"),
    (assignments_df, "data_warehouse/raw/course_assignments.csv"),
    (modules_df, "data_warehouse/raw/course_modules.csv"),
    (progress_df, "data_warehouse/raw/course_progress.csv"),
    (module_progress_df, "data_warehouse/raw/module_progress.csv"),
]
for frame, csv_path in raw_exports:
    frame.to_csv(csv_path, index=False)