## Step 1. Generate Raw Data

In [4]:
import pandas as pd
import random
from faker import Faker

# Faker instance used for every synthetic name, e-mail, URL, UUID, sentence and date below.
fake = Faker()

# Seed both Faker's generator and the stdlib `random` module so that repeated
# runs draw the same sequence of values.
Faker.seed(42)
random.seed(42)

# --- Dataset sizes ---
NUM_COURSES = 1000  # Courses to generate; enrollment and skill rows are 2x this
NUM_INSTRUCTORS = 200  # Number of unique instructors

# --- Value pools sampled uniformly with random.choice ---
PLATFORMS = [
    "Coursera",
    "Udemy",
    "edX",
    "Skillshare",
    "Khan Academy",
]
CATEGORIES = [
    "Data Science",
    "AI",
    "Web Development",
    "Business",
    "Cloud Computing",
]
COURSE_LEVELS = [
    "Beginner",
    "Intermediate",
    "Advanced",
]
LANGUAGES = [
    "English",
    "Spanish",
    "French",
    "German",
    "Mandarin",
]
SKILLS = [
    "Machine Learning",
    "SQL",
    "Python",
    "JavaScript",
    "Cloud Computing",
    "Data Analysis",
]

# Generate Instructors
# Builds NUM_INSTRUCTORS synthetic instructor records and writes them to
# raw_instructors.csv (without the DataFrame index).
institution_pool = ["MIT", "Stanford", "Harvard", "Independent", "Google", None]  # None -> missing value

instructors = [
    {
        "instructor_id": fake.uuid4(),
        "instructor_name": fake.name(),
        "institution": random.choice(institution_pool),
        "experience_years": random.randint(1, 25),  # inclusive on both ends
        "email": fake.email(),
    }
    for _ in range(NUM_INSTRUCTORS)
]

df_instructors = pd.DataFrame(instructors)
df_instructors.to_csv("raw_instructors.csv", index=False)

# Generate Courses
# Builds NUM_COURSES synthetic course records, each assigned to a randomly
# chosen existing instructor, and writes them to raw_courses.csv.

# Materialize the instructor ids once; the list does not change inside the
# loop, so rebuilding it from the DataFrame on every iteration is wasted work.
instructor_ids = df_instructors["instructor_id"].tolist()

courses = []
for _ in range(NUM_COURSES):
    courses.append({
        "course_id": fake.uuid4(),
        # fake.sentence() ends with a period; strip it so the title reads like a course name.
        "course_name": fake.sentence(nb_words=4).replace(".", ""),
        "instructor_id": random.choice(instructor_ids),
        "platform": random.choice(PLATFORMS),
        "category": random.choice(CATEGORIES),
        "level": random.choice(COURSE_LEVELS),
        "language": random.choice(LANGUAGES),
        # NOTE: continuous draw over [0, 499]; a price of exactly 0.00 (a free
        # course) is extremely unlikely, so do not rely on free courses existing.
        "price": round(random.uniform(0, 499), 2),
        "duration_hours": round(random.uniform(1, 60), 1),
        "lecture_count": random.randint(5, 100),
        "certificate_type": random.choice(["Certificate", "No Certificate", None]),  # None -> missing value
        "course_url": fake.url(),
        # Random date within the last five years, relative to the day the script runs.
        "date_added": fake.date_between(start_date="-5y", end_date="today"),
    })
df_courses = pd.DataFrame(courses)
df_courses.to_csv("raw_courses.csv", index=False)

# Generate Student Enrollments
# Builds NUM_COURSES * 2 enrollment records, each linking one synthetic
# student to a randomly chosen course, and writes them to raw_enrollments.csv.

# Materialize the course ids once instead of converting the DataFrame column
# to a list on every iteration.
course_ids = df_courses["course_id"].tolist()

enrollments = []
for _ in range(NUM_COURSES * 2):  # More enrollments than courses
    course_id = random.choice(course_ids)
    # These two draws are intentionally discarded: no column uses them, but
    # they advance the seeded `random` stream, so removing them would change
    # every value generated after this point.
    random.randint(50, 5000)
    random.uniform(0.2, 0.9)
    enrollments.append({
        "enrollment_id": fake.uuid4(),
        "course_id": course_id,
        "student_id": fake.uuid4(),
        "student_name": fake.name(),
        "student_email": fake.email(),
        # Random date within the last three years, relative to the day the script runs.
        "enrolled_date": fake.date_between(start_date="-3y", end_date="today"),
        "completed": random.choice([True, False, None]),  # None -> missing value
        # About 10% of ratings are missing.
        "rating": round(random.uniform(2.5, 5.0), 2) if random.random() > 0.1 else None,
        # Only about 30% of enrollments carry a review; the rest are missing.
        "review_text": fake.sentence(nb_words=random.randint(5, 20)) if random.random() > 0.7 else None,
    })
df_enrollments = pd.DataFrame(enrollments)
df_enrollments.to_csv("raw_enrollments.csv", index=False)

# Generate Skills Mapping
# Builds NUM_COURSES * 2 (course_id, skill_name) rows and writes them to
# raw_skills.csv. Both fields are sampled independently with replacement, so a
# course may get several skills, none at all, or the same pair more than once.

# Materialize the course ids once instead of converting the DataFrame column
# to a list on every iteration.
course_ids = df_courses["course_id"].tolist()

skills_data = [
    {
        "course_id": random.choice(course_ids),
        "skill_name": random.choice(SKILLS),
    }
    for _ in range(NUM_COURSES * 2)
]
df_skills = pd.DataFrame(skills_data)
df_skills.to_csv("raw_skills.csv", index=False)

print("✅ Realistic Mock Data Generated Successfully!")

✅ Realistic Mock Data Generated Successfully!
