In [1]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

# -----------------------------
# CONFIG
NUM_STUDENTS = 5000
NUM_INTERNSHIPS = 5000
SEED = 42  # fixed seed so a fresh-kernel re-run regenerates the same datasets
# -----------------------------

# Seed BOTH generators before anything draws from them: the stdlib `random`
# module drives the choice/sample/randint calls, while Faker keeps its own
# generator for names, e-mails, companies, text and URLs.
# NOTE: values derived from datetime.now() (see random_date) still depend on
# the day the notebook is executed.
random.seed(SEED)
Faker.seed(SEED)

fake = Faker()

# -----------------------------
# Internship job titles (100 entries, 5 per row below).
# A title is picked with random.choice() for every generated student and
# internship, it is the key into PROFESSION_SKILLS, and the titles are also
# part of the candidate pool for the internship "Tags" column.
# Element order matters once the RNG is seeded, so do not reorder casually.
PROFESSIONS = [
    "Data Analyst Intern","Software Developer Intern","AI/ML Intern","Web Developer Intern","Cloud Engineer Intern",
    "Cybersecurity Intern","Blockchain Intern","UI/UX Designer Intern","IoT Intern","Business Analyst Intern",
    "Finance Intern","Marketing Intern","HR Intern","Operations Intern","Product Management Intern",
    "Quality Assurance Intern","Network Engineer Intern","Full Stack Developer Intern","Frontend Developer Intern","Backend Developer Intern",
    "DevOps Intern","Digital Marketing Intern","Graphic Designer Intern","Content Writer Intern","SEO Intern",
    "Social Media Manager Intern","Mobile App Developer Intern","Game Developer Intern","Data Scientist Intern","Research Intern",
    "UI Designer Intern","UX Research Intern","Electrical Engineer Intern","Mechanical Engineer Intern","Civil Engineer Intern",
    "Embedded Systems Intern","Robotics Intern","Cloud Solutions Architect Intern","Business Development Intern","Supply Chain Intern",
    "Logistics Intern","Legal Intern","Consulting Intern","Environmental Engineer Intern","Biomedical Engineer Intern",
    "Pharmaceutical Research Intern","E-Commerce Intern","Video Editor Intern","Animation Intern","Technical Writer Intern",
    "AI Ethics Intern","NLP Intern","Computer Vision Intern","Database Administrator Intern","Systems Analyst Intern",
    "Salesforce Intern","Tableau / PowerBI Intern","Market Research Intern","Brand Management Intern","Customer Support Intern",
    "Helpdesk Intern","Business Intelligence Intern","Data Engineering Intern","Risk Management Intern","Actuarial Intern",
    "Investment Banking Intern","Equity Research Intern","Taxation Intern","Accounts Intern","Event Management Intern",
    "Hospitality Intern","Tourism Intern","Travel Planning Intern","Media Intern","Journalism Intern",
    "PR Intern","Public Policy Intern","Government Affairs Intern","NGO / Social Work Intern","Sustainability Intern",
    "Renewable Energy Intern","AI Product Intern","SaaS Product Intern","Mobile UI/UX Intern","Interaction Design Intern",
    "Motion Graphics Intern","Video Production Intern","Sound Engineering Intern","Game Design Intern","Animation Production Intern",
    "Civil Planning Intern","Architecture Intern","Interior Design Intern","Fashion Design Intern","Textile Design Intern",
    "Photography Intern","AI Research Intern","Cloud Security Intern","IT Support Intern","Technical Project Management Intern"
]

# -----------------------------
# Skill vocabulary: 93 distinct entries (the list does not actually reach 100).
# PROFESSION_SKILLS samples from this list, and the skills are also part of
# the candidate pool for the internship "Tags" column.
# Element order matters once the RNG is seeded, so do not reorder casually.
SKILLS_LIST = [
    "Python","Java","C++","SQL","Machine Learning","Deep Learning","Data Analysis","Excel","Tableau","PowerBI",
    "AWS","Azure","Docker","Kubernetes","Linux","Git","Networking","Cybersecurity","Penetration Testing","Blockchain",
    "Solidity","Smart Contracts","Ethereum","HTML","CSS","JavaScript","React","Node.js","UI/UX","Adobe XD","Figma",
    "Sketch","Design","Analytics","Finance","Accounting","Digital Marketing","SEO","Content Writing","Social Media",
    "Branding","Communication","Recruitment","Employee Engagement","Documentation","Operations","Project Management",
    "Embedded Systems","Robotics","Cloud Computing","DevOps","Game Development","Video Editing","Animation","Sound Engineering",
    "Research","Statistics","Probability","NLP","Computer Vision","Data Engineering","Business Intelligence","Marketing Research",
    "Customer Support","Helpdesk","Actuarial","Investment Banking","Equity Research","Taxation","Event Management",
    "Hospitality","Tourism","Travel Planning","Public Policy","Government Affairs","NGO Work","Sustainability",
    "Renewable Energy","Product Management","UI Design","UX Research","Interaction Design","Motion Graphics",
    "Video Production","Photography","Fashion Design","Textile Design","Civil Planning","Architecture","Interior Design",
    "Technical Writing","IT Support","Cloud Security"
]

# -----------------------------
# Give every profession its own pool of 5-10 skills.
# NOTE: the pool is a uniform random sample of SKILLS_LIST, so the skills are
# NOT semantically matched to the job title (e.g. a cloud role can end up
# with "Smart Contracts").
PROFESSION_SKILLS = {
    title: random.sample(SKILLS_LIST, random.randint(5, 10))
    for title in PROFESSIONS
}

# -----------------------------
# Students Dataset Options
# Candidate values shared by the student and internship generators.
QUALIFICATIONS = [
    "B.Tech CSE", "B.Tech IT", "B.Tech ECE", "B.Tech ME",
    "B.Sc IT", "B.Sc Physics", "B.Sc Maths", "MCA", "M.Tech CS",
]
LOCATIONS = [
    "Delhi", "Bangalore", "Mumbai", "Hyderabad", "Chennai",
    "Kolkata", "Pune", "Jaipur", "Lucknow", "Remote",
]
SECTORS = [
    "AI/ML", "Web Development", "Finance", "Software", "Data Science",
    "Cloud Computing", "Cybersecurity", "UI/UX", "Blockchain", "IoT",
]
SOCIAL_CATEGORY = ["General", "OBC", "SC", "ST", "EWS"]
# Values for the student "Preferred_Job_Type" column.
JOB_TYPES = ["Remote", "Full-time", "Internship", "Part-time"]

# Internship-side option lists.
# JOB_TYPES_INT holds the same four values as JOB_TYPES, in a different order.
JOB_TYPES_INT = ["Full-time", "Part-time", "Remote", "Internship"]
# 200 Faker-generated company names; internships pick from this fixed pool,
# so names repeat across postings.
COMPANIES = [fake.company() for _ in range(200)]
STATES = [
    "Delhi", "Karnataka", "Maharashtra", "Telangana",
    "Tamil Nadu", "West Bengal", "Rajasthan", "Uttar Pradesh",
]
DIVERSITY_CATEGORIES = ["General", "OBC", "SC", "ST", "EWS"]

# -----------------------------
# Helper functions
def random_list(options,min_items=1,max_items=5):
    """Return a random sample (without replacement) of ``options``.

    The sample size is drawn uniformly from ``[min_items, max_items]`` after
    both bounds have been clamped to what ``options`` can actually supply.

    Args:
        options: Sequence to sample from.
        min_items: Desired minimum number of items (clamped to
            ``0 <= min_items <= upper bound``).
        max_items: Desired maximum number of items (clamped to
            ``0 <= max_items <= len(options)``).

    Returns:
        A new list of distinct positions' elements from ``options``; an empty
        list when ``options`` is empty.
    """
    # Upper bound was already clamped to len(options); also guard against a
    # negative max_items.
    upper = max(0, min(max_items, len(options)))
    # Previously an oversized (or inverted) min_items made random.randint
    # raise ValueError; clamp it the same way the upper bound is clamped.
    lower = max(0, min(min_items, upper))
    return random.sample(options, k=random.randint(lower, upper))

def random_bool():
    """Return ``True`` or ``False``, chosen uniformly via the global ``random`` state."""
    outcomes = (True, False)
    return random.choice(outcomes)

def random_date(start_days_ago=60,end_days_ago=0):
    """Draw a random date from a window expressed in days before now.

    Args:
        start_days_ago: How many days back the window opens (default 60).
        end_days_ago: How many days back the window closes (default 0, i.e. now).

    Returns:
        Whatever ``fake.date_between`` yields for that window.
    """
    # Both bounds are measured from the moment of the call, so results shift
    # with the day the notebook is executed.
    window = {
        "start_date": datetime.now() - timedelta(days=start_days_ago),
        "end_date": datetime.now() - timedelta(days=end_days_ago),
    }
    return fake.date_between(**window)

def random_experience():
    """Return a uniformly random integer in ``[0, 36]`` for the student "Experience" column.

    The unit is not stated in the code; presumably months, matching the
    "N months" strings used for internships — TODO confirm.
    """
    lowest, highest = 0, 36
    return random.randint(lowest, highest)

# -----------------------------
# Generate Students
# Output path. The name is kept unchanged (even though it bakes in "5000")
# because a later cell reads the CSV back by this exact path.
STUDENTS_CSV = "students_varied_5000_skills.csv"


def _make_student(index):
    """Build one synthetic student record for the 1-based ``index``.

    The student's skills are a random subset (at least 2) of the skill pool
    of one randomly chosen profession.
    """
    prof = random.choice(PROFESSIONS)
    # random_list already caps the upper bound at len(options), so no extra
    # min(10, len(...)) is needed here.
    skills = random_list(PROFESSION_SKILLS[prof], 2, 10)
    return {
        "Student_ID": f"S{index:05d}",
        "Name": fake.name(),
        "Email": fake.email(),
        # First 10 characters of the Faker MSISDN string.
        "Phone": fake.msisdn()[0:10],
        "Skills": skills,
        "Qualification": random.choice(QUALIFICATIONS),
        "Location_Preferences": random_list(LOCATIONS, 1, 4),
        "Sector_Interests": random_list(SECTORS, 1, 3),
        "Social_Category": random.choice(SOCIAL_CATEGORY),
        "Past_Internship_Participation": random_bool(),
        "Preferred_Job_Type": random.choice(JOB_TYPES),
        "Experience": random_experience(),
    }


students = [_make_student(i) for i in range(1, NUM_STUDENTS + 1)]

student_df = pd.DataFrame(students)
# List-valued columns (Skills, Location_Preferences, Sector_Interests) are
# written to CSV as their Python repr, e.g. "['Python', 'SQL']".
student_df.to_csv(STUDENTS_CSV, index=False)
# Report the real row count instead of a hard-coded "5000+".
print(f"{len(student_df)} Students dataset ready: {STUDENTS_CSV}")

# -----------------------------
# Generate Internships
# Output path. The name is kept unchanged (even though it bakes in "5000")
# because a later cell reads the CSV back by this exact path.
INTERNSHIPS_CSV = "internships_varied_5000_skills.csv"
# Candidate pool for the "Tags" column, built once instead of being
# re-concatenated on every loop iteration.
TAG_POOL = SKILLS_LIST + PROFESSIONS


def _make_internship(index):
    """Build one synthetic internship posting for the 1-based ``index``.

    Required skills are a random subset (at least 2) of the posting
    profession's skill pool. Location, Cities, States and Sector are drawn
    independently of each other and of the job title.
    """
    prof = random.choice(PROFESSIONS)
    # random_list already caps the upper bound at len(options), so no extra
    # min(10, len(...)) is needed here.
    skills = random_list(PROFESSION_SKILLS[prof], 2, 10)
    return {
        "Job_ID": f"J{index:05d}",
        "Job_Title": prof,
        "Job_Type": random.choice(JOB_TYPES_INT),
        "Company_Name": random.choice(COMPANIES),
        "Sector": random.choice(SECTORS),
        "Posted_Date": random_date(),
        "Location": random.choice(LOCATIONS),
        "Cities": random_list(LOCATIONS, 1, 3),
        "States": random_list(STATES, 1, 2),
        "Skills_Required": skills,
        "Qualification_Required": random_list(QUALIFICATIONS, 1, 2),
        "Experience_Required": f"{random.randint(0,24)} months",
        "Stipend": random.randint(5000, 50000),
        "Duration": f"{random.choice([1,2,3,6])} months",
        "Number_of_Openings": random.randint(1, 10),
        "Actively_Hiring": random_bool(),
        "Description": fake.text(max_nb_chars=150),
        "Links": fake.url(),
        "Diversity_Preferences": random_list(DIVERSITY_CATEGORIES, 1, 2),
        "Past_Participation": random_bool(),
        "Tags": random_list(TAG_POOL, 2, 5),
    }


internships = [_make_internship(i) for i in range(1, NUM_INTERNSHIPS + 1)]

internship_df = pd.DataFrame(internships)
# List-valued columns (Cities, States, Skills_Required, ...) are written to
# CSV as their Python repr, e.g. "['Remote', 'Kolkata']".
internship_df.to_csv(INTERNSHIPS_CSV, index=False)
# Report the real row count instead of a hard-coded "5000+".
print(f"{len(internship_df)} Internships dataset ready: {INTERNSHIPS_CSV}")


5000+ Students dataset ready: students_varied_5000_skills.csv
5000+ Internships dataset ready: internships_varied_5000_skills.csv


In [7]:
# Reload both generated datasets from disk.
# - List-valued columns were saved as their Python repr, so they come back as
#   plain strings such as "['Remote', 'Kolkata']", not lists.
# - These names rebind `student` / `internship`, which the generation cell
#   used for the per-row dicts.
student = pd.read_csv("students_varied_5000_skills.csv")
internship = pd.read_csv("internships_varied_5000_skills.csv")

In [9]:
# NOTE(review): a notebook cell renders only its LAST expression, so the
# head() preview on the next line is computed and then discarded; the cell
# output shows just the column index.
student.head()
student.columns

Index(['Student_ID', 'Name', 'Email', 'Phone', 'Skills', 'Qualification',
       'Location_Preferences', 'Sector_Interests', 'Social_Category',
       'Past_Internship_Participation', 'Preferred_Job_Type', 'Experience'],
      dtype='object')

In [12]:
# Preview the first rows of the reloaded internships table.
internship.head()
# (column listing disabled so that the head() preview above is what renders)
# internship.columns

Unnamed: 0,Job_ID,Job_Title,Job_Type,Company_Name,Sector,Posted_Date,Location,Cities,States,Skills_Required,...,Experience_Required,Stipend,Duration,Number_of_Openings,Actively_Hiring,Description,Links,Diversity_Preferences,Past_Participation,Tags
0,J00001,Cloud Engineer Intern,Full-time,Powers-Shepherd,Cloud Computing,2025-09-12,Jaipur,"['Remote', 'Kolkata']",['West Bengal'],"['Smart Contracts', 'Azure', 'Cloud Security',...",...,12 months,15583,2 months,8,True,Marriage life source region experience us. Spo...,https://jones.com/,"['OBC', 'ST']",False,"['DevOps Intern', 'Motion Graphics Intern', 'S..."
1,J00002,Technical Writer Intern,Remote,"Harmon, Hall and Quinn",Data Science,2025-08-22,Chennai,"['Jaipur', 'Bangalore', 'Remote']",['Telangana'],"['Data Analysis', 'Public Policy', 'Textile De...",...,14 months,29117,1 months,8,False,Draw deep one thank doctor wife treatment rais...,https://brown.biz/,"['OBC', 'General']",True,"['Social Media Manager Intern', 'Blockchain In..."
2,J00003,Data Analyst Intern,Full-time,Gonzalez Inc,IoT,2025-08-25,Lucknow,['Remote'],['Delhi'],"['Excel', 'Project Management', 'Branding', 'A...",...,7 months,35518,6 months,1,True,Society movie play human before. Road event wa...,http://watson-smith.com/,['EWS'],False,"['UX Research Intern', 'Market Research Intern..."
3,J00004,UI/UX Designer Intern,Full-time,"Clark, Murray and Potter",Cloud Computing,2025-09-13,Mumbai,['Pune'],['Delhi'],"['Blockchain', 'Motion Graphics', 'Tourism', '...",...,18 months,23954,2 months,8,False,Happen appear candidate firm. Become factor am...,https://www.gonzalez.com/,"['SC', 'General']",False,"['Blockchain', 'Civil Planning', 'Animation In..."
4,J00005,Actuarial Intern,Part-time,"Craig, Martin and Krause",Finance,2025-09-05,Kolkata,['Lucknow'],['Rajasthan'],"['DevOps', 'Smart Contracts', 'Textile Design'...",...,9 months,33681,1 months,4,True,Assume society election themselves design outs...,https://rosales.com/,['General'],False,"['Mobile UI/UX Intern', 'Mechanical Engineer I..."


In [13]:
# (rows, columns) of the reloaded students table.
student.shape

(5000, 12)