In [14]:
import pandas as pd 

df = pd.read_csv('aijobs_jobs_scrapping.csv')

In [15]:
import re


def split_basic_info(text):
    if not isinstance(text, str):
        return pd.Series([None, None, None, None])

    brackets = re.findall(r'\[(.*?)\]', text)

    extra_info = re.sub(r'\[.*?\]', '', text).strip()

    found_match = re.search(r'(Found.*ago)', extra_info)
    found = found_match.group(1) if found_match else None

    if found:
        extra_info = extra_info.replace(found, '').strip()

    while len(brackets) < 3:
        brackets.append(None)

    return pd.Series([brackets[0], brackets[1], brackets[2], extra_info, found])

df[['seniority', 'yoe', 'employment_type', 'extra_info', 'found_when']] = df['basic info'].apply(split_basic_info)

df[['basic info', 'seniority', 'yoe', 'employment_type', 'extra_info', 'found_when']].head()


Unnamed: 0,basic info,seniority,yoe,employment_type,extra_info,found_when
0,,,,,,
1,[Entry-level / Junior]\n[Internship]\nFound 22...,Entry-level / Junior,Internship,,,Found 22h ago
2,,,,,,
3,[Mid-level / Intermediate] [~1yoe]\n[Full Time...,Mid-level / Intermediate,~1yoe,Full Time,,Found 1d ago
4,,,,,,


In [16]:
df.dropna(inplace=True)
df.drop('basic info', axis=1, inplace=True)
df.head()

Unnamed: 0,topic,job_title,site,tasks,Perks/Benefits,Skills/Tech-stack required,Educational requirements,seniority,yoe,employment_type,extra_info,found_when
5,Artificial Intelligence,Senior Software Engineer,"Redmond, Washington, United States\n@ ...",['* Collaborate with stakeholders to determine...,"['+ Benefits', '+ Bonus', '+ Equity']",[AI/ML][Audio Processing][C#][C++][Data Compre...,[Bachelor's Degree][Master's Degree][PhD],Senior level / Expert,~4yoe,Full Time,USD 119K-258K,Found 1d ago
7,Artificial Intelligence,"Software Engineer, AI Foundations","New Taipei, Banqiao District, New Taipei City,...",['* Collaborate across teams to influence road...,"['+ Paid time off', '+ Performance bonus', '+ ...",[Big Data][Big data processing][C++][Communica...,[Bachelor's Degree][Master's Degree],Mid-level / Intermediate,~2yoe,Full Time,,Found 1d ago
9,Artificial Intelligence,AI Agent Developer,"Manila, Metro Manila, Philippines\n@ ...","['* Collaborate with cross-functional teams', ...",[],[Azure AI][Azure Cognitive][Azure Cognitive Se...,[Bachelor's Degree][Master's Degree],Senior level / Expert,~2yoe,Full Time,,Found 1d ago
11,Artificial Intelligence,Forward Deployed AI Engineer - Enterprise,"London, UK, San Francisco & Palo Alto, CA\n@ ...","['* Add logging and metrics', '* Analyze logs ...","['+ Career growth resources', '+ Diverse exper...",[AI ecosystem][AI ecosystem knowledge][Artific...,[Bachelor's Degree][Master's Degree],Senior level / Expert,~6yoe,Full Time,USD 180K-440K,Found 1d ago
13,Artificial Intelligence,Project Lead-App Development,"INDIA - NOIDA- BIRLASOFT OFFICE, IN\n@ ...",['* Collaborate with teams to translate busine...,"['+ Development programs', '+ Educational supp...",[AI Governance][AWS][Azure][Databricks][Embedd...,[Bachelor][Master],Senior level / Expert,~3yoe,Full Time,,Found 1d ago


In [17]:
df.head()

Unnamed: 0,topic,job_title,site,tasks,Perks/Benefits,Skills/Tech-stack required,Educational requirements,seniority,yoe,employment_type,extra_info,found_when
5,Artificial Intelligence,Senior Software Engineer,"Redmond, Washington, United States\n@ ...",['* Collaborate with stakeholders to determine...,"['+ Benefits', '+ Bonus', '+ Equity']",[AI/ML][Audio Processing][C#][C++][Data Compre...,[Bachelor's Degree][Master's Degree][PhD],Senior level / Expert,~4yoe,Full Time,USD 119K-258K,Found 1d ago
7,Artificial Intelligence,"Software Engineer, AI Foundations","New Taipei, Banqiao District, New Taipei City,...",['* Collaborate across teams to influence road...,"['+ Paid time off', '+ Performance bonus', '+ ...",[Big Data][Big data processing][C++][Communica...,[Bachelor's Degree][Master's Degree],Mid-level / Intermediate,~2yoe,Full Time,,Found 1d ago
9,Artificial Intelligence,AI Agent Developer,"Manila, Metro Manila, Philippines\n@ ...","['* Collaborate with cross-functional teams', ...",[],[Azure AI][Azure Cognitive][Azure Cognitive Se...,[Bachelor's Degree][Master's Degree],Senior level / Expert,~2yoe,Full Time,,Found 1d ago
11,Artificial Intelligence,Forward Deployed AI Engineer - Enterprise,"London, UK, San Francisco & Palo Alto, CA\n@ ...","['* Add logging and metrics', '* Analyze logs ...","['+ Career growth resources', '+ Diverse exper...",[AI ecosystem][AI ecosystem knowledge][Artific...,[Bachelor's Degree][Master's Degree],Senior level / Expert,~6yoe,Full Time,USD 180K-440K,Found 1d ago
13,Artificial Intelligence,Project Lead-App Development,"INDIA - NOIDA- BIRLASOFT OFFICE, IN\n@ ...",['* Collaborate with teams to translate busine...,"['+ Development programs', '+ Educational supp...",[AI Governance][AWS][Azure][Databricks][Embedd...,[Bachelor][Master],Senior level / Expert,~3yoe,Full Time,,Found 1d ago


In [18]:
import re
import ast
from datetime import timedelta
from dateutil import tz

# 1) site: keep only text before the first newline
df["site"] = df["site"].astype(str).str.split("\n").str[0].str.strip()

# --- helpers for list-like columns (Tasks, Perks/Benefits) ---
bullet_re = re.compile(r"^\s*[\*\-•\+]\s*") 

def _strip_bullet(s: str) -> str:
    return bullet_re.sub("", s).strip()

def normalize_listish(col):
    out = []
    for v in df[col].fillna(""):
        if isinstance(v, list):
            items = v
        elif isinstance(v, str):
            s = v.strip()
            # Try to parse "['* a', '* b']"
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, list):
                    items = parsed
                else:
                    items = [s]
            except Exception:
                # Fallback: split on commas if looks like [ ... ]
                if s.startswith("[") and s.endswith("]"):
                    s2 = s[1:-1]
                    items = [x.strip().strip("'").strip('"') for x in s2.split(",") if x.strip()]
                else:
                    # Already newline-separated or single string
                    items = [x for x in s.split("\n") if x.strip()]
        else:
            items = []

        # Clean each item: drop bullets, plus signs, and surrounding quotes/brackets
        cleaned = [_strip_bullet(str(it).strip().strip("'").strip('"')) for it in items if str(it).strip()]
        out.append("\n".join(cleaned))
    return out

# 2) Tasks: phrases separated by \n, remove bullets/symbols
df["tasks"] = normalize_listish("tasks")

# 3) Perks/Benefits: same treatment
if "Perks/Benefits" in df.columns:
    df["Perks/Benefits"] = normalize_listish("Perks/Benefits")

# 4) yoe: from "~4yoe" -> 4
df["yoe"] = (
    df["yoe"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype("float").astype("Int64")   # keep as nullable int
)

# 5) found_when: "Found 1d ago" -> actual DATE (local to Bogotá)
def found_to_date(s: str):
    if not isinstance(s, str):
        return pd.NaT
    m = re.search(r"Found\s+(\d+)\s*([hdw])\s*ago", s, flags=re.I)
    if not m:
        return pd.NaT
    qty = int(m.group(1))
    unit = m.group(2).lower()
    mult = {"h": 1, "d": 24, "w": 24*7}[unit]
    tz_bog = tz.gettz("America/Bogota")
    now_local = pd.Timestamp.now(tz=tz_bog)
    dt = now_local - timedelta(hours=qty * mult)
    return dt.date()  # date-only; use dt.tz_convert(None) if you want full timestamp

df["created_at"] = df["found_when"].apply(found_to_date)
df.head()

# (Optional) if you prefer a Timestamp (at local midnight):
# df["found_timestamp"] = pd.to_datetime(df["found_date"])


Unnamed: 0,topic,job_title,site,tasks,Perks/Benefits,Skills/Tech-stack required,Educational requirements,seniority,yoe,employment_type,extra_info,found_when,created_at
5,Artificial Intelligence,Senior Software Engineer,"Redmond, Washington, United States",Collaborate with stakeholders to determine use...,Benefits\nBonus\nEquity,[AI/ML][Audio Processing][C#][C++][Data Compre...,[Bachelor's Degree][Master's Degree][PhD],Senior level / Expert,4,Full Time,USD 119K-258K,Found 1d ago,2025-08-08
7,Artificial Intelligence,"Software Engineer, AI Foundations","New Taipei, Banqiao District, New Taipei City,...",Collaborate across teams to influence roadmaps...,Paid time off\nPerformance bonus\nTraining & d...,[Big Data][Big data processing][C++][Communica...,[Bachelor's Degree][Master's Degree],Mid-level / Intermediate,2,Full Time,,Found 1d ago,2025-08-08
9,Artificial Intelligence,AI Agent Developer,"Manila, Metro Manila, Philippines",Collaborate with cross-functional teams\nDesig...,,[Azure AI][Azure Cognitive][Azure Cognitive Se...,[Bachelor's Degree][Master's Degree],Senior level / Expert,2,Full Time,,Found 1d ago,2025-08-08
11,Artificial Intelligence,Forward Deployed AI Engineer - Enterprise,"London, UK, San Francisco & Palo Alto, CA",Add logging and metrics\nAnalyze logs and prom...,Career growth resources\nDiverse experiences e...,[AI ecosystem][AI ecosystem knowledge][Artific...,[Bachelor's Degree][Master's Degree],Senior level / Expert,6,Full Time,USD 180K-440K,Found 1d ago,2025-08-08
13,Artificial Intelligence,Project Lead-App Development,"INDIA - NOIDA- BIRLASOFT OFFICE, IN",Collaborate with teams to translate business n...,Development programs\nEducational support\nFle...,[AI Governance][AWS][Azure][Databricks][Embedd...,[Bachelor][Master],Senior level / Expert,3,Full Time,,Found 1d ago,2025-08-08


In [19]:
def normalize_skills(col):
    out = []
    for v in df[col].fillna(""):
        if isinstance(v, list):
            skills = [str(s).strip() for s in v if str(s).strip()]
        else:
            # Extract text inside square brackets
            skills = re.findall(r"\[([^\]]+)\]", str(v))
        out.append(", ".join(skills))
    return out

df["Skills/Tech-stack required"] = normalize_skills("Skills/Tech-stack required")
df["Educational requirements"] = normalize_skills("Educational requirements")
df.head()

Unnamed: 0,topic,job_title,site,tasks,Perks/Benefits,Skills/Tech-stack required,Educational requirements,seniority,yoe,employment_type,extra_info,found_when,created_at
5,Artificial Intelligence,Senior Software Engineer,"Redmond, Washington, United States",Collaborate with stakeholders to determine use...,Benefits\nBonus\nEquity,"AI/ML, Audio Processing, C#, C++, Data Compres...","Bachelor's Degree, Master's Degree, PhD",Senior level / Expert,4,Full Time,USD 119K-258K,Found 1d ago,2025-08-08
7,Artificial Intelligence,"Software Engineer, AI Foundations","New Taipei, Banqiao District, New Taipei City,...",Collaborate across teams to influence roadmaps...,Paid time off\nPerformance bonus\nTraining & d...,"Big Data, Big data processing, C++, Communicat...","Bachelor's Degree, Master's Degree",Mid-level / Intermediate,2,Full Time,,Found 1d ago,2025-08-08
9,Artificial Intelligence,AI Agent Developer,"Manila, Metro Manila, Philippines",Collaborate with cross-functional teams\nDesig...,,"Azure AI, Azure Cognitive, Azure Cognitive Sea...","Bachelor's Degree, Master's Degree",Senior level / Expert,2,Full Time,,Found 1d ago,2025-08-08
11,Artificial Intelligence,Forward Deployed AI Engineer - Enterprise,"London, UK, San Francisco & Palo Alto, CA",Add logging and metrics\nAnalyze logs and prom...,Career growth resources\nDiverse experiences e...,"AI ecosystem, AI ecosystem knowledge, Artifici...","Bachelor's Degree, Master's Degree",Senior level / Expert,6,Full Time,USD 180K-440K,Found 1d ago,2025-08-08
13,Artificial Intelligence,Project Lead-App Development,"INDIA - NOIDA- BIRLASOFT OFFICE, IN",Collaborate with teams to translate business n...,Development programs\nEducational support\nFle...,"AI Governance, AWS, Azure, Databricks, Embeddi...","Bachelor, Master",Senior level / Expert,3,Full Time,,Found 1d ago,2025-08-08


In [21]:
df.rename(columns={'site': 'site (remote country)'}, inplace=True)
df.drop('found_when', axis=1, inplace=True)
df.head()

Unnamed: 0,topic,job_title,site (remote country),tasks,Perks/Benefits,Skills/Tech-stack required,Educational requirements,seniority,yoe,employment_type,extra_info,created_at
5,Artificial Intelligence,Senior Software Engineer,"Redmond, Washington, United States",Collaborate with stakeholders to determine use...,Benefits\nBonus\nEquity,"AI/ML, Audio Processing, C#, C++, Data Compres...","Bachelor's Degree, Master's Degree, PhD",Senior level / Expert,4,Full Time,USD 119K-258K,2025-08-08
7,Artificial Intelligence,"Software Engineer, AI Foundations","New Taipei, Banqiao District, New Taipei City,...",Collaborate across teams to influence roadmaps...,Paid time off\nPerformance bonus\nTraining & d...,"Big Data, Big data processing, C++, Communicat...","Bachelor's Degree, Master's Degree",Mid-level / Intermediate,2,Full Time,,2025-08-08
9,Artificial Intelligence,AI Agent Developer,"Manila, Metro Manila, Philippines",Collaborate with cross-functional teams\nDesig...,,"Azure AI, Azure Cognitive, Azure Cognitive Sea...","Bachelor's Degree, Master's Degree",Senior level / Expert,2,Full Time,,2025-08-08
11,Artificial Intelligence,Forward Deployed AI Engineer - Enterprise,"London, UK, San Francisco & Palo Alto, CA",Add logging and metrics\nAnalyze logs and prom...,Career growth resources\nDiverse experiences e...,"AI ecosystem, AI ecosystem knowledge, Artifici...","Bachelor's Degree, Master's Degree",Senior level / Expert,6,Full Time,USD 180K-440K,2025-08-08
13,Artificial Intelligence,Project Lead-App Development,"INDIA - NOIDA- BIRLASOFT OFFICE, IN",Collaborate with teams to translate business n...,Development programs\nEducational support\nFle...,"AI Governance, AWS, Azure, Databricks, Embeddi...","Bachelor, Master",Senior level / Expert,3,Full Time,,2025-08-08


In [33]:
from faker import Faker

# Initialize Faker
fake = Faker()

# Generate fake IT companies and emails
companies = []
for _ in range(45):
    company_name = fake.company()
    # Normalize company name to be usable in an email
    email_name = company_name.lower().replace(" ", "_").replace(",", "").replace(".", "")
    email = f"{email_name}@demo"
    companies.append({"company_name": company_name, "email": email})

# Create DataFrame
df_companies = pd.DataFrame(companies)
df_companies

Unnamed: 0,company_name,email
0,Vazquez-Henderson,vazquez-henderson@demo
1,Parker-Kemp,parker-kemp@demo
2,Parks and Sons,parks_and_sons@demo
3,James-Lowery,james-lowery@demo
4,"Stewart, Freeman and Fox",stewart_freeman_and_fox@demo
5,Mckenzie-Callahan,mckenzie-callahan@demo
6,Riley-Gonzales,riley-gonzales@demo
7,"Richardson, Kennedy and Johnson",richardson_kennedy_and_johnson@demo
8,"Erickson, Bean and Robinson",erickson_bean_and_robinson@demo
9,"Olson, Aguilar and Thomas",olson_aguilar_and_thomas@demo


In [34]:
import random
import numpy as np

TOPIC_PREFIX = {
    "Artificial Intelligence": "ai",
    "Big Data": "bd",
    "Computer Vision": "cv",
    "Cloud Computing": "cloud",
    "Data Science": "ds",
    "Machine Learning": "ml",
    "MLOps": "mlops",
    "Natural Language Processing": "nlp",
}

def topic_to_prefix(topic: str) -> str:
    if not isinstance(topic, str) or not topic.strip():
        return "gen"
    # try direct map
    if topic in TOPIC_PREFIX:
        return TOPIC_PREFIX[topic]
    # fuzzy fallbacks
    t = topic.lower()
    if "natural" in t and "language" in t: return "nlp"
    if "computer" in t and "vision" in t:  return "cv"
    if "cloud" in t:                        return "cloud"
    if "big" in t and "data" in t:          return "bd"
    if "data" in t and "science" in t:      return "ds"
    if "machine" in t and "learn" in t:     return "ml"
    if "mlops" in t or ("ml" in t and "ops" in t): return "mlops"
    if "artificial" in t and "intell" in t: return "ai"
    return re.sub(r"[^a-z0-9]+", "-", t).strip("-")[:8] or "gen"

def slugify(text: str) -> str:
    if not isinstance(text, str):
        return "role"
    s = text.lower()
    s = s.replace("&", " and ")
    s = re.sub(r"[^a-z0-9]+", "-", s)      # non-alnum → hyphen
    s = re.sub(r"-{2,}", "-", s).strip("-")
    return s or "role"

# pick which title column your df uses
title_col = "job_title" if "job_title" in df.columns else ("title_h1" if "title_h1" in df.columns else None)
if title_col is None:
    raise ValueError("No job title column found. Expecting 'job_title' or 'title_h1' in df.")

# base slug per row: "{topic_prefix}-{title_slug}"
topic_prefixes = df["topic"].apply(topic_to_prefix)
title_slugs = df[title_col].fillna("").apply(slugify)
base_slug = topic_prefixes + "-" + title_slugs

# make sequence numbers per base to guarantee uniqueness
seq = base_slug.groupby(base_slug).cumcount() + 1

# choose padding width: 2 by default; bump to 3 if any base has >= 100 occurrences
max_counts = base_slug.value_counts().max()
pad = 3 if max_counts and max_counts >= 100 else 2
df["job_id"] = base_slug + "-" + seq.astype(str).str.zfill(pad)

# assign a random posted_by from df_companies['email']
# (use a seeded RNG for reproducibility if you want deterministic results)
rng = np.random.default_rng(seed=42)
df["posted_by"] = rng.choice(df_companies["email"].values, size=len(df))

df.head()

Unnamed: 0,topic,job_title,site (remote country),tasks,Perks/Benefits,Skills/Tech-stack required,Educational requirements,seniority,yoe,employment_type,extra_info,created_at,job_id,posted_by
5,Artificial Intelligence,Senior Software Engineer,"Redmond, Washington, United States",Collaborate with stakeholders to determine use...,Benefits\nBonus\nEquity,"AI/ML, Audio Processing, C#, C++, Data Compres...","Bachelor's Degree, Master's Degree, PhD",Senior level / Expert,4,Full Time,USD 119K-258K,2025-08-08,ai-senior-software-engineer-01,stewart_freeman_and_fox@demo
7,Artificial Intelligence,"Software Engineer, AI Foundations","New Taipei, Banqiao District, New Taipei City,...",Collaborate across teams to influence roadmaps...,Paid time off\nPerformance bonus\nTraining & d...,"Big Data, Big data processing, C++, Communicat...","Bachelor's Degree, Master's Degree",Mid-level / Intermediate,2,Full Time,,2025-08-08,ai-software-engineer-ai-foundations-01,burns-parsons@demo
9,Artificial Intelligence,AI Agent Developer,"Manila, Metro Manila, Philippines",Collaborate with cross-functional teams\nDesig...,,"Azure AI, Azure Cognitive, Azure Cognitive Sea...","Bachelor's Degree, Master's Degree",Senior level / Expert,2,Full Time,,2025-08-08,ai-ai-agent-developer-01,klein_adams_and_lee@demo
11,Artificial Intelligence,Forward Deployed AI Engineer - Enterprise,"London, UK, San Francisco & Palo Alto, CA",Add logging and metrics\nAnalyze logs and prom...,Career growth resources\nDiverse experiences e...,"AI ecosystem, AI ecosystem knowledge, Artifici...","Bachelor's Degree, Master's Degree",Senior level / Expert,6,Full Time,USD 180K-440K,2025-08-08,ai-forward-deployed-ai-engineer-enterprise-01,chan_vazquez_and_carter@demo
13,Artificial Intelligence,Project Lead-App Development,"INDIA - NOIDA- BIRLASOFT OFFICE, IN",Collaborate with teams to translate business n...,Development programs\nEducational support\nFle...,"AI Governance, AWS, Azure, Databricks, Embeddi...","Bachelor, Master",Senior level / Expert,3,Full Time,,2025-08-08,ai-project-lead-app-development-01,chan_vazquez_and_carter@demo


In [35]:
job_counts = df["posted_by"].value_counts().rename_axis("email").reset_index(name="posted_jobs")

# Merge into df_companies, fill NaN with 0
df_companies = df_companies.merge(job_counts, on="email", how="left")
df_companies["posted_jobs"] = df_companies["posted_jobs"].fillna(0).astype(int)

# Optional: sort by most active posters
df_companies = df_companies.sort_values("posted_jobs", ascending=False)

df_companies.head()

Unnamed: 0,company_name,email,posted_jobs
35,"Stewart, Gates and Kelley",stewart_gates_and_kelley@demo,12
20,Martin-Bennett,martin-bennett@demo,11
25,Friedman LLC,friedman_llc@demo,8
19,"Chan, Vazquez and Carter",chan_vazquez_and_carter@demo,8
4,"Stewart, Freeman and Fox",stewart_freeman_and_fox@demo,8


In [36]:
df.to_csv('jobs.csv', index=False)
df_companies.to_csv('recruiters.csv', index=False)

In [1]:
import pandas as pd

questions_data = {
    "Artificial Intelligence": [
        ("What is the difference between narrow AI and general AI?",
         ["Narrow AI focuses on specific tasks, general AI can perform any human task",
          "Narrow AI is faster, general AI is slower",
          "Narrow AI uses neural networks, general AI does not",
          "Narrow AI is outdated, general AI is modern"],
         "Narrow AI focuses on specific tasks, general AI can perform any human task"),
        ("What is the Turing Test used for?",
         ["To measure the speed of AI models",
          "To determine if a machine exhibits human-like intelligence",
          "To test the accuracy of AI predictions",
          "To evaluate AI hardware performance"],
         "To determine if a machine exhibits human-like intelligence"),
        ("Which of the following is an example of reinforcement learning?",
         ["Teaching a robot to walk by rewarding progress",
          "Sorting emails into spam and inbox",
          "Clustering customers by purchase behavior",
          "Predicting tomorrow’s weather"],
         "Teaching a robot to walk by rewarding progress"),
        ("What is AI bias?",
         ["Errors caused by biased training data",
          "AI systems running too slowly",
          "Incorrect hyperparameter tuning",
          "Lack of neural network layers"],
         "Errors caused by biased training data"),
        ("Which is a common use of AI in healthcare?",
         ["Diagnosing diseases from medical images",
          "Encrypting patient data",
          "Scheduling hospital staff shifts",
          "Inventory management for drugs"],
         "Diagnosing diseases from medical images"),
        ("What is a knowledge graph?",
         ["A database that stores facts and relationships",
          "A type of decision tree",
          "A neural network visualizer",
          "A clustering algorithm"],
         "A database that stores facts and relationships"),
        ("What does explainable AI focus on?",
         ["Improving AI inference speed",
          "Making AI decisions interpretable to humans",
          "Compressing AI models",
          "Encrypting AI models for security"],
         "Making AI decisions interpretable to humans"),
        ("What is transfer learning in AI?",
         ["Using knowledge from one task to improve another",
          "Training from scratch with more data",
          "Compressing large AI models",
          "Combining multiple AI models"],
         "Using knowledge from one task to improve another"),
        ("What is an expert system?",
         ["An AI system using rules and logic to mimic human expertise",
          "A cloud-based AI service",
          "An AI for optimizing databases",
          "A voice assistant like Alexa"],
         "An AI system using rules and logic to mimic human expertise"),
        ("What is an example of AI in IoT devices?",
         ["Smart thermostats adjusting temperature automatically",
          "Manually setting a thermostat",
          "A standard light bulb",
          "A wall clock"],
         "Smart thermostats adjusting temperature automatically"),
    ],

    "Big Data": [
        ("Which of the following best defines Big Data?",
         ["Data sets too large for traditional processing tools",
          "Small, manageable datasets",
          "Only structured databases",
          "Only unstructured text data"],
         "Data sets too large for traditional processing tools"),
        ("Which of these is a common Big Data framework?",
         ["Apache Hadoop",
          "SQLite",
          "Microsoft Access",
          "Excel"],
         "Apache Hadoop"),
        ("What does the 'V' in Big Data characteristics stand for?",
         ["Volume, Velocity, Variety",
          "Value, Visibility, Validity",
          "Vision, Verification, Visualization",
          "Version, Variation, Virtualization"],
         "Volume, Velocity, Variety"),
        ("Which is an example of unstructured data?",
         ["Customer emails",
          "Sales database records",
          "Bank transaction logs",
          "Excel spreadsheets"],
         "Customer emails"),
        ("Which tool is commonly used for distributed data processing?",
         ["Apache Spark",
          "MySQL",
          "Oracle DB",
          "PostgreSQL"],
         "Apache Spark"),
        ("What is data lake in Big Data?",
         ["A storage for raw, unprocessed data",
          "A small relational database",
          "A water-themed database",
          "A tool for data cleaning"],
         "A storage for raw, unprocessed data"),
        ("Which of these is a NoSQL database?",
         ["MongoDB",
          "PostgreSQL",
          "MySQL",
          "Oracle"],
         "MongoDB"),
        ("What is real-time analytics?",
         ["Processing and analyzing data as it arrives",
          "Analyzing data weekly",
          "Only processing old datasets",
          "Using batch processing only"],
         "Processing and analyzing data as it arrives"),
        ("Which cloud platform offers Big Data services?",
         ["AWS",
          "Photoshop",
          "Notepad++",
          "Slack"],
         "AWS"),
        ("What is the main purpose of MapReduce?",
         ["Processing large datasets in parallel",
          "Designing web pages",
          "Running simulations",
          "Creating images"],
         "Processing large datasets in parallel"),
    ],
    
    "Computer Vision": [
        ("Which algorithm is commonly used for object detection?",
         ["YOLO",
          "K-means",
          "Apriori",
          "Naive Bayes"],
         "YOLO"),
        ("What does CNN stand for?",
         ["Convolutional Neural Network",
          "Central Neural Node",
          "Coded Neural Network",
          "Complex Node Network"],
         "Convolutional Neural Network"),
        ("Which is an application of image segmentation?",
         ["Separating different objects in a medical scan",
          "Predicting weather",
          "Classifying text sentiment",
          "Sorting transactions"],
         "Separating different objects in a medical scan"),
        ("Which technique helps make CNNs invariant to small translations?",
         ["Pooling layers",
          "Dropout",
          "Batch Normalization",
          "Data Augmentation"],
         "Pooling layers"),
        ("What is the main difference between object detection and image classification?",
         ["Object detection locates and labels objects, classification only labels",
          "Classification is always faster",
          "Detection requires no training",
          "They are the same"],
         "Object detection locates and labels objects, classification only labels"),
        ("What dataset is commonly used for image classification benchmarking?",
         ["ImageNet",
          "MNIST",
          "CIFAR-10",
          "All of the above"],
         "All of the above"),
        ("Which method can be used to reduce overfitting in CNNs?",
         ["Dropout",
          "Gradient Clipping",
          "Learning Rate Decay",
          "Batch Size Increase"],
         "Dropout"),
        ("Which of these is a feature descriptor in CV?",
         ["SIFT",
          "Adam",
          "Softmax",
          "Relu"],
         "SIFT"),
        ("Which type of network is often used for semantic segmentation?",
         ["U-Net",
          "RNN",
          "DBSCAN",
          "GAN"],
         "U-Net"),
        ("What does non-maximum suppression do in object detection?",
         ["Removes overlapping bounding boxes",
          "Removes low-quality images",
          "Suppresses gradients",
          "Reduces network layers"],
         "Removes overlapping bounding boxes"),
    ],

    # TODO: Repeat similar structure for Computer Vision, Cloud Computing, Data Science, Machine Learning, MLOps, NLP
}

questions_data["Cloud Computing"] = [
    ("What is the main characteristic of cloud computing?",
     ["On-demand access to computing resources over the internet",
      "Free access to all software",
      "Offline data storage only",
      "No need for internet connection"],
     "On-demand access to computing resources over the internet"),
    
    ("Which of the following is a cloud deployment model?",
     ["Public Cloud",
      "Hybrid Cloud",
      "Private Cloud",
      "All of the above"],
     "All of the above"),
    
    ("What does IaaS stand for?",
     ["Infrastructure as a Service",
      "Internet as a Service",
      "Integration as a Service",
      "Interface as a Service"],
     "Infrastructure as a Service"),
    
    ("Which of these is an example of SaaS?",
     ["Google Workspace",
      "Amazon EC2",
      "Microsoft Azure VM",
      "Docker"],
     "Google Workspace"),
    
    ("Which cloud provider offers the EC2 service?",
     ["Amazon Web Services",
      "Microsoft Azure",
      "Google Cloud",
      "Oracle Cloud"],
     "Amazon Web Services"),
    
    ("What is serverless computing?",
     ["Running code without managing servers",
      "Computing without electricity",
      "Having no backend at all",
      "Only using local machines"],
     "Running code without managing servers"),
    
    ("Which of these is a benefit of cloud computing?",
     ["Scalability",
      "Fixed resource allocation",
      "Manual hardware upgrades",
      "Higher upfront cost"],
     "Scalability"),
    
    ("What is cloud elasticity?",
     ["Automatic scaling of resources up or down",
      "Cloud storing only elastic files",
      "Flexible internet cables",
      "Backup storage system"],
     "Automatic scaling of resources up or down"),
    
    ("Which protocol is commonly used for secure data transfer to the cloud?",
     ["HTTPS",
      "FTP",
      "Telnet",
      "SMTP"],
     "HTTPS"),
    
    ("What is a cloud region?",
     ["A geographical location where cloud data centers are located",
      "A local folder in the cloud",
      "A security certificate",
      "A user account type"],
     "A geographical location where cloud data centers are located"),
]

questions_data["Data Science"] = [
    ("What is the main goal of data science?",
     ["Extract knowledge and insights from data",
      "Design physical machines",
      "Create 3D models",
      "Manage office documents"],
     "Extract knowledge and insights from data"),
    
    ("Which of these is an example of a supervised learning algorithm?",
     ["Linear Regression",
      "K-Means Clustering",
      "PCA",
      "Apriori"],
     "Linear Regression"),
    
    ("What is the first step in a data science workflow?",
     ["Data collection",
      "Model training",
      "Feature scaling",
      "Hyperparameter tuning"],
     "Data collection"),
    
    ("Which Python library is widely used for data manipulation?",
     ["Pandas",
      "NumPy",
      "Matplotlib",
      "TensorFlow"],
     "Pandas"),
    
    ("What is feature engineering?",
     ["Creating new features from existing data to improve model performance",
      "Building a software feature",
      "Testing system security",
      "Debugging a program"],
     "Creating new features from existing data to improve model performance"),
    
    ("What is the purpose of cross-validation?",
     ["Evaluate a model’s ability to generalize",
      "Speed up training",
      "Encrypt data",
      "Compress files"],
     "Evaluate a model’s ability to generalize"),
    
    ("Which of these is an example of an unsupervised learning task?",
     ["Clustering",
      "Regression",
      "Classification",
      "Time series forecasting"],
     "Clustering"),
    
    ("What does EDA stand for?",
     ["Exploratory Data Analysis",
      "Extended Data Architecture",
      "External Data Access",
      "Enterprise Data Automation"],
     "Exploratory Data Analysis"),
    
    ("Which metric is commonly used to evaluate classification models?",
     ["Accuracy",
      "Mean Squared Error",
      "RMSE",
      "R-squared"],
     "Accuracy"),
    
    ("What is the main purpose of data visualization?",
     ["Communicate insights from data clearly",
      "Store data",
      "Remove outliers",
      "Normalize datasets"],
     "Communicate insights from data clearly"),
]

questions_data["Machine Learning"] = [
    ("What is overfitting in machine learning?",
     ["Model performs well on training data but poorly on unseen data",
      "Model performs well on all datasets",
      "Model underestimates outputs",
      "Model has no parameters"],
     "Model performs well on training data but poorly on unseen data"),
    
    ("Which algorithm is used for classification tasks?",
     ["Logistic Regression",
      "K-Means",
      "PCA",
      "DBSCAN"],
     "Logistic Regression"),
    
    ("What is the purpose of a learning rate in optimization?",
     ["Control step size during gradient descent",
      "Determine number of layers",
      "Set batch size",
      "Reduce overfitting"],
     "Control step size during gradient descent"),
    
    ("Which of these is an ensemble learning method?",
     ["Random Forest",
      "Linear Regression",
      "KNN",
      "Naive Bayes"],
     "Random Forest"),
    
    ("What is the difference between supervised and unsupervised learning?",
     ["Supervised uses labeled data, unsupervised uses unlabeled data",
      "Supervised is faster",
      "Unsupervised always uses neural networks",
      "They are the same"],
     "Supervised uses labeled data, unsupervised uses unlabeled data"),
    
    ("Which activation function is commonly used in hidden layers?",
     ["ReLU",
      "Softmax",
      "Sigmoid",
      "Tanh"],
     "ReLU"),
    
    ("What is a confusion matrix used for?",
     ["Evaluate classification performance",
      "Store predictions",
      "Visualize training loss",
      "Debug code"],
     "Evaluate classification performance"),
    
    ("Which of the following is a hyperparameter?",
     ["Learning rate",
      "Model predictions",
      "Accuracy score",
      "Loss value"],
     "Learning rate"),
    
    ("What does regularization help with?",
     ["Reducing overfitting",
      "Speeding up training",
      "Improving GPU performance",
      "Removing null values"],
     "Reducing overfitting"),
    
    ("Which library is popular for deep learning in Python?",
     ["TensorFlow",
      "Pandas",
      "Matplotlib",
      "Seaborn"],
     "TensorFlow"),
]

questions_data["MLOps"] = [
    ("What is the main goal of MLOps?",
     ["Streamline and automate the ML lifecycle",
      "Create more ML models manually",
      "Replace developers",
      "Build UI applications"],
     "Streamline and automate the ML lifecycle"),
    
    ("Which tool is commonly used for experiment tracking?",
     ["MLflow",
      "Docker",
      "GitHub",
      "Postman"],
     "MLflow"),
    
    ("What does CI/CD stand for in MLOps?",
     ["Continuous Integration / Continuous Deployment",
      "Cloud Integration / Code Delivery",
      "Continuous Improvement / Continuous Design",
      "Code Integration / Continuous Debugging"],
     "Continuous Integration / Continuous Deployment"),
    
    ("What is model drift?",
     ["Change in model performance due to data distribution shifts",
      "When a model moves to another cloud provider",
      "A slow training process",
      "Using outdated libraries"],
     "Change in model performance due to data distribution shifts"),
    
    ("Which containerization tool is widely used in MLOps?",
     ["Docker",
      "Kubernetes",
      "Podman",
      "VirtualBox"],
     "Docker"),
    
    ("What is the purpose of a feature store?",
     ["Centralized repository for storing ML features",
      "Database for model predictions",
      "Version control for code",
      "Backup system"],
     "Centralized repository for storing ML features"),
    
    ("Which orchestration tool is often used in MLOps pipelines?",
     ["Apache Airflow",
      "NumPy",
      "Jupyter",
      "Flask"],
     "Apache Airflow"),
    
    ("What is the role of monitoring in MLOps?",
     ["Track model performance and detect anomalies",
      "Optimize hyperparameters",
      "Store datasets",
      "Generate new features"],
     "Track model performance and detect anomalies"),
    
    ("Which cloud service offers SageMaker for MLOps?",
     ["AWS",
      "Azure",
      "GCP",
      "IBM Cloud"],
     "AWS"),
    
    ("What is the main benefit of model versioning?",
     ["Reproduce and compare models over time",
      "Improve GPU speed",
      "Reduce dataset size",
      "Encrypt models"],
     "Reproduce and compare models over time"),
]

questions_data["Natural Language Processing"] = [
    ("What is the main goal of NLP?",
     ["Enable machines to understand and process human language",
      "Teach humans programming languages",
      "Translate code into binary",
      "Build operating systems"],
     "Enable machines to understand and process human language"),
    
    ("Which of these is a common NLP task?",
     ["Sentiment Analysis",
      "Sorting numbers",
      "Rendering 3D graphics",
      "Compiling code"],
     "Sentiment Analysis"),
    
    ("What does tokenization do?",
     ["Splits text into smaller units like words or subwords",
      "Encrypts text",
      "Combines sentences",
      "Translates text"],
     "Splits text into smaller units like words or subwords"),
    
    ("Which model architecture is popular in NLP?",
     ["Transformer",
      "Convolutional Neural Network",
      "Decision Tree",
      "Random Forest"],
     "Transformer"),
    
    ("What does BERT stand for?",
     ["Bidirectional Encoder Representations from Transformers",
      "Binary Encoding Recurrent Transformer",
      "Basic Encoder Representation Technique",
      "Bidirectional Entity Recognition Tool"],
     "Bidirectional Encoder Representations from Transformers"),
    
    ("Which library is widely used for NLP in Python?",
     ["Hugging Face Transformers",
      "OpenCV",
      "Matplotlib",
      "scikit-learn"],
     "Hugging Face Transformers"),
    
    ("What is lemmatization?",
     ["Reducing words to their base or dictionary form",
      "Removing punctuation",
      "Counting word frequencies",
      "Sorting sentences"],
     "Reducing words to their base or dictionary form"),
    
    ("What is named entity recognition (NER)?",
     ["Identifying and classifying entities like names, dates, locations in text",
      "Recognizing code functions",
      "Detecting syntax errors",
      "Analyzing images"],
     "Identifying and classifying entities like names, dates, locations in text"),
    
    ("Which of the following is a language model?",
     ["GPT-3",
      "ResNet",
      "YOLO",
      "XGBoost"],
     "GPT-3"),
    
    ("What is stop word removal?",
     ["Eliminating common words like 'and', 'the' from text",
      "Deleting rare words",
      "Removing typos",
      "Filtering out foreign words"],
     "Eliminating common words like 'and', 'the' from text"),
]

rows = []
for topic, q_list in questions_data.items():
    for question, options, correct in q_list:
        rows.append({
            "topic": topic,
            "question": question,
            "options": options,
            "correct_answer": correct
        })

df_questions_mcq = pd.DataFrame(rows)
df_questions_mcq.to_csv("interview_questions_mcq.csv", index=False)

print(f"Dataset saved with {len(df_questions_mcq)} questions.")
print(df_questions_mcq.head())


Dataset saved with 80 questions.
                     topic                                           question  \
0  Artificial Intelligence  What is the difference between narrow AI and g...   
1  Artificial Intelligence                  What is the Turing Test used for?   
2  Artificial Intelligence  Which of the following is an example of reinfo...   
3  Artificial Intelligence                                   What is AI bias?   
4  Artificial Intelligence         Which is a common use of AI in healthcare?   

                                             options  \
0  [Narrow AI focuses on specific tasks, general ...   
1  [To measure the speed of AI models, To determi...   
2  [Teaching a robot to walk by rewarding progres...   
3  [Errors caused by biased training data, AI sys...   
4  [Diagnosing diseases from medical images, Encr...   

                                      correct_answer  
0  Narrow AI focuses on specific tasks, general A...  
1  To determine if a machine exhi