# Convert Offers JSON to CSV

In [2]:
import re
import json
import pandas as pd  # type: ignore
from pathlib import Path
from datetime import datetime

In [3]:
class JobOffersJsonToCsvConverter:
    """
    Converts a directory of job offer JSON files to a single CSV file with one-hot encoded skills.

    Every ``*.json`` file in ``data_dir`` is read as an object whose ``"data"``
    key holds a list of offer dicts. Each offer is flattened to one row, and the
    ``requiredSkills`` / ``niceToHaveSkills`` lists are replaced by 0/1 columns
    named ``requiredSkill_<name>`` / ``niceToHaveSkill_<name>``.

    Args:
        data_dir (str): Path to directory with JSON files.
        output_csv (str): Path to output CSV file.
    """

    # (source column, prefix of the generated one-hot columns)
    _SKILL_COLUMNS = (
        ("requiredSkills", "requiredSkill_"),
        ("niceToHaveSkills", "niceToHaveSkill_"),
    )

    def __init__(self, data_dir: str, output_csv: str):
        self.data_dir = Path(data_dir)
        self.output_csv = Path(output_csv)
        self.raw_data = []  # offer dicts collected by load_data()
        self.df = None  # pandas DataFrame built by to_dataframe()

    def run(self):
        """Load the JSON files, build the DataFrame and write the CSV.

        Nothing is written when the files contain no offers.
        """
        self.load_data()
        if not self.raw_data:
            return
        self.to_dataframe()
        self.save_to_csv()

    def load_data(self):
        """Read every ``*.json`` file in ``data_dir`` into ``self.raw_data``.

        Files are processed in sorted name order. ``self.raw_data`` is replaced,
        not appended to, so calling this method twice does not duplicate offers.

        Raises:
            FileNotFoundError: If ``data_dir`` is not an existing directory or
                contains no ``*.json`` files.
            ValueError: If a file's top-level JSON value is not an object.
        """
        if not self.data_dir.is_dir():
            raise FileNotFoundError(f"Data directory {self.data_dir} does not exist or is not a directory.")
        json_files = sorted(self.data_dir.glob("*.json"))
        if not json_files:
            raise FileNotFoundError(f"No JSON files found in {self.data_dir}")

        # Collect into a local list so a failure half-way through does not
        # leave self.raw_data partially filled.
        collected = []
        for file_path in json_files:
            with open(file_path, "r", encoding="utf-8") as f:
                content = json.load(f)
            if not isinstance(content, dict):
                raise ValueError(
                    f"Expected a JSON object with a 'data' key in {file_path}, "
                    f"got {type(content).__name__}."
                )
            # `or []` also covers an explicit `"data": null`.
            collected.extend(content.get("data") or [])
        self.raw_data = collected

    def to_dataframe(self):
        """Build ``self.df`` from ``self.raw_data`` with one-hot encoded skills."""
        records = [self._extract_record(offer) for offer in self.raw_data]
        df = pd.DataFrame(records)
        skill_set = self._extract_skills(df)
        self.df = self._one_hot_encode_skills(df, skill_set)

    def _extract_record(self, offer: dict) -> dict:
        """Flatten one offer dict into a single row.

        Salary fields are taken from the first ``employmentTypes`` entry. When
        that list is missing, ``None``, empty, or its first entry is not a dict,
        the salary fields are ``None`` instead of raising.
        """
        employment_types = offer.get("employmentTypes")
        first = None
        if isinstance(employment_types, list) and employment_types:
            first = employment_types[0]
        employment = first if isinstance(first, dict) else {}

        return {
            "slug": offer.get("slug"),
            "companyName": offer.get("companyName"),
            "title": offer.get("title"),
            "categoryId": offer.get("categoryId"),
            "requiredSkills": offer.get("requiredSkills"),
            "niceToHaveSkills": offer.get("niceToHaveSkills"),
            "workplaceType": offer.get("workplaceType"),
            "workingTime": offer.get("workingTime"),
            "experienceLevel": offer.get("experienceLevel"),
            "salaryFrom": employment.get("from"),
            "salaryTo": employment.get("to"),
            "salaryUnit": employment.get("unit"),
            "salaryGross": employment.get("gross"),
            "currency": employment.get("currency"),
            "employmentType": employment.get("type"),
            "city": offer.get("city"),
            "street": offer.get("street"),
            "latitude": offer.get("latitude"),
            "longitude": offer.get("longitude"),
            "multilocation": offer.get("multilocation"),
            "remoteInterview": offer.get("remoteInterview"),
            "openToHireUkrainians": offer.get("openToHireUkrainians"),
            "publishedAt": offer.get("publishedAt"),
        }

    def _extract_skills(self, df) -> set:
        """Return the set of all skills found in the two skill columns.

        Only list-valued cells contribute; missing values (and missing columns,
        e.g. for an empty frame) are skipped.
        """
        skills = set()
        for column, _prefix in self._SKILL_COLUMNS:
            if column not in df.columns:
                continue
            for value in df[column]:
                if isinstance(value, list):
                    skills.update(value)
        return skills

    def _sanitize(self, skill_name: str) -> str:
        """Strip the name and replace each run of non-word characters with ``_``."""
        return re.sub(r"[^\w]+", "_", skill_name.strip())

    @staticmethod
    def _has_any_skill(value, variants) -> int:
        """Return 1 if the list ``value`` contains any of ``variants``, else 0."""
        if not isinstance(value, list):
            return 0
        return int(any(variant in value for variant in variants))

    def _one_hot_encode_skills(self, df, skills):
        """Replace the two skill-list columns with 0/1 indicator columns.

        Skills whose sanitized names coincide (e.g. ``"C++"`` and ``"C#"`` both
        become ``C_``) share one column that is 1 when *any* of them is present,
        rather than the last one silently overwriting the others. Columns are
        emitted in sorted order of the sanitized name so the output layout is
        stable between runs.
        """
        # sanitized name -> raw skill names that map to it
        variants_by_name = {}
        for skill in skills:
            variants_by_name.setdefault(self._sanitize(skill), []).append(skill)

        new_cols = {}
        for name in sorted(variants_by_name):
            variants = variants_by_name[name]
            for source_col, prefix in self._SKILL_COLUMNS:
                new_cols[f"{prefix}{name}"] = df[source_col].apply(
                    self._has_any_skill, args=(variants,)
                )

        if new_cols:
            df = pd.concat([df, pd.DataFrame(new_cols)], axis=1)
        return df.drop(columns=["requiredSkills", "niceToHaveSkills"], errors="ignore")

    def save_to_csv(self):
        """Write ``self.df`` to ``output_csv``, creating parent directories.

        Raises:
            RuntimeError: If no DataFrame has been built yet.
        """
        if self.df is None:
            raise RuntimeError("No DataFrame to save; call to_dataframe() first.")
        self.output_csv.parent.mkdir(parents=True, exist_ok=True)
        self.df.to_csv(self.output_csv, index=False)

In [4]:
# Stamp the output file name with the current local date.
today_str = datetime.now().strftime("%Y-%m-%d")
output_path = f"../data/processed/{today_str}-offers.csv"

# Read the raw snapshot directory and write the flattened CSV in one go.
converter = JobOffersJsonToCsvConverter("../data/raw/2025-06-24", output_path)
converter.run()