In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Report which interpreter backs this kernel, then confirm the imports above ran.
print(sys.executable, "Environment OK", sep="\n")

In [None]:
from pathlib import Path
import pandas as pd

# Project layout: the kernel's working directory is expected to be
# <project>/notebooks, so the project root is one level up.
PROJECT_ROOT = Path.cwd().parent  # assumes you're inside /notebooks
RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")

# Collect every CSV in the raw-data folder, in sorted path order.
candidates = sorted(RAW_DIR.glob("*.csv"))
if len(candidates) == 0:
    raise FileNotFoundError(
        f"No CSV found in {RAW_DIR}. Put your Kaggle CSV there."
    )

print("Found CSV files:")
for position, candidate in enumerate(candidates):
    print(f"{position}: {candidate.name}")

# The first file in sorted order is used by default; change the index to pick another.
csv_path = candidates[0]
print("\nUsing:", csv_path.name)

df = pd.read_csv(csv_path)
df.head(3)

In [None]:
# Structural overview of the raw frame: dimensions, column names,
# and a reproducible random sample of three rows.
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist(), sep="\n")
df.sample(n=3, random_state=42)

In [None]:
# Lowercased column names, position-aligned with df.columns.
cols = [name.lower() for name in df.columns]


def _likely_columns(*keywords):
    """Return the original column names whose lowercased form contains any keyword."""
    return [
        original
        for original, lowered in zip(df.columns, cols)
        if any(keyword in lowered for keyword in keywords)
    ]


# Substring heuristics to spot the columns needed for the core table.
print("\nLikely title columns:", _likely_columns("title"))
print("Likely description columns:", _likely_columns("desc", "description"))
print("Likely location columns:", _likely_columns("loc", "city", "country"))
print("Likely date columns:", _likely_columns("date", "posted", "time"))

In [None]:
# Keep only the four columns used downstream, on an independent copy so
# later edits do not touch the raw frame; `listed_time` becomes `date`.
core_columns = ["title", "description", "location", "listed_time"]
df_core = df[core_columns].copy()
df_core = df_core.rename(columns={"listed_time": "date"})

df_core.head(3)

In [None]:
# Interpret the values as milliseconds since the Unix epoch; anything that
# cannot be converted becomes NaT (errors="coerce") instead of raising.
df_core["date"] = pd.to_datetime(df_core["date"], unit="ms", errors="coerce")

# Jupyter only renders the LAST expression of a cell, so the preview has to be
# shown explicitly. `display` is provided by the IPython kernel (no import needed).
display(df_core["date"].head())

# Observed date range (earliest, latest).
df_core["date"].min(), df_core["date"].max()

In [None]:
# Minimum description length, in characters of the raw (uncleaned) text.
MIN_DESCRIPTION_CHARS = 100

# NOTE: this cell rebinds `df_core`, so re-running it reports the already
# filtered row count as "before".
initial_rows = len(df_core)

# Drop rows missing any of title/description/date, then rows whose
# description is shorter than the threshold.
df_core = df_core.dropna(subset=["title", "description", "date"])
df_core = df_core[df_core["description"].str.len() >= MIN_DESCRIPTION_CHARS]

final_rows = len(df_core)

print(f"Rows before: {initial_rows}")
print(f"Rows after : {final_rows}")
if initial_rows:
    print(f"Kept {final_rows / initial_rows:.1%} of data")
else:
    # An empty input frame would otherwise raise ZeroDivisionError here.
    print("Kept n/a of data (no rows before filtering)")

In [None]:
import re

def clean_text(text: str) -> str:
    """Normalise a text field for downstream analysis.

    The text is lowercased, every ``<...>`` tag-like span is replaced by a
    single space, and each run of whitespace is collapsed to one space with
    leading/trailing whitespace removed.

    Args:
        text: Raw string (must be a ``str``; missing values are not handled).

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    without_tags = re.sub(r"<[^>]+>", " ", lowered)  # remove HTML
    collapsed = re.sub(r"\s+", " ", without_tags)    # normalize spaces
    return collapsed.strip()

# Add a normalised companion column (<name>_clean) for each free-text field.
for text_column in ("title", "description"):
    df_core[f"{text_column}_clean"] = df_core[text_column].apply(clean_text)

df_core[["title_clean", "description_clean"]].head(2)

In [None]:
# Dtypes and non-null counts first (printed by .info()), then a reproducible
# three-row random sample as the cell's displayed result.
df_core.info()
df_core.sample(n=3, random_state=42)

In [None]:
from pathlib import Path
# Persist the cleaned table under <project>/data/processed, creating the
# folder (and any missing parents) on first run.
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

out_path = PROCESSED_DIR.joinpath("job_postings_clean.parquet")
# index=False: the row index is not written to the file.
df_core.to_parquet(out_path, index=False)

print("Saved to:", out_path)

In [None]:
# Time-indexed copy of the cleaned table, ordered chronologically;
# `df_core` itself is left untouched.
df_time = df_core.copy().set_index("date").sort_index()

df_time.head(3)

In [None]:
import pandas as pd

# Midpoint of the observed date range: rows on or before it are labelled
# "early_period", the rest "late_period".
first_date, last_date = df_time.index.min(), df_time.index.max()
split_date = first_date + (last_date - first_date) / 2

# One vectorised comparison over the whole index instead of a Python-level
# loop over every timestamp.
df_time["period"] = np.where(
    df_time.index <= split_date, "early_period", "late_period"
)

print("Split date:", split_date)

# Jupyter only renders the LAST expression of a cell, so intermediate results
# must be shown explicitly. `display` is provided by the IPython kernel.
display(df_time["period"].value_counts())

# Earliest and latest posting date within each period. Aggregating the date
# column directly avoids `groupby(...).apply` on the whole frame, whose
# handling of the grouping column is deprecated in pandas 2.2.
df_time.reset_index().groupby("period")["date"].agg(["min", "max"])

In [None]:
import matplotlib.pyplot as plt

# Number of postings per calendar month, labelled by month end.
# NOTE: pandas >= 2.2 deprecates the "M" alias in favour of "ME"; "M" is kept
# here because "ME" is rejected by older pandas versions.
monthly_counts = (
    df_time
    .resample("M")
    .size()
    .rename("num_postings")
)

# Jupyter only renders the LAST expression of a cell, so the preview has to be
# shown explicitly. `display` is provided by the IPython kernel.
display(monthly_counts.head())

# Explicit figure/axes rather than the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(10, 4))
monthly_counts.plot(ax=ax)
ax.set_title("Job Postings Over Time (Monthly)")
ax.set_ylabel("Number of postings")
ax.set_xlabel("Date")
fig.tight_layout()
plt.show()

In [None]:
# Summarise the span covered by the monthly series: first and last bucket
# labels and the number of monthly buckets.
first_month = monthly_counts.index.min()
last_month = monthly_counts.index.max()

print("Start date:", first_month.date())
print("End date  :", last_month.date())
print("Total months:", len(monthly_counts))

In [None]:
# Number of postings per calendar year of the posting date.
yearly_counts = df_time.groupby(df_time.index.year).size()

# Jupyter only renders the LAST expression of a cell, so the table has to be
# shown explicitly. `display` is provided by the IPython kernel.
display(yearly_counts)

# Explicit figure/axes rather than the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(6, 4))
yearly_counts.plot(kind="bar", ax=ax)
ax.set_title("Job Postings by Year")
ax.set_ylabel("Number of postings")
ax.set_xlabel("Year")
fig.tight_layout()
plt.show()

In [None]:
# The ten most frequent cleaned titles (value_counts sorts by descending
# frequency); only the title labels are kept, not the counts.
title_frequencies = df_time["title_clean"].value_counts()
top_titles = title_frequencies.head(10).index

top_titles

In [None]:
# Monthly posting counts for each of the top titles: one row per month,
# one column per title, zero where a title has no postings that month.
# NOTE: pandas >= 2.2 deprecates the "M" alias in favour of "ME"; "M" is kept
# here because "ME" is rejected by older pandas versions.
title_trends = (
    df_time[df_time["title_clean"].isin(top_titles)]
    .groupby([pd.Grouper(freq="M"), "title_clean"])
    .size()
    .unstack(fill_value=0)
)

# Jupyter only renders the LAST expression of a cell, so the preview has to be
# shown explicitly. `display` is provided by the IPython kernel.
display(title_trends.head())

# Explicit figure/axes rather than the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(12, 5))
title_trends.plot(ax=ax)
ax.set_title("Monthly Trends for Top Job Titles")
ax.set_ylabel("Number of postings")
ax.set_xlabel("Date")
# Anchor the legend just outside the right edge so it does not cover the lines.
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
fig.tight_layout()
plt.show()

The dataset covers a short but dense time window from late March to April 2024.
While this limits long-term trend analysis, it is well suited to cross-sectional analysis and short-term signal extraction.
Posting activity remains consistent across the window, suggesting stable data ingestion rather than sporadic scraping.
This scope supports semantic clustering of roles and skills, as well as comparative analysis between sub-periods and locations.