# Data Preprocessing
This is the first notebook used to clean and export a dataset into a JSON file.  

# Clean Data

In [4]:
import pandas as pd
import json
import difflib
from collections import defaultdict

df = pd.read_csv("resume_data.csv")


useful_cols = [
    "skills",
    "skills_required", 
    "start_dates",
    "end_dates",
    "positions", 
    "certification_skills"
]

df_useful = df[useful_cols]
df_useful = df[useful_cols].copy()
df_useful.head()

Unnamed: 0,skills,skills_required,start_dates,end_dates,positions,certification_skills
0,"['Big Data', 'Hadoop', 'Hive', 'Python', 'Mapr...",,['Nov 2019'],['Till Date'],['Big Data Analyst'],
1,"['Data Analysis', 'Data Analytics', 'Business ...",,['Sep 2019'],['Till Date'],['Business Analyst'],
2,"['Software Development', 'Machine Learning', '...",Brand Promotion\nCampaign Management\nField Su...,['June 2018'],['Till Date'],['Software Developer (Machine Learning Enginee...,
3,"['accounts payables', 'accounts receivables', ...",Fast typing skill\nIELTSInternet browsing & on...,"['January 2011', 'January 2008', 'January 2006...","['November 2015', 'January 2010', 'January 200...","['Accountant', 'Accounts Receivable Clerk', 'M...",
4,"['Analytical reasoning', 'Compliance testing k...",iOS\niOS App Developer\niOS Application Develo...,"['January 2016', 'January 2016', 'January 2012...","['Current', 'January 2016', 'January 2015', 'J...","['Staff Accountant', 'Senior Accountant', 'Tax...",[None]


In [25]:
# for second DF : 
import pandas as pd
import json
import difflib
from collections import defaultdict

df = pd.read_csv("merged_jobs.csv")
df.rename(columns={
    "title": "positions",
    "start_date": "start_dates",
    "end_date": "end_dates"
}, inplace=True)
df.drop(columns=["person_id"], inplace=True)
df.head()

df_useful = df

In [26]:
import ast

def ensure_list(x):
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(str(x))
    except:
        return [str(x)]

for col in ["start_dates", "end_dates", "positions"]:
    df_useful[col] = df_useful[col].apply(ensure_list)


In [27]:
import re
from datetime import datetime

current_year = datetime.now().year

def extract_year(text):
    if not text:
        return None
    text = str(text)
    text = re.sub(r'(?i)present', str(current_year), text)
    match = re.search(r"(19|20)\d{2}", text)
    if match:
        return int(match.group(0))
    return None

rows = []

for _, row in df_useful.iterrows():
    positions = row["positions"]
    starts = row["start_dates"]
    ends = row["end_dates"]
    skills = row["skills"]

    for i, position in enumerate(positions):
        start = starts[i] if i < len(starts) else None
        end = ends[i] if i < len(ends) else None

        rows.append({
            "position": position,
            "start_year": extract_year(start),
            "end_year": extract_year(end),
            "skills": skills
        })

df_useful_expanded = pd.DataFrame(rows)


# Convert the whole column to nullable integer
df_useful_expanded["start_year"] = df_useful_expanded["start_year"].astype("Int64")
df_useful_expanded["end_year"] = df_useful_expanded["end_year"].astype("Int64")
df_useful_expanded.head()


TypeError: object of type 'int' has no len()

In [17]:
df_useful_expanded = df_useful_expanded.dropna(subset=["skills"])

# Creating JSON

In [18]:
import json

# Group by start_year
year_groups = df_useful_expanded.groupby("start_year")

json_data = []

for year, group in year_groups:
    positions_list = []
    for _, row in group.iterrows():
        positions_list.append({
            "position": row["position"],
            "skills": row["skills"]
        })
    json_data.append({
        "year": int(year),
        "positions": positions_list
    })

# Optional: sort by year
json_data = sorted(json_data, key=lambda x: x["year"])


In [None]:
with open("career_by_year.json", "w") as f:
    json.dump(json_data, f, indent=4)


In [29]:
df_useful_expanded.to_csv("cleaned_jobs1.csv", index=False)