In [102]:
import os
import kagglehub
import pandas as pd
from sklearn.utils import shuffle

In [103]:
# Source datasets (both are Kaggle resume collections, read from local CSV copies)

# Dataset_01: https://www.kaggle.com/datasets/gauravduttakiit/resume-dataset/
dataset_01 = pd.read_csv(
    '/content/gauravduttakiit-resume-dataset.csv',
)

# Dataset_02: https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset
# Parsed with the Python engine; lines the parser rejects are skipped rather
# than aborting the load.
dataset_02 = pd.read_csv(
    '/content/snehaanbhawal-resume-dataset.csv',
    engine='python',
    on_bad_lines='skip',
)

In [104]:
# Delete unnecessary columns.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns were already removed.
dataset_02 = dataset_02.drop(columns=["ID", "Resume_html"], errors="ignore")

In [105]:
# Preview the first five rows of dataset_01
dataset_01.head(n=5)

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [106]:
# Distinct category labels in dataset_01, in order of first appearance
categories = pd.unique(dataset_01['Category'])
print(
    "Number of unique categories in Dataset - 01:",
    len(categories),
)
print(categories)

Number of unique categories in Dataset - 01: 25
['Data Science' 'HR' 'Advocate' 'Arts' 'Web Designing'
 'Mechanical Engineer' 'Sales' 'Health and fitness' 'Civil Engineer'
 'Java Developer' 'Business Analyst' 'SAP Developer' 'Automation Testing'
 'Electrical Engineering' 'Operations Manager' 'Python Developer'
 'DevOps Engineer' 'Network Security Engineer' 'PMO' 'Database' 'Hadoop'
 'ETL Developer' 'DotNet Developer' 'Blockchain' 'Testing']


In [107]:
# Bring dataset_02 to a two-column (Category, Resume) layout:
# rename "Resume_str" to "Resume", then select the columns in that order.
dataset_02 = (
    dataset_02
    .rename(columns={"Resume_str": "Resume"})
    [["Category", "Resume"]]
)

# Abbreviations that must stay fully upper-case instead of being title-cased
keep_upper = ["HR", "PMO", "BPO"]


def format_category(cat):
    """Normalise the casing of a single category label.

    Surrounding whitespace is stripped. If the upper-cased label is one of
    the abbreviations in ``keep_upper`` it is returned in upper case;
    otherwise the stripped label is returned in title case
    (e.g. ``"INFORMATION-TECHNOLOGY"`` -> ``"Information-Technology"``).

    Parameters
    ----------
    cat : str or missing value
        Raw category label.

    Returns
    -------
    The formatted label for string input. Non-string input (e.g. ``None``
    or ``NaN`` for a missing label) is returned unchanged so that it does
    not raise here and can be handled by downstream null handling.
    """
    # Missing labels are not strings and have no .strip(); pass them through.
    if not isinstance(cat, str):
        return cat

    stripped = cat.strip()
    upper = stripped.upper()
    return upper if upper in keep_upper else stripped.title()

# Normalise every label in the Category column element by element
dataset_02["Category"] = dataset_02["Category"].map(format_category)

In [108]:
# Preview the first five rows of dataset_02
dataset_02.head(n=5)

Unnamed: 0,Category,Resume
0,HR,HR ADMINISTRATOR/MARKETING ASSOCIATE\...
1,HR,"HR SPECIALIST, US HR OPERATIONS ..."
2,HR,HR DIRECTOR Summary Over 2...
3,HR,HR SPECIALIST Summary Dedica...
4,HR,HR MANAGER Skill Highlights ...


In [109]:
# Distinct category labels in dataset_02, in order of first appearance
categories = pd.unique(dataset_02['Category'])
print(
    "Number of unique categories in Dataset - 02:",
    len(categories),
)
print(categories)

Number of unique categories in Dataset - 02: 24
['HR' 'Designer' 'Information-Technology' 'Teacher' 'Advocate'
 'Business-Development' 'Healthcare' 'Fitness' 'Agriculture' 'BPO' 'Sales'
 'Consultant' 'Digital-Media' 'Automobile' 'Chef' 'Finance' 'Apparel'
 'Engineering' 'Accountant' 'Construction' 'Public-Relations' 'Banking'
 'Arts' 'Aviation']


In [110]:
# Build the combined dataset in one pipeline:
#   1. stack both datasets under a fresh 0..n-1 index,
#   2. drop every row that contains a null value,
#   3. drop rows that are exact duplicates of an earlier row.
combined_dataset = (
    pd.concat([dataset_01, dataset_02], ignore_index=True)
    .dropna()
    .drop_duplicates()
)

# Shuffle with a fixed seed so the row order is reproducible, then discard
# the pre-shuffle index.
combined_dataset = shuffle(combined_dataset, random_state=42).reset_index(drop=True)

combined_dataset.head()

Unnamed: 0,Category,Resume
0,Finance,FINANCE ACCOUNTANT Summary S...
1,Arts,ASSISTANT PRINCIPAL Summary ...
2,Business-Development,BUSINESS DEVELOPMENT DIRECTOR S...
3,Apparel,"KEYHOLDER Summary Creative, ..."
4,Fitness,GENERAL MANAGER Summary ...


In [111]:
# Distinct category labels in the combined dataset, in order of first appearance
categories = pd.unique(combined_dataset['Category'])
print(
    "Number of unique categories in combined dataset:",
    len(categories),
)
print(categories)

Number of unique categories in combined dataset: 45
['Finance' 'Arts' 'Business-Development' 'Apparel' 'Fitness' 'Healthcare'
 'Digital-Media' 'Sales' 'Engineering' 'BPO' 'Teacher' 'Java Developer'
 'Chef' 'Public-Relations' 'HR' 'Information-Technology' 'Designer'
 'Automobile' 'Advocate' 'Database' 'Accountant' 'Banking' 'Consultant'
 'Construction' 'Mechanical Engineer' 'Civil Engineer' 'Aviation'
 'Agriculture' 'Automation Testing' 'Python Developer' 'DevOps Engineer'
 'Health and fitness' 'Electrical Engineering' 'Testing'
 'Operations Manager' 'Hadoop' 'Blockchain' 'DotNet Developer'
 'Business Analyst' 'PMO' 'SAP Developer' 'Network Security Engineer'
 'Data Science' 'Web Designing' 'ETL Developer']


In [113]:
# Write the combined dataset to CSV (row index omitted) and confirm
combined_dataset.to_csv(
    path_or_buf="aggregated_resume_data.csv",
    index=False,
)
print("Saved Dataset!")

Saved Dataset!
