In [1]:
import pandas as pd
import re
import json

# Load the raw Adzuna job postings from the CSV export
file_path = 'adzuna_data_related_jobs.csv'
df = pd.read_csv(file_path)

# Steps 1-2: build the working frame in one chain, leaving `df` untouched.
#   - drop 'Company Size' (noted by the notebook author as entirely NaN)
#   - rename 'Required Language' to 'Description'
df_cleaned = (
    df
    .drop(columns=['Company Size'])
    .rename(columns={'Required Language': 'Description'})
)

# Step 3: Define functions to parse the 'Description' field
def extract_language_requirements(description):
    """Return the human languages mentioned in ``description``.

    Matching is case-insensitive and restricted to whole words. Each language
    is reported once, capitalised (e.g. "English"), in order of first
    appearance, so the result is the same on every run.

    Args:
        description: Job description text (a ``str``).

    Returns:
        A ', '-joined string of language names, or 'N/A' when none is found.
    """
    # Look for human language mentions (e.g., English, French, etc.)
    language_keywords = re.findall(r'\b(English|French|German|Spanish|Chinese|Dutch|Portuguese|Italian|Japanese|Korean|Russian|Hindi)\b', description, re.IGNORECASE)
    # Normalise case before de-duplicating: a plain set() would keep "English"
    # and "english" as separate entries and would iterate in an order that can
    # change between interpreter runs. dict.fromkeys keeps first-seen order.
    unique_languages = dict.fromkeys(keyword.capitalize() for keyword in language_keywords)
    return ', '.join(unique_languages) if unique_languages else 'N/A'

def extract_company_info(description):
    """Flag descriptions that contain company-related wording.

    Performs a case-insensitive substring check for 'company', 'startup' or
    'global'.

    Args:
        description: Job description text (a ``str``).

    Returns:
        "Details found in description" if any marker occurs, otherwise 'N/A'.
    """
    lowered = description.lower()
    markers = ('company', 'startup', 'global')
    found_marker = any(marker in lowered for marker in markers)
    return "Details found in description" if found_marker else 'N/A'

def extract_soft_skills(description):
    """Return the soft skills mentioned in ``description``.

    Matching is case-insensitive and restricted to whole words/phrases. Each
    skill is reported once, in lowercase, in order of first appearance, so the
    result is the same on every run.

    Args:
        description: Job description text (a ``str``).

    Returns:
        A ', '-joined string of skill names, or 'N/A' when none is found.
    """
    # Look for common soft skills (e.g., communication, leadership, teamwork, etc.)
    soft_skills_keywords = re.findall(r'\b(communication|leadership|teamwork|collaboration|problem-solving|creativity|adaptability|flexibility|time management|empathy|negotiation|critical thinking|conflict resolution|interpersonal skills|decision making|work ethic|stress management)\b', description, re.IGNORECASE)
    # Lowercase before de-duplicating so "Leadership" and "leadership" collapse
    # into one entry; dict.fromkeys (unlike set) preserves first-seen order.
    unique_skills = dict.fromkeys(keyword.lower() for keyword in soft_skills_keywords)
    return ', '.join(unique_skills) if unique_skills else 'N/A'

def extract_data_skills(description):
    """Return the data/tech skills mentioned in ``description``.

    Matching is case-insensitive. Each skill is reported once, spelled as in
    the keyword list below, in order of first appearance, so the result is the
    same on every run.

    Args:
        description: Job description text (a ``str``).

    Returns:
        A ', '-joined string of skill names, or 'N/A' when none is found.
    """
    # Look for data-related skills (e.g., Python, Spark, SQL, etc.)
    # NOTE(review): 'R' is matched case-insensitively as a standalone token, so
    # text such as "R&D" also counts as a hit -- confirm this is acceptable.
    skills = (
        'Python', 'Java', 'C++', 'JavaScript', 'SQL', 'R', 'Scala', 'AWS',
        'Azure', 'GCP', 'Spark', 'Hadoop', 'Tableau', 'Power BI', 'Excel',
        'Pandas', 'NumPy', 'TensorFlow', 'PyTorch', 'Keras', 'MATLAB',
        'Snowflake', 'Docker', 'Kubernetes',
    )
    canonical = {skill.lower(): skill for skill in skills}
    # A trailing \b is only meaningful after a word character. 'C\+\+\b' cannot
    # match "C++" followed by a space, punctuation or end of text, because
    # there is no word boundary after '+'. So the trailing boundary is added
    # only to keywords that end in a letter or digit.
    alternatives = '|'.join(
        re.escape(skill) + (r'\b' if skill[-1].isalnum() else '')
        for skill in skills
    )
    hits = re.findall(r'\b(?:' + alternatives + ')', description, re.IGNORECASE)
    # Map each hit to its canonical spelling and de-duplicate in first-seen
    # order; .get() falls back to the raw hit for unusual Unicode case folds.
    unique_skills = dict.fromkeys(canonical.get(hit.lower(), hit) for hit in hits)
    return ', '.join(unique_skills) if unique_skills else 'N/A'

# Step 4: Remove exact duplicate rows, then rebuild a contiguous 0..n-1 index
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned = df_cleaned.reset_index(drop=True)

# Step 5: Keep only the first occurrence of any repeated column name
repeated_column_mask = df_cleaned.columns.duplicated()
df_cleaned = df_cleaned.loc[:, ~repeated_column_mask]

# Step 6: Derive one new column per extractor from the 'Description' text.
# Each value is passed through str() first, so missing descriptions reach the
# extractors as text rather than as float NaN.
extractors = {
    'Language Requirements': extract_language_requirements,
    'Company Info': extract_company_info,
    'Soft Skills': extract_soft_skills,
    'Data Skills': extract_data_skills,
}
for column_name, extractor in extractors.items():
    df_cleaned[column_name] = df_cleaned['Description'].apply(
        lambda desc, fn=extractor: fn(str(desc))
    )

# Step 7: Write the cleaned dataset to a new CSV file (without the index)
df_cleaned.to_csv('adzuna_data_cleaned.csv', index=False)

# Step 8: Record the cleaning steps in a JSON file.
# Steps are numbered from 1 in the order listed here.
step_log = [
    ("Drop column", "Dropped 'Company Size' column which is entirely NaN"),
    ("Rename column", "Renamed 'Required Language' to 'Description'"),
    ("Define extraction functions", "Defined functions to extract language requirements, company info, soft skills, and data skills from the 'Description'"),
    ("Drop duplicates", "Dropped duplicate rows and reset index"),
    ("Ensure unique columns", "Ensured all column names are unique"),
    ("Apply extraction functions", "Applied extraction functions to 'Description' column to create new columns for 'Language Requirements', 'Company Info', 'Soft Skills', and 'Data Skills'"),
    ("Save CSV", "Saved the cleaned dataset to a new CSV file 'adzuna_data_cleaned.csv'"),
]
operations = [
    {"step": number, "action": action, "details": details}
    for number, (action, details) in enumerate(step_log, start=1)
]

with open('adzuna_cleaning_operations.json', 'w') as log_file:
    json.dump(operations, log_file, indent=4)

# Step 9: Show the first rows of the cleaned dataset
print(df_cleaned.head())

  Country                      Job Title         Company  \
0      fr           Head of Data Science         Aircall   
1      fr  Cloud Engineer - Data Science         Lenstra   
2      fr       Head Of Data Science H/F       HelloWork   
3      fr       Head Of Data Science F/H  RFC Consulting   
4      fr       Head Of Data Science H/F       HelloWork   

               Industry Job Type Remote Type  Salary Min  Salary Max  \
0  Emplois Informatique      NaN         NaN    125000.0    150000.0   
1               Unknown      NaN   full_time     45000.0     55000.0   
2  Emplois Informatique      NaN         NaN    100000.0    125000.0   
3  Emplois Informatique      NaN         NaN    100000.0    125000.0   
4               Unknown      NaN   full_time     70000.0     70000.0   

   Experience             Post Date  \
0         NaN  2024-07-19T08:35:28Z   
1         NaN  2024-07-30T23:53:58Z   
2         NaN  2024-10-11T11:42:53Z   
3         NaN  2023-10-22T15:49:17Z   
4         N

In [3]:
import pandas as pd
import re
import json

# Load the raw Adzuna job postings from the CSV export
file_path = 'adzuna_data_related_jobs.csv'
df = pd.read_csv(file_path)

# Step 1: Drop 'Company Size' (noted by the notebook author as entirely NaN)
without_company_size = df.drop(columns=['Company Size'])

# Step 2: Rename 'Required Language' to 'Description'; returns a new frame
# instead of mutating in place
df_cleaned = without_company_size.rename(columns={'Required Language': 'Description'})

# Step 3: Define functions to parse the 'Description' field
def extract_language_requirements(description):
    """Return the human languages mentioned in ``description``.

    Matching is case-insensitive and restricted to whole words. Each language
    is reported once, capitalised (e.g. "English"), in order of first
    appearance, so the result is the same on every run.

    Args:
        description: Job description text (a ``str``).

    Returns:
        A ', '-joined string of language names, or 'N/A' when none is found.
    """
    # Look for human language mentions (e.g., English, French, etc.)
    language_keywords = re.findall(r'\b(English|French|German|Spanish|Chinese|Dutch|Portuguese|Italian|Japanese|Korean|Russian|Hindi)\b', description, re.IGNORECASE)
    # Normalise case before de-duplicating: a plain set() would keep "English"
    # and "english" as separate entries and would iterate in an order that can
    # change between interpreter runs. dict.fromkeys keeps first-seen order.
    unique_languages = dict.fromkeys(keyword.capitalize() for keyword in language_keywords)
    return ', '.join(unique_languages) if unique_languages else 'N/A'

def extract_company_info(description):
    """Report whether the description contains company-related wording.

    The text is lowercased once and scanned for the substrings 'company',
    'startup' and 'global'.

    Args:
        description: Job description text (a ``str``).

    Returns:
        "Details found in description" on the first marker found, else 'N/A'.
    """
    text = description.lower()
    for marker in ('company', 'startup', 'global'):
        if marker in text:
            return "Details found in description"
    return 'N/A'

def extract_soft_skills(description):
    """Return the soft skills mentioned in ``description``.

    Matching is case-insensitive and restricted to whole words/phrases. Each
    skill is reported once, in lowercase, in order of first appearance, so the
    result is the same on every run.

    Args:
        description: Job description text (a ``str``).

    Returns:
        A ', '-joined string of skill names, or 'N/A' when none is found.
    """
    # Look for common soft skills (e.g., communication, leadership, teamwork, etc.)
    soft_skills_keywords = re.findall(r'\b(communication|leadership|teamwork|collaboration|problem-solving|creativity|adaptability|flexibility|time management|empathy|negotiation|critical thinking|conflict resolution|interpersonal skills|decision making|work ethic|stress management)\b', description, re.IGNORECASE)
    # Lowercase before de-duplicating so "Leadership" and "leadership" collapse
    # into one entry; dict.fromkeys (unlike set) preserves first-seen order.
    unique_skills = dict.fromkeys(keyword.lower() for keyword in soft_skills_keywords)
    return ', '.join(unique_skills) if unique_skills else 'N/A'

def extract_data_skills(description):
    """Return the data/tech skills mentioned in ``description``.

    Matching is case-insensitive. Each skill is reported once, spelled as in
    the keyword list below, in order of first appearance, so the result is the
    same on every run.

    Args:
        description: Job description text (a ``str``).

    Returns:
        A ', '-joined string of skill names, or 'N/A' when none is found.
    """
    # Look for data-related skills (e.g., Python, Spark, SQL, etc.)
    # NOTE(review): several keywords double as ordinary words ('R', 'Excel',
    # 'Chef', 'Shell', 'Unity', ...) and are matched case-insensitively, so
    # false positives are possible -- confirm this is acceptable.
    skills = (
        'Python', 'Java', 'C++', 'JavaScript', 'SQL', 'R', 'Scala', 'AWS',
        'Azure', 'GCP', 'Spark', 'Hadoop', 'Tableau', 'Power BI', 'Excel',
        'Pandas', 'NumPy', 'TensorFlow', 'PyTorch', 'Keras', 'MATLAB',
        'Snowflake', 'Docker', 'Kubernetes', 'Flask', 'Django', 'Airflow',
        'Git', 'C#', 'HTML', 'CSS', 'NoSQL', 'MongoDB', 'PostgreSQL', 'MySQL',
        'Linux', 'Bash', 'Shell', 'Jupyter', 'Ansible', 'Chef', 'Puppet',
        'REST API', 'GraphQL', 'SAS', 'SPSS', 'Elasticsearch', 'Kafka',
        'RabbitMQ', 'Jenkins', 'Terraform', 'Apache', 'Nginx', 'Vagrant',
        'OpenStack', 'Unity', 'Unreal Engine', 'NLTK', 'OpenCV', 'FastAPI',
        'H2O', 'XGBoost', 'LightGBM', 'CatBoost', 'Redis', 'Celery', 'PySpark',
        'Prometheus', 'Grafana', 'Jira', 'Confluence', 'Figma', 'Zeplin',
    )  # 'Scala' appeared twice in the original alternation; listed once here
    canonical = {skill.lower(): skill for skill in skills}
    # A trailing \b is only meaningful after a word character. 'C\+\+\b' and
    # 'C#\b' cannot match "C++" / "C#" followed by a space, punctuation or end
    # of text, because there is no word boundary after '+' or '#'. So the
    # trailing boundary is added only to keywords ending in a letter or digit.
    alternatives = '|'.join(
        re.escape(skill) + (r'\b' if skill[-1].isalnum() else '')
        for skill in skills
    )
    hits = re.findall(r'\b(?:' + alternatives + ')', description, re.IGNORECASE)
    # Map each hit to its canonical spelling and de-duplicate in first-seen
    # order; .get() falls back to the raw hit for unusual Unicode case folds.
    unique_skills = dict.fromkeys(canonical.get(hit.lower(), hit) for hit in hits)
    return ', '.join(unique_skills) if unique_skills else 'N/A'

# Step 4: Remove exact duplicate rows, then rebuild a contiguous 0..n-1 index
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned = df_cleaned.reset_index(drop=True)

# Step 5: Keep only the first occurrence of any repeated column name
repeated_column_mask = df_cleaned.columns.duplicated()
df_cleaned = df_cleaned.loc[:, ~repeated_column_mask]

# Step 6: Derive one new column per extractor from the 'Description' text.
# Each value is passed through str() first, so missing descriptions reach the
# extractors as text rather than as float NaN.
extractors = {
    'Language Requirements': extract_language_requirements,
    'Company Info': extract_company_info,
    'Soft Skills': extract_soft_skills,
    'Data Skills': extract_data_skills,
}
for column_name, extractor in extractors.items():
    df_cleaned[column_name] = df_cleaned['Description'].apply(
        lambda desc, fn=extractor: fn(str(desc))
    )

# Step 7: Drop fully duplicated rows again now that the derived columns exist
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned = df_cleaned.reset_index(drop=True)

# Step 8: Write the cleaned dataset to a new CSV file (without the index)
df_cleaned.to_csv('adzuna_data_cleaned.csv', index=False)

# Step 9: Record the cleaning steps in a JSON file.
# Steps are numbered from 1 in the order listed here.
step_log = [
    ("Drop column", "Dropped 'Company Size' column which is entirely NaN"),
    ("Rename column", "Renamed 'Required Language' to 'Description'"),
    ("Define extraction functions", "Defined functions to extract language requirements, company info, soft skills, and data skills from the 'Description'"),
    ("Drop duplicates", "Dropped duplicate rows and reset index"),
    ("Ensure unique columns", "Ensured all column names are unique"),
    ("Apply extraction functions", "Applied extraction functions to 'Description' column to create new columns for 'Language Requirements', 'Company Info', 'Soft Skills', and 'Data Skills'"),
    ("Drop duplicates after extraction", "Dropped completely duplicate rows and reset index again after extraction"),
    ("Save CSV", "Saved the cleaned dataset to a new CSV file 'adzuna_data_cleaned.csv'"),
]
operations = [
    {"step": number, "action": action, "details": details}
    for number, (action, details) in enumerate(step_log, start=1)
]

with open('adzuna_cleaning_operations.json', 'w') as log_file:
    json.dump(operations, log_file, indent=4)

# Step 10: Show the first rows of the cleaned dataset
print(df_cleaned.head())

  Country                      Job Title         Company  \
0      fr           Head of Data Science         Aircall   
1      fr  Cloud Engineer - Data Science         Lenstra   
2      fr       Head Of Data Science H/F       HelloWork   
3      fr       Head Of Data Science F/H  RFC Consulting   
4      fr       Head Of Data Science H/F       HelloWork   

               Industry Job Type Remote Type  Salary Min  Salary Max  \
0  Emplois Informatique      NaN         NaN    125000.0    150000.0   
1               Unknown      NaN   full_time     45000.0     55000.0   
2  Emplois Informatique      NaN         NaN    100000.0    125000.0   
3  Emplois Informatique      NaN         NaN    100000.0    125000.0   
4               Unknown      NaN   full_time     70000.0     70000.0   

   Experience             Post Date  \
0         NaN  2024-07-19T08:35:28Z   
1         NaN  2024-07-30T23:53:58Z   
2         NaN  2024-10-11T11:42:53Z   
3         NaN  2023-10-22T15:49:17Z   
4         N