In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Display the installed pandas version so the run can be reproduced later.
pd.__version__


In [None]:
# Load the raw job postings; the path is relative to the notebook's working directory.
df = pd.read_csv("ai_job_dataset.csv")

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# A row counts as a duplicate when every column matches an earlier row.
duplicate_mask = df.duplicated()
print("number of duplicate rows", duplicate_mask.sum())

In [None]:
# Missing-value count per column, taken before any type coercion below.
print(df.isna().sum())

In [None]:
# Parse both date columns; entries that cannot be parsed become NaT instead of raising.
for date_col in ('posting_date', 'application_deadline'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [None]:
# Force the numeric columns to numeric dtype; non-numeric entries become NaN instead of raising.
for numeric_col in ('salary_usd', 'years_experience'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [None]:
# Lookup tables translating the dataset's coded values into readable labels.
experience_map = dict(
    EN='Entry-level',
    MI='Mid-level',
    SE='Senior-level',
    EX='Executive',
)
employment_type_map = dict(
    FT='Full-Time',
    PT='Part-Time',
    CT='Contract',
    FL='Freelance',
)
remote_map = {0: 'Onsite', 50: 'Hybrid', 100: 'Remote'}

# Relabel the two coded columns in place. `replace` leaves any value that is
# not a key of the lookup table unchanged.
for coded_col, labels in (
    ('experience_level', experience_map),
    ('employment_type', employment_type_map),
):
    df[coded_col] = df[coded_col].replace(labels)

# Keep the numeric remote_ratio as-is and add a labelled companion column.
df['work_mode'] = df['remote_ratio'].replace(remote_map)

In [None]:
# Spot-check the relabelled columns. work_mode is shown next to remote_ratio
# so the Onsite/Hybrid/Remote derivation can be verified against its source.
df[['experience_level', 'employment_type', 'remote_ratio', 'work_mode']].drop_duplicates().head(10)

In [None]:
# Confirm the dtypes after the datetime/numeric coercion and relabelling steps.
print(df.dtypes)

In [None]:
# Screen the numeric columns with IQR fences (1.5 * IQR beyond Q1 / Q3).
# Filtering is sequential: the fences for the second column are computed on
# the rows that survived the first column's filter.
columns_to_check = ['salary_usd', 'years_experience']
for col in columns_to_check:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"Column: {col} | Outliers found: {outliers.shape[0]}")
    # NaN compares False against both fences, so rows with a missing value are
    # removed by the filter below without being counted as outliers. Report
    # them so the extra row loss is visible.
    missing_count = int(df[col].isna().sum())
    if missing_count:
        print(f"Column: {col} | Rows dropped for missing values: {missing_count}")
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
# Report the final shape once, after every listed column has been filtered.
print( "shape after rempving salary and experience outliers:" , df.shape)

In [None]:
# Summary statistics of the frame after outlier filtering.
df.describe()

In [None]:
# Per-column dtype and non-null count of the filtered frame.
df.info()

In [None]:
# Missing-value count per column after coercion and outlier filtering.
df.isna().sum()

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv("ai_jobs_clean.csv", index=False)

In [None]:
# Hand-written descriptions keyed by column name. Columns of df that have no
# entry here receive a generic fallback description below; keys that do not
# match any column of df are simply unused.
column_descriptions = {
    'job_title': 'Job title related to AI and data roles',
    'salary_usd': 'Annual salary expressed in USD',
    'country': 'Country where the job is located',
    'company_location': 'Location of the hiring company',
    'year': 'Year of the job posting',
    'experience_level': 'Level of professional experience (Entry-level, Mid-level, Senior-level, Executive)',
    'employment_type': 'Type of employment contract (Full-Time, Part-Time, Contract, Freelance)',
    'remote_ratio': 'Numeric indicator of remote work (0 = Onsite, 50 = Hybrid, 100 = Remote)',
    'work_mode': 'Descriptive work mode derived from remote_ratio (Onsite, Hybrid, Remote)',
    'job_count': 'Job demand counter used for aggregation in BI analysis'
}

# First non-null value of each column (None when a column is entirely null).
# dropna() is evaluated once per column.
example_values = []
for col in df.columns:
    non_null = df[col].dropna()
    example_values.append(non_null.iloc[0] if not non_null.empty else None)

data_dict = pd.DataFrame({
    'Column Name': df.columns,
    # Store dtype names as plain strings: raw numpy dtype objects are not a
    # cell type the Excel writer engines can serialize.
    'Data Type': [str(dtype) for dtype in df.dtypes],
    'Description': [column_descriptions.get(col, 'Additional job-related attribute') for col in df.columns],
    'Example Value': example_values,
})

# One row per column of df, written without the DataFrame index.
data_dict.to_excel("ai_jobs_data_dictionary.xlsx", index=False)