In [3]:
import pandas as pd
import re
import numpy as np

# Load dataset
df = pd.read_csv("cleaned_jobs.csv")

# Show basic info
print(df.info())

# --- Clean Missing Values ---
df.dropna(subset=['Title', 'Company', 'Skills', 'Link'], inplace=True)
df['Experience'].fillna('0 years', inplace=True)
df['Salary'].fillna('Not Disclosed', inplace=True)
df['Location'].fillna('Remote', inplace=True)

# --- Normalize Text Columns ---
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text.strip()

df['Title'] = df['Title'].apply(clean_text)
df['Company'] = df['Company'].apply(clean_text)
df['Skills'] = df['Skills'].apply(clean_text)
df['Location'] = df['Location'].apply(clean_text)

# --- Parse Skills into List ---
df['Skills_List'] = df['Skills'].apply(lambda x: [i.strip() for i in x.split(',')])

# --- Convert Experience to Numeric ---
def extract_experience(exp):
    numbers = re.findall(r'\d+', str(exp))
    if len(numbers) == 0:
        return 0
    return int(numbers[0])

df['Experience_Years'] = df['Experience'].apply(extract_experience)

# --- Preview the cleaned dataframe ---
print(df.head())

df.to_csv("processed_data.csv", index=False)
print("Processed dataset saved as 'processed_data.csv'")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7650 entries, 0 to 7649
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       7650 non-null   object
 1   Company     7650 non-null   object
 2   Experience  7534 non-null   object
 3   Salary      1204 non-null   object
 4   Location    7617 non-null   object
 5   Skills      7567 non-null   object
 6   Link        7650 non-null   object
dtypes: object(7)
memory usage: 418.5+ KB
None
                                               Title               Company  \
0  internship trainee  retail branch banking  2 m...             hdfc bank   
1  internship trainee  retail branch banking 2 mo...             hdfc bank   
2                                   engineer trainee           upl limited   
3                                   trainee engineer               siemens   
4  astinil technologies is offering a intership a...  astinil technologies   

  Experience      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Experience'].fillna('0 years', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna('Not Disclosed', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 