# Preprocessing : 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("indeed_raw_data.csv")
df

Unnamed: 0,Job Title,Company Name,Location,Skills,Years of Experience,Education,Job Type,Salary
0,Data Scientist,DTN,Remote,"Machine Learning, Pandas, Analytics, Software ...",2,Not Specified,Full-time,94;500 - 115;500 a year
1,Data Scientist,Generac Power Systems,Pewaukee; WI 53188,"Power BI, Data Visualization, Computer Science...",3,Master; Bachelor,Full-time,Not specified
2,Data Scientist,Big Lots,Columbus; OH 43081 (Albany Commons area),"Data Mining, Computer Science, Data Structures...",Not specified,Master; Bachelor,Full-time,Not specified
3,Data Scientist (Data Scientist 1),HII,Remote in Alexandria; VA,"Data Mining, Machine Learning, Project Managem...",4,Bachelor,Full-time,Not specified
4,Data Scientist,Microsoft,Atlanta; GA,"Data Mining, Forecasting, Computer Science, AP...",2,Doctorate; Master; Bachelor,Full-time,98;300 - 193;200 a year
...,...,...,...,...,...,...,...,...
299,Principal Data Scientist,Lawrence Berkeley National Laboratory,Hybrid work in San Francisco Bay Area; CA,"Data Analysis, Research, Machine Learning",10,Doctorate,Full-time,175;812 - 296;688 a year
300,Sr. Data Scientist,CVS Health,Remote in New York; NY,"Data Mining, TensorFlow, Natural Language Proc...",Not specified,Master,Full-time,108;306 - 196;000 a year
301,Senior Data Scientist,Providence,Remote in Renton; WA 98057,"Power BI, Data Visualization, Forecasting, Dat...",6,Master,Full-time,Not specified
302,Data Scientist II,Honeywell,Hybrid work in Kansas City; MO 64147,"Software development, Leadership, Machine Lear...",Not specified,Master,Not specified,Not specified


In [3]:
df.columns

Index(['Job Title', 'Company Name', 'Location', 'Skills',
       'Years of Experience', 'Education', 'Job Type', 'Salary'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Job Title            304 non-null    object
 1   Company Name         304 non-null    object
 2   Location             304 non-null    object
 3   Skills               197 non-null    object
 4   Years of Experience  304 non-null    object
 5   Education            304 non-null    object
 6   Job Type             304 non-null    object
 7   Salary               304 non-null    object
dtypes: object(8)
memory usage: 19.1+ KB


In [5]:
df[["Job Title","Salary"]]

Unnamed: 0,Job Title,Salary
0,Data Scientist,94;500 - 115;500 a year
1,Data Scientist,Not specified
2,Data Scientist,Not specified
3,Data Scientist (Data Scientist 1),Not specified
4,Data Scientist,98;300 - 193;200 a year
...,...,...
299,Principal Data Scientist,175;812 - 296;688 a year
300,Sr. Data Scientist,108;306 - 196;000 a year
301,Senior Data Scientist,Not specified
302,Data Scientist II,Not specified


In [6]:
# Enhanced parse_salary function with debugging information
def parse_salary(salary):
    try:
        if 'a year' in salary:
            salary = salary.replace(' a year', '')
            if ' - ' in salary:
                salary_range = salary.split(' - ')
                low_salary = float(salary_range[0].replace(';', '').replace(',', ''))
                high_salary = float(salary_range[1].replace(';', '').replace(',', ''))
                return (low_salary + high_salary) / 2  # Average of the salary range
            else:
                return float(salary.replace(';', '').replace(',', ''))
        return np.nan
    except Exception as e:
        print(f"Error parsing salary '{salary}': {e}")
        return np.nan

In [7]:
df['Salary'] = df['Salary'].apply(parse_salary)
df[["Job Title","Salary"]]

Unnamed: 0,Job Title,Salary
0,Data Scientist,105000.0
1,Data Scientist,
2,Data Scientist,
3,Data Scientist (Data Scientist 1),
4,Data Scientist,145750.0
...,...,...
299,Principal Data Scientist,236250.0
300,Sr. Data Scientist,152153.0
301,Senior Data Scientist,
302,Data Scientist II,


In [8]:
df

Unnamed: 0,Job Title,Company Name,Location,Skills,Years of Experience,Education,Job Type,Salary
0,Data Scientist,DTN,Remote,"Machine Learning, Pandas, Analytics, Software ...",2,Not Specified,Full-time,105000.0
1,Data Scientist,Generac Power Systems,Pewaukee; WI 53188,"Power BI, Data Visualization, Computer Science...",3,Master; Bachelor,Full-time,
2,Data Scientist,Big Lots,Columbus; OH 43081 (Albany Commons area),"Data Mining, Computer Science, Data Structures...",Not specified,Master; Bachelor,Full-time,
3,Data Scientist (Data Scientist 1),HII,Remote in Alexandria; VA,"Data Mining, Machine Learning, Project Managem...",4,Bachelor,Full-time,
4,Data Scientist,Microsoft,Atlanta; GA,"Data Mining, Forecasting, Computer Science, AP...",2,Doctorate; Master; Bachelor,Full-time,145750.0
...,...,...,...,...,...,...,...,...
299,Principal Data Scientist,Lawrence Berkeley National Laboratory,Hybrid work in San Francisco Bay Area; CA,"Data Analysis, Research, Machine Learning",10,Doctorate,Full-time,236250.0
300,Sr. Data Scientist,CVS Health,Remote in New York; NY,"Data Mining, TensorFlow, Natural Language Proc...",Not specified,Master,Full-time,152153.0
301,Senior Data Scientist,Providence,Remote in Renton; WA 98057,"Power BI, Data Visualization, Forecasting, Dat...",6,Master,Full-time,
302,Data Scientist II,Honeywell,Hybrid work in Kansas City; MO 64147,"Software development, Leadership, Machine Lear...",Not specified,Master,Not specified,


In [24]:
# Splitting the skills into individual entries and getting unique values
unique_skills = df['Skills'].str.split(', ').explode().unique()

print("Unique skills: \n\n\n",unique_skills)

Unique skills: 


 ['Machine Learning' 'Pandas' 'Analytics' 'Software development' 'Research'
 'Github' 'NumPy' 'AI' 'Python' 'Power BI' 'Data Visualization'
 'Computer Science' 'R' 'Azure' 'Sales' 'Statistical Analysis'
 'Statistics' 'Data Analytics' 'SQL' 'Data Mining' 'Data Structures'
 'Data Warehouse' 'Marketing' 'Quantitative analysis' 'Project Management'
 'Agile' 'Big Data' 'Forecasting' 'APIs' 'Java' 'Cloud Computing'
 'Regression Analysis' 'A/B Testing' 'Leadership' 'Tableau' 'Excel'
 'Machine Learning Algorithms' 'Git' 'TensorFlow' 'Spark' 'MongoDB'
 'Scripting' 'Snowflake' 'Scikit-learn' 'PyTorch' 'NoSQL' 'Unix'
 'Distributed Systems' 'MATLAB' 'Data Analysis'
 'Natural Language Processing' 'Data Engineering' 'AWS' 'SAS' 'Banking'
 'Hadoop' 'Financial Services' 'Go' 'Risk Management' 'Google Cloud'
 'Statistical Software' 'Julia' 'Keras' 'Scala' 'Deep Learning'
 'Computer Vision' 'Pig' 'Presentation Skills' 'MLOps' 'Linux'
 'Hypothesis testing' 'HBase' 'Kafka' 'SPSS' nan 'Ma

In [22]:
# Unique Years 

unique_years = df['Years of Experience'].unique()
print("Unique years of experience: \n\n\n",unique_years)

Unique years of experience: 


 ['2' '3' 'Not specified' '4' '5' '1' '6' '13' '7' '10' '8' '240' '15' '16'
 '12' '60' '100' '21' '80' '70']


In [21]:
# Unique Salaries 

unique_salary = df['Salary'].unique()
print("Unique salaries: \n\n\n",unique_salary)

Unique salaries: 


 [105000.       nan 145750.   92500.  103950.  160305.5  87200.  129007.5
 132500.  100464.   95650.  123800.  147358.5 171000.  139153.  145250.
 182700.  130858.5 135587.5 130600.  150000.  210000.  112404.  197700.
 133000.  154000.  121500.  142500.  164760.  115000.  176500.5 145000.
 163238.  136000.  152153.  152250.  133600.   95000.  155000.   50000.
 105100.  103500.   65000.   78567.5 102750.  141000.  175000.  118125.
 113800.  122500.  165000.  120000.  168000.   62515.  107900.  137980.
 154153.  125000.  104835.  315000.  162450.5 200000.  110000.  138202.5
  92000.  196125.  236250.  115100. ]


In [20]:
# Unique Education 

unique_education  = df['Education'].unique()
print("Unique education levels: \n\n\n",unique_education) 

Unique education levels: 


 ['Not Specified' 'Master; Bachelor' 'Bachelor'
 'Doctorate; Master; Bachelor' 'Master' 'Doctorate; Master' 'Doctorate'
 'Doctorate; Bachelor']


In [19]:
# Unique Job titles  

unique_job_title  = df['Job Title'].unique()
print("Unique job titles: \n\n\n",unique_job_title) 

Unique job titles: 


 ['Data Scientist' 'Data Scientist (Data Scientist 1)'
 'Shortage Data Scientist I' 'Associate Data Scientist'
 'Data Scientist; Provider Selection Methodology'
 'Senior Data Scientist; Risk Analytics' 'Sr Azure Data Analyst'
 'Data Scientist- Remote' 'Data Scientist I (US)' 'Data Analyst; Senior'
 'Sr Data Scientist' 'Sr. Data Analyst' 'Sr. Data Scientist'
 'Data Scientist (Expert) - Data & Analytics - IT - CS - US'
 'Senior Data Scientist' 'Associate Data Scientist - PLMI Analytics'
 'Senior Data Analyst' 'Data Scientist; Confluence'
 'Data Scientist Bioinformatics' 'Sr Data Analyst - Remote'
 'Senior; Data Scientist' 'Sr Data Analyst'
 'Data Scientist - Enterprise Analytics'
 'Data Scientist or Data Scientist Senior' 'Data Scientist I'
 'Sr. Data Analyst - Remote' 'Corporate Analytics Data Scientist'
 'Sr Data Scientist - Demand Forecasting'
 'Senior Data Scientist - Support' 'Data Scientist II'
 'Data Scientist I/II' 'Senior Healthcare Data Analyst (HEDIS)'
 '

In [17]:
# Unique Job types  

unique_job_type  = df['Job Type'].unique()
print("Unique Job types:",unique_job_type) 

Unique Job types: ['Full-time' 'Not specified' 'Part-time']


In [31]:
# Removing the rows with NaN values

df_cleaned = df.dropna(subset=['Skills', 'Years of Experience', 'Salary'])
print(df_cleaned[['Skills', 'Years of Experience', 'Salary']])

# Unique Years 

unique_years_clean = df_cleaned['Years of Experience'].unique()
print("\n\n\nUnique years of experience in the clean dataset: \n\n\n",unique_years_clean)

# Unique Skills 

unique_skills_clean = df_cleaned['Skills'].unique()
print("\n\n\nUnique skills in the clean dataset : \n\n\n",unique_skills_clean)

# Unique Salary

unique_salary_clean = df_cleaned['Salary'].unique()
print("\n\n\nUnique salaries in the clean dataset : \n\n\n",unique_salary_clean)

                                                Skills Years of Experience  \
0    Machine Learning, Pandas, Analytics, Software ...                   2   
4    Data Mining, Forecasting, Computer Science, AP...                   2   
6    Forecasting, Leadership, R, Analytics, Tableau...                   5   
8    Git, TensorFlow, Spark, Computer Science, Mong...                   2   
9    Data Mining, Computer Science, Machine Learnin...       Not specified   
..                                                 ...                 ...   
294  Project Management, Snowflake, Analytics, Tabl...                   5   
298  Docker, Spark, Machine Learning, R, Analytics,...       Not specified   
299          Data Analysis, Research, Machine Learning                  10   
300  Data Mining, TensorFlow, Natural Language Proc...       Not specified   
303  Computer Science, Risk Management, Machine Lea...       Not specified   

       Salary  
0    105000.0  
4    145750.0  
6     92500.0  

In [29]:
# Remove rows with 'Not specified' in 'Years of Experience' and 'Salary'

df_cleaned = df_cleaned[(df_cleaned['Years of Experience'] != 'Not specified') & 
                        (df_cleaned['Salary'].notna())]