In [1]:
import pandas as pd
import numpy as np
import re

df_DA = pd.read_csv('DataAnalystJobData_scrappeddata.csv')
df_JD = pd.read_csv('JobData_scrappeddata.csv')

#concatenate
df_concatenated = pd.concat([df_DA, df_JD], axis=0)
# Reset the index for a continuous index
df_concatenated = df_concatenated.reset_index(drop=True)
df_concatenated.shape

(1122, 15)

In [2]:
def extract_salary(row):
    text = str(row['Benefits'])  # Convert the Benefits column to a string
#     print(f"Processing text: {text}")  # Debug: print the text being processed
    
    #Exclude '401K', '401(k)', and standalone '401' by removing these from the text
    text = re.sub(r'401[Kk]?(\(k\))?', '', text)  # Remove all variations of '401K', '401(k)', and '401'
    
    #Define regex patterns to match salary ranges and values
    patterns = [
        r'\b(?:salary|range|pay|compensation|base)\b.*?\$(\d{1,3}(?:,\d{3})?)\s*-\s*\$(\d{1,3}(?:,\d{3})?)',  # Range with $ and salary-related keywords
        r'\b(?:salary|pay|compensation|base)\b.*?\$(\d{1,3})[Kk]',  # Single value with 'K'
        r'\b(?:salary|pay|compensation|base)\b.*?(\d{1,3})[Kk]',  # Single value without '$' but with 'K'
        r'\$(\d{1,3}(?:,\d{3})*)',  # Single value with $
        r'(\d{1,3}(?:,\d{3})*)',  # Single value without $
    ]
    
    salary_values = []  # List to store extracted salary values

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)  # Find all matches for the current pattern
#         print(f"Matches for pattern '{pattern}': {matches}")  # Debug: print matches for each pattern
        
        for match in matches:
            if isinstance(match, tuple):  # Handle ranges
                low, high = match
                low = low.replace(',', '')
                high = high.replace(',', '')
                
                # Check if 'K' is present and multiply by 1,000 if necessary
                low_value = float(low) * 1000 if 'K' in text else float(low)
                high_value = float(high) * 1000 if 'K' in text else float(high)
                
                # Calculate the average of the range
                salary_range_mean = (low_value + high_value) / 2
                salary_values.append(salary_range_mean)
#                 print(f"Range found: {match}, Mean: {salary_range_mean}")  # Debug: print the range and mean value
                
            else:  # Handle single values
                match = match.replace(',', '')  # Remove commas
                
                # If 'K' is present immediately after the number, multiply by 1,000
                if 'K' in text:
                    salary_value = float(match) * 1000
                else:
                    salary_value = float(match)

                salary_values.append(salary_value)  # Add the single value to the list
#                 print(f"Single value found: {match}, Converted: {salary_value}")  # Debug: print the found single value

    # Get the largest salary from the collected values
    if salary_values:
        max_salary = max(salary_values)
#         print(f"Max salary found: {max_salary}")  # Debug: print the max salary found
        return max_salary  # Return the maximum salary found

    return None  # If no salaries were found


In [3]:
def process_salary_range(text):
    # Convert the input to a string and strip any surrounding spaces
    text = str(text).strip()
#     print(f"Processing text: {text}")  # Debugging output to show what we're processing
    
    # Match any number with optional commas and optional decimals
    if re.match(r'\d+', text):
        try:
            salary_value = float(text)  # Try to convert the string to a float
            annual_salary = annual_salary_cal(salary_value)
            return annual_salary
        except ValueError:
            #Replace the special character 'â€“' with a standard dash '-'
            text = text.replace('â€“', '-').replace('–', '-')  # Replace all special dashes

            # Handle salary ranges and single values
            range_pattern = r'\$?(\d{1,3}(?:,\d{3})?(?:[Kk])?)\s*-\s*\$?(\d{1,3}(?:,\d{3})?(?:[Kk])?)'
            single_value_pattern = r'\$?(\d{1,3}(?:,\d{3})?(?:\.\d{1,2})?(?:[Kk])?)'

            match_range = re.search(range_pattern, text)
            match_single = re.search(single_value_pattern, text)

            #If it's a range, process and calculate the mean
            if match_range:
                low, high = match_range.groups()
                low = low.replace(',', '')
                high = high.replace(',', '')

                if 'K' in low.upper():
                    low = low.replace('K', '000')
                if 'K' in high.upper():
                    high = high.replace('K', '000')

                salary_mean = (float(low) + float(high)) / 2
                if 'hour' in text.lower():
                    return salary_mean * 40 * 52  # Convert hourly rate to annual
                elif 'week' in text.lower():
                    return salary_mean * 52  # Convert weekly rate to annual
                else:
                    return salary_mean  # Return mean for yearly values

            #If it's a single value, process it 120K a year
            elif match_single:
                value = match_single.group(1).replace(',', '')
                if 'K' in value.upper():
                    value = value.replace('K', '000')

                if 'hour' in text.lower():
                    return float(value) * 40 * 52  # Convert hourly rate to annual
                elif 'week' in text.lower():
                    return float(value) * 52  # Convert weekly rate to annual
                else:
                    return float(value)  # Return the single yearly value as a float

            #If no matches, return the original text unchanged
            return text

In [4]:
def annual_salary_cal(salary_value):
        # Check if salary is between 1 and 200 (hourly rate)
        if 1 <= salary_value <= 200:
            return salary_value * 40 * 52  # Convert hourly to annual salary
        # Check if salary is between 201 and 2,000 (weekly rate)
        elif 201 <= salary_value <= 2000:
            return salary_value * 52  # Convert weekly to annual salary
        # Check if salary is between 2,000 and 8,000 (monthly rate)
        elif 2000 <= salary_value <= 8000:
            return salary_value * 12  # Convert monthly to annual salary
        else:
            return salary_value  # Return the salary as it is (annual)

In [5]:
# Extract skills from the 'Qualifications' column
def extract_skills(text, skills):
    if pd.isna(text):
        return ''
    found_skills = [skill for skill in skills if re.search(r'\b' + re.escape(skill) + r'\b', text, re.IGNORECASE)]
    return ', '.join(found_skills)

# Extract years of experience from the 'Qualifications' column
def extract_years_of_experience(text):
    if pd.isna(text):
        return None
    
    patterns = [
        r'(\d+)\+?\s*(?:years?|yrs?)\s*(?:of)?\s*(?:experience|exp)',
        r'(\d+)\+?\s*(?:years?|yrs?)\s*(?:of)?\s*(?:relevant|related)\s*(?:experience|exp)',
        r'(\d+)-(\d+)\s*(?:years?|yrs?)\s*(?:of)?\s*(?:experience|exp)',
        r'minimum\s*(?:of)?\s*(\d+)\s*(?:years?|yrs?)\s*(?:of)?\s*(?:experience|exp)',
        r'at least\s*(\d+)\s*(?:years?|yrs?)',
        r'(\d+)\+?\s*(?:years?|yrs?)',
        r'(\d+)\s*(?:to|–|-)\s*(\d+)\s*(?:years?|yrs?)',
        r'(?:bachelor|master|phd).*?(\d+)\+?\s*(?:years?|yrs?)',
        r'(\d+)\+?\s*(?:years?|yrs?).*?(?:bachelor|master|phd)'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return f"{match.group(1)}-{match.group(2)}" if len(match.groups()) == 2 else match.group(1)
    return None


In [6]:
# Process the DataFrame
def process_dataframe(df):
    skills = ['Python', 'Hive', 'SQL', 'MySQL', 'Oracle', 'Tableau', 'Snowflake', 'Redshift', 'Big Data', 'Spark', 'AWS', 'GCP', 'Azure', 'Java', 'Cloud', 'Data Analytics', 'Analytics', 'ETL', 'Business Intelligence', 'data warehouse', 'Power BI','Excel', 'Data Visualization', 'Visualization']
    df['Education'] = df['Qualifications'].apply(extract_education)
    df['Years of Experience Required'] = df['Qualifications'].apply(extract_years_of_experience)
    df['Salary_New'] = df['Salary']
    mask = df['Salary'].isnull()
    df.loc[mask, 'Salary_New'] = df[mask].apply(extract_salary, axis=1)
    df.loc[df['Salary_New'].astype(str).str.contains('401', na=False), 'Salary_New'] = None
    df['Skills'] = df['Qualifications'].apply(lambda x: extract_skills(x, skills))
    # Apply the function to the 'Salary_New' column, skipping rows where the result is None
    df['Salary_New_Processed'] = df['Salary_New'].apply(lambda x: process_salary_range(x) if pd.notnull(x) else x)
    df['JobTitle_New'] = df['Job title'].apply(job_title_categ)
    return df


In [7]:
def extract_education(text):
    if pd.isna(text):
        return "Any Degree"
    # Regex pattern for capturing degree keywords (case insensitive)
    pattern = r'\b(PHD|PH\.D|Post\s?Grad|Master\'?s?|Bachelors?\'?)\b'
        # Search for matching degree keywords in the text
    matches = re.findall(pattern, text, re.IGNORECASE)
    # Set precedence order
    precedence = ['PHD', 'PH.D', 'Post Grad', 'Master', "Master's", "Masters", 'Bachelor', "Bachelor's", "Bachelors"] 
    # If there are matches, return the highest precedence degree
    if matches:
        # Sort based on the order of precedence
        for degree in precedence:
            for match in matches:
                if re.search(degree, match, re.IGNORECASE):
                    if degree.lower().startswith("p"):
                        return f"PHD"
                    if degree.lower().startswith("b"):
                        return f"Bachelors"
                    if degree.lower().startswith("m"):
                        return f"Masters"
                    return degree  # Return the first match based on precedence
    # If no match found, return "Any Degree"
    return f"Any Degree"

In [8]:
def job_title_categ(title):
    if isinstance(title, str):  # Check if the title is a string
        title = title.lower()  # Convert to lowercase for uniformity
        
        # Check for Data Engineer roles (allowing any number of words between 'data' and 'engineer')
        if re.search(r'data.*engineer', title):
            return 'Data Engineer'
        
        # Check for Data Scientist roles (allowing any number of words between 'data' and 'science/scientist')
        elif re.search(r'data.*(science|scientist)', title):
            return 'Data Scientist'
        
        # Check for Business Analyst roles
        elif re.search(r'business.*analyst', title):
            return 'Business Analyst'
        
        # Check for Business Intelligence Analyst roles (allowing any number of words between 'business' and 'intelligence')
        elif re.search(r'business.*intelligence', title):
            return 'Business Intelligence Analyst'
        
        # Check for Data Analyst roles (matching data with analytics/analysis)
        elif re.search(r'data.*(analytics|analysis|analyst)', title):
            return 'Data Analyst'
        
        else:
            return None


In [9]:
df_textcleaned = process_dataframe(df_concatenated)
print(df_textcleaned.head(2))

                                                 URL     Job title  \
0  https://www.google.com/search?q=data+analyst+j...  Data Analyst   
1  https://www.google.com/search?q=data+analyst+j...  Data Analyst   

                                    Job sub-headings               Company  \
0  Disney Entertainment • Santa Monica, CA •  via...  Disney Entertainment   
1         Tax Rise • Newport Beach, CA •  via Adzuna              Tax Rise   

            Location          Source        Posted       Type  \
0   Santa Monica, CA  Disney Careers   14 days ago  Full-time   
1  Newport Beach, CA          Adzuna  19 hours ago  Full-time   

                                 Job additional data  \
0   ['14 days ago', 'Full-time', 'Health insurance']   
1  ['19 hours ago', '80K–95K a year', 'Full-time'...   

                                     Job Description  ...  \
0  About the Role & TeamWe’re looking for an inte...  ...   
1  INTRO TO TAXRISE:At TaxRise, our mission is si...  ...   

     

In [10]:
#Dropping the unnecessary columns from Data Analyst job data
df_textcleaned = df_textcleaned.drop(['URL','Job title','Job sub-headings','Posted','Job additional data','Job Description', 
                               'Qualifications','Benefits','Responsibilities','Salary','No Degree Mentioned', 'Salary_New'], axis=1)
print(df_textcleaned.head(2))

                Company           Location          Source       Type  \
0  Disney Entertainment   Santa Monica, CA  Disney Careers  Full-time   
1              Tax Rise  Newport Beach, CA          Adzuna  Full-time   

    Education Years of Experience Required                Skills  \
0  Any Degree                            3                   SQL   
1  Any Degree                            2  SQL, Azure, Power BI   

   Salary_New_Processed  JobTitle_New  
0              125200.0  Data Analyst  
1               87500.0  Data Analyst  


In [11]:
#converting the full state names and city names to two-letter Shortforms
def clean_location(location):
    # Mapping of full state names and specific locations to abbreviations
    state_mapping = {
        'California': 'CA',
        'New York': 'NY',
        'Texas': 'TX',
        'Washington': 'WA',
        'Florida': 'FL',
        'Massachusetts': 'MA',
        'Illinois': 'IL',
        'United States': 'USA',
        'Pennsylvania': 'PA',
        'Nevada': 'NV',
        'Alabama':'AL',
        'Michigan':'MI' ,
        'Fort Lewis':'WA',
        'Georgia':'GA',
        'Arizona':'AZ' ,
        'Illnois':'IL',
        'Connecticut':'CT',
        'Minnesota':'MN',
        'Kentucky':'KY',
        'North Carolina': 'NC',
        'New Jersey': 'NJ'
    }
    
    # Check if the location is a string
    if isinstance(location, str):
        # Split the location string and check for abbreviations or full names
        parts = location.split(', ')
        if len(parts) > 1:
            # Check if the last part is an abbreviation or full state name
            state_abbr = parts[-1].strip()
            return state_mapping.get(state_abbr, state_abbr)  # Return abbreviation if found
        else:
            # Attempt to map full state names to abbreviations
            return state_mapping.get(location.strip(), location.strip())  # Return original if no match
    else:
        return pd.NA  # Return NA for non-string values

# Clean the 'Location' column
df_textcleaned['Location'] = df_textcleaned['Location'].apply(clean_location)
# Display the first few rows of the cleaned DataFrame
print(df_textcleaned.head(2))

                Company Location          Source       Type   Education  \
0  Disney Entertainment       CA  Disney Careers  Full-time  Any Degree   
1              Tax Rise       CA          Adzuna  Full-time  Any Degree   

  Years of Experience Required                Skills  Salary_New_Processed  \
0                            3                   SQL              125200.0   
1                            2  SQL, Azure, Power BI               87500.0   

   JobTitle_New  
0  Data Analyst  
1  Data Analyst  


In [12]:
#Changing the combination of Full-Time, Part-time and Contractor Positions to Part-time and 
#Part-time and Contractor to Part-time and Full-time and Internship to Internship
def clean_type(job_type):
    if pd.isna(job_type):
        return job_type  

    job_type = job_type.lower()  # Convert to lowercase for consistent comparisons
    
    # Apply the transformation rules
    if 'internship' in job_type:
        return 'Internship'
    elif 'part-time' in job_type or 'contractor' in job_type:
        return 'Part-time'
    elif 'full-time' in job_type:
        return 'Full-time'
    else:
        return job_type  # Return the original job_type if it doesn't match any condition

# Apply the clean_type function to the 'Type' column
df_textcleaned['Type'] = df_textcleaned['Type'].apply(clean_type)

# Print a summary of the cleaned 'Type' column
print(df_textcleaned['Type'].value_counts())

Type
Full-time     905
Part-time     178
Internship     15
Name: count, dtype: int64


In [13]:
#Applying median Values to Missing and Nan salaries as per JobTitle_New
def calculate_median_salaries(df):
       return df.groupby('JobTitle_New')['Salary_New_Processed'].median()

def fill_missing_salaries(df, median_salaries):
    def fill_salary(row):
        if pd.isna(row['Salary_New_Processed']):
            return median_salaries.get(row['JobTitle_New'], np.nan)
        return row['Salary_New_Processed']  
    
    df['Salary_New_Processed'] = df.apply(fill_salary, axis=1)
    return df

# Calculate median salaries
median_salaries = calculate_median_salaries(df_textcleaned)
    
# Fill missing salaries
df_textcleaned = fill_missing_salaries(df_textcleaned, median_salaries)
    
# Print summary
print("Median Salaries by Job Title:")
print(median_salaries)
print("\nNumber of missing salaries after filling:", df_textcleaned['Salary_New_Processed'].isna().sum()) #the rows without job titles

Median Salaries by Job Title:
JobTitle_New
Business Analyst                  91520.0
Business Intelligence Analyst    158230.0
Data Analyst                     100000.0
Data Engineer                    166000.0
Data Scientist                   195000.0
Name: Salary_New_Processed, dtype: float64

Number of missing salaries after filling: 79


In [14]:
#removing Outliers and rows with no job type but with salary

def remove_salary_outliers_and_no_jobtype(df, salary_column, job_column, max_limit):
    # Create a copy of the DataFrame to avoid modifying the original
    df_cleaned = df.copy()
    
    # Remove outliers
    df_cleaned = df_cleaned[df_cleaned[salary_column] <= max_limit]
    
    # Remove rows with no job type but with salary value
    df_cleaned = df_cleaned[~((df_cleaned[job_column].isna() | (df_cleaned[job_column] == '')) & df_cleaned[salary_column].notna())]
    
    return df_cleaned

# Apply the function to remove outliers and rows with no job type but with salary
max_salary_limit = 600000
df_cleaned = remove_salary_outliers_and_no_jobtype(df_textcleaned, 'Salary_New_Processed', 'JobTitle_New', max_salary_limit)

# Print summary statistics to verify the changes
print("Original DataFrame:")
print(df_textcleaned['Salary_New_Processed'].describe())
print(f"Original number of rows: {len(df_textcleaned)}")

print("\nCleaned DataFrame:")
print(df_cleaned['Salary_New_Processed'].describe())
print(f"Cleaned number of rows: {len(df_cleaned)}")

# Print the number of rows removed
rows_removed = len(df_textcleaned) - len(df_cleaned)
print(f"\nTotal number of rows removed: {rows_removed}")

# Print the number of rows removed due to no job type but with salary
rows_removed_no_jobtype = len(df_textcleaned[((df_textcleaned['JobTitle_New'].isna() | (df_textcleaned['JobTitle_New'] == '')) & df_textcleaned['Salary_New_Processed'].notna())])
print(f"Number of rows removed due to no job type but with salary: {rows_removed_no_jobtype}")

Original DataFrame:
count    1.043000e+03
mean     6.472463e+05
std      9.783008e+06
min      2.080000e+03
25%      9.152000e+04
50%      1.000000e+05
75%      1.660000e+05
max      2.158000e+08
Name: Salary_New_Processed, dtype: float64
Original number of rows: 1122

Cleaned DataFrame:
count       999.000000
mean     123914.412412
std       60404.855389
min        2080.000000
25%       91520.000000
50%      100000.000000
75%      166000.000000
max      416000.000000
Name: Salary_New_Processed, dtype: float64
Cleaned number of rows: 999

Total number of rows removed: 123
Number of rows removed due to no job type but with salary: 39


In [15]:
def calculate_median_experience(df):
    full_time_median = df[df['Type'] == 'Full-time']['Years of Experience Required'].median()
    part_time_median = df[df['Type'] == 'Part-time']['Years of Experience Required'].median()
    return full_time_median, part_time_median

def update_experience_required(row, full_time_median, part_time_median):
    if pd.isna(row['Years of Experience Required']):
        if row['Type'] == 'Full-time':
            return full_time_median
        elif row['Type'] == 'Part-time':
            return part_time_median
        elif row['Type'] == 'Internship':
            return 0
    return row['Years of Experience Required']

def clean_data(df):
    full_time_median, part_time_median = calculate_median_experience(df)
    
    df['Years of Experience Required'] = df.apply(
        lambda row: update_experience_required(row, full_time_median, part_time_median), 
        axis=1
    )
    
    return df

df_cleaned = clean_data(df_cleaned)  

print("Summary of Years of Experience Required by Type:")
print(df_cleaned.groupby('Type')['Years of Experience Required'].describe())
    
print("\nSample of updated data:")
print(df_cleaned[['Type', 'Years of Experience Required']].sample(10))

Summary of Years of Experience Required by Type:
            count  unique  top   freq
Type                                 
Full-time   816.0    18.0  5.0  349.0
Internship   14.0     3.0  0.0   11.0
Part-time   164.0    12.0  5.0   76.0

Sample of updated data:
           Type Years of Experience Required
579   Full-time                           10
1027  Full-time                          5.0
325   Full-time                          5.0
281   Full-time                          5.0
550   Full-time                          5.0
1104  Full-time                            8
494   Part-time                          5.0
742   Full-time                            4
369   Full-time                          5.0
904   Part-time                            5


In [16]:
#Converting all the job sources to company websites other than LinkedIn, Indeed, ZipRecruiter, Glassdoor
def clean_source(Source):
    
    allowed_sources = ["LinkedIn", "Indeed", "ZipRecruiter", "Glassdoor"]
    if Source in allowed_sources:
        return Source
    else:
        return "Company Website"

def apply_cleaning_to_source_column(df):
    
    df['Source'] = df['Source'].apply(clean_source)
    return df
 
# Apply cleaning to the Source column
df_cleaned = apply_cleaning_to_source_column(df_cleaned)
    
# Print summary of the cleaned Source column
print("Summary of cleaned Source column:")
print(df_cleaned['Source'].value_counts())
    
# Print a sample of the cleaned data
print("\nSample of cleaned Source data:")
print(df_cleaned[['Source']].sample(10))

Summary of cleaned Source column:
Source
Company Website    503
LinkedIn           300
ZipRecruiter       110
Glassdoor           50
Indeed              36
Name: count, dtype: int64

Sample of cleaned Source data:
               Source
642   Company Website
331          LinkedIn
330          LinkedIn
851   Company Website
797          LinkedIn
1009  Company Website
1054           Indeed
197   Company Website
387   Company Website
907   Company Website


In [17]:
# Save the cleaned DataFrame back to a CSV file
df_cleaned.to_csv('JobData_Cleaned.csv', index=False)

#final clened data frame
JobData_df = df_cleaned

# Determine dimensions of dataframe. 
print(JobData_df.shape) # It has 999rows and 9 columns
print(JobData_df.head(5))#First 5 rows

(999, 9)
                Company Location           Source       Type   Education  \
0  Disney Entertainment       CA  Company Website  Full-time  Any Degree   
1              Tax Rise       CA  Company Website  Full-time  Any Degree   
2               PragerU       CA         LinkedIn  Full-time  Any Degree   
3  Disney Entertainment       CA  Company Website  Full-time  Any Degree   
4  Keck Medicine of USC       CA  Company Website  Full-time   Bachelors   

  Years of Experience Required  \
0                            3   
1                            2   
2                            4   
3                            3   
4                            4   

                                              Skills  Salary_New_Processed  \
0                                                SQL              125200.0   
1                               SQL, Azure, Power BI               87500.0   
2  Python, SQL, Analytics, Business Intelligence,...              150000.0   
3                

In [18]:
# Create dummy variables for the 'Location' column
location_dummies = pd.get_dummies(JobData_df['Location'], prefix='Location', dtype=int)
# Create dummy variables for the 'Source' column
source_dummies = pd.get_dummies(JobData_df['Source'], prefix='Source', dtype=int)
# Create dummy variables for the 'Location' column
type_dummies=pd.get_dummies(JobData_df['Type'], prefix='Type', dtype=int)
# Create dummy variables for the 'Location' column
education_dummies = pd.get_dummies(JobData_df['Education'],prefix='Education',dtype = int)
# Create dummy variables for the 'Location' column
jobtitle_dummies = pd.get_dummies(JobData_df['JobTitle_New'],prefix='JobTitle_New',dtype = int)
# Combine the original data frame with both sets of dummy variables
JobData_with_dummies = pd.concat([
    JobData_df.drop(['Location', 'Source','Type','Education','JobTitle_New'], axis=1),
    location_dummies,
    source_dummies,
    type_dummies,
    education_dummies,
    jobtitle_dummies
], axis=1)

# Display the first few rows of the updated DataFrame
print("\nUpdated DataFrame with both Location and Source dummy variables:")
print(JobData_with_dummies.head())


Updated DataFrame with both Location and Source dummy variables:
                Company Years of Experience Required  \
0  Disney Entertainment                            3   
1              Tax Rise                            2   
2               PragerU                            4   
3  Disney Entertainment                            3   
4  Keck Medicine of USC                            4   

                                              Skills  Salary_New_Processed  \
0                                                SQL              125200.0   
1                               SQL, Azure, Power BI               87500.0   
2  Python, SQL, Analytics, Business Intelligence,...              150000.0   
3                                                SQL              125200.0   
4                                           SQL, ETL              158230.0   

   Location_AL  Location_AR  Location_AZ  Location_CA  Location_CO  \
0            0            0            0            1     

In [19]:
# Create dummy variables
# Replace spaces with underscores in column names
JobData_with_dummies.columns = JobData_with_dummies.columns.str.replace(' ', '_')
JobData_with_dummies.columns = JobData_with_dummies.columns.str.replace('-', '_')

# Display the data types
print(JobData_with_dummies.columns)

Index(['Company', 'Years_of_Experience_Required', 'Skills',
       'Salary_New_Processed', 'Location_AL', 'Location_AR', 'Location_AZ',
       'Location_CA', 'Location_CO', 'Location_CT', 'Location_DC',
       'Location_DE', 'Location_FL', 'Location_GA', 'Location_IA',
       'Location_ID', 'Location_IL', 'Location_IN', 'Location_KS',
       'Location_KY', 'Location_LA', 'Location_MA', 'Location_MD',
       'Location_MI', 'Location_MN', 'Location_MO', 'Location_MS',
       'Location_MT', 'Location_NC', 'Location_ND', 'Location_NE',
       'Location_NH', 'Location_NJ', 'Location_NM', 'Location_NV',
       'Location_NY', 'Location_OH', 'Location_OK', 'Location_OR',
       'Location_PA', 'Location_RI', 'Location_SC', 'Location_SD',
       'Location_TN', 'Location_TX', 'Location_USA', 'Location_UT',
       'Location_VA', 'Location_WA', 'Location_WI', 'Location_WV',
       'Source_Company_Website', 'Source_Glassdoor', 'Source_Indeed',
       'Source_LinkedIn', 'Source_ZipRecruiter', 'Type_F