In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, Row, IntegerType, ArrayType
# spark = SparkSession.builder.appName('Iteration_4').getOrCreate()

spark = SparkSession.builder \
    .appName('Iteration_4') \
    .config('spark.executor.memory', '8g') \
    .config('spark.driver.memory', '8g') \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.executor.heartbeatInterval", "60s") \
    .config("spark.network.timeout", "600s") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/18 04:20:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Let's read in the data. Note that it's in the format of JSON.
df01 = spark.read.csv('Datasets/job_postings.csv', header=True, inferSchema=True)
df02 = spark.read.csv('Datasets/job_skills.csv', header=True, inferSchema=True)
df01_count = df01.count()
df02_count = df02.count()
(df01_count,df02_count)

                                                                                

(12218, 12217)

## Use pandas to try again

In [3]:
import pandas as pd

job_postings_df = pd.read_csv('Datasets/job_postings.csv', dtype=str)
job_skills_df = pd.read_csv('Datasets/job_skills.csv', dtype=str)

job_postings_count = len(job_postings_df)
job_skills_count = len(job_skills_df)
(job_postings_count,job_skills_count)

(12217, 12217)

In [4]:
common_columns = list(set(df01.columns).intersection(set(df02.columns)))

# Find common columns
df01_common = df01.select(common_columns)
df02_common = df02.select(common_columns)

# Gain rows
df01_count = df01_common.count()
df02_count = df02_common.count()
print(f'df01行数: {df01_count}, df02行数: {df02_count}')

# Find extral rows
diff_df = df01.join(df02, on='job_link', how='left_anti')

# Display extral rows
print("Display extral rows:")
diff_df.show()

# Delete extral rows
df01_filtered = df01.subtract(diff_df)

df01_filtered.describe()

df01行数: 12218, df02行数: 12217
Display extral rows:
+--------------------+--------------------+--------------------+-----------+-------+---------------+--------------------+----------+------------+----------+-----------+--------------+---------------+---------+--------+
|            job_link| last_processed_time|         last_status|got_summary|got_ner|is_being_worked|           job_title|   company|job_location|first_seen|search_city|search_country|search_position|job_level|job_type|
+--------------------+--------------------+--------------------+-----------+-------+---------------+--------------------+----------+------------+----------+-----------+--------------+---------------+---------+--------+
|                 ...|RED Engineering D...|London, England, ...| 2024-01-15| Slough| United Kingdom|Electrical Engine...|Mid senior|      Onsite|      null|       null|          null|           null|     null|    null|
+--------------------+--------------------+--------------------+----------

                                                                                

DataFrame[summary: string, job_link: string, last_processed_time: string, last_status: string, got_summary: string, got_ner: string, is_being_worked: string, job_title: string, company: string, job_location: string, first_seen: string, search_city: string, search_country: string, search_position: string, job_level: string, job_type: string]

## Data Exploration

In [5]:
# Visualise DataFrames df1 

df01_filtered.show()
df01_filtered.columns

24/05/18 04:20:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 28:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------+-----------+-------+---------------+--------------------+--------------------+--------------------+----------+--------------+--------------+--------------------+----------+--------+
|            job_link| last_processed_time| last_status|got_summary|got_ner|is_being_worked|           job_title|             company|        job_location|first_seen|   search_city|search_country|     search_position| job_level|job_type|
+--------------------+--------------------+------------+-----------+-------+---------------+--------------------+--------------------+--------------------+----------+--------------+--------------+--------------------+----------+--------+
|https://www.linke...|2024-01-19 12:25:...|Finished NER|          t|      t|              f|Senior MLOps Engi...|Recruiting from S...|        Evanston, IL|2024-01-14|      Palatine| United States|      Value Engineer|Mid senior|  Onsite|
|https://www.linke...|2024-01-19 18:06:...|Finis

                                                                                

['job_link',
 'last_processed_time',
 'last_status',
 'got_summary',
 'got_ner',
 'is_being_worked',
 'job_title',
 'company',
 'job_location',
 'first_seen',
 'search_city',
 'search_country',
 'search_position',
 'job_level',
 'job_type']

In [6]:
# Visualise DataFrames df2
df02.show()
df02.columns

+--------------------+--------------------+
|            job_link|          job_skills|
+--------------------+--------------------+
|https://www.linke...|Machine Learning,...|
|https://www.linke...|C++, Python, PyTo...|
|https://www.linke...|ETL, Data Integra...|
|https://www.linke...|Data Lakes, Data ...|
|https://www.linke...|Java, Scala, Pyth...|
|https://www.linke...|Data Warehouse (D...|
|https://www.linke...|Machine Learning,...|
|https://www.linke...|Data Loss Prevent...|
|https://www.linke...|Problem solving, ...|
|https://www.linke...|Machine Learning,...|
|https://www.linke...|SQL, Database Adm...|
|https://www.linke...|Master Data Analy...|
|https://www.linke...|Corporate Law, Pr...|
|https://www.linke...|SQL, AWS, DBA cas...|
|https://www.linke...|Machine Learning ...|
|https://uk.linked...|Data Engineering,...|
|https://ca.linked...|Data Analysis, Da...|
|https://www.linke...|Hardware Validati...|
|https://www.linke...|Data governance, ...|
|https://www.linke...|SQL Server

['job_link', 'job_skills']

In [7]:
# Convert Spark DataFrame to Pandas DataFrame
pd_df01 = df01_filtered.toPandas()
pd_df02 = df02.toPandas()
# Display summary statistics using Pandas
from IPython.display import display
display(pd_df01.describe())
display(pd_df02.describe())

  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type
count,12217,12217,12217,12217,12217,12217,12217,12216,12215,12216,12216,12216,12216,12216,12216
unique,12217,5763,1,1,1,1,6484,3946,2049,6,801,4,711,2,3
top,https://www.linkedin.com/jobs/view/senior-mlop...,2024-01-19 09:45:09.215838+00,Finished NER,t,t,f,Senior Data Engineer,Jobs for Humanity,"New York, NY",2024-01-13,Greater London,United States,Data Entry Clerk,Mid senior,Onsite
freq,1,6455,12217,12217,12217,12217,285,732,280,4946,189,10291,1088,10918,12187


Unnamed: 0,job_link,job_skills
count,12217,12212
unique,12217,12204
top,https://www.linkedin.com/jobs/view/senior-mach...,"VolunteerMatch, LinkedIn for Good"
freq,1,3


In [8]:
# For type, we can use print schema. 
df01_filtered.printSchema()
df02.printSchema()

root
 |-- job_link: string (nullable = true)
 |-- last_processed_time: string (nullable = true)
 |-- last_status: string (nullable = true)
 |-- got_summary: string (nullable = true)
 |-- got_ner: string (nullable = true)
 |-- is_being_worked: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- company: string (nullable = true)
 |-- job_location: string (nullable = true)
 |-- first_seen: string (nullable = true)
 |-- search_city: string (nullable = true)
 |-- search_country: string (nullable = true)
 |-- search_position: string (nullable = true)
 |-- job_level: string (nullable = true)
 |-- job_type: string (nullable = true)

root
 |-- job_link: string (nullable = true)
 |-- job_skills: string (nullable = true)



## Data Manipulation

In [9]:
df01_clean = df01.na.drop(subset="job_link")
pd_df01_clean = df01_clean.toPandas()
display(pd_df01_clean.describe(include='all'))

  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type
count,12218,12218,12218,12218,12218,12218,12218,12217,12216,12216,12216,12216,12216,12216,12216
unique,12218,5764,2,2,2,2,6485,3947,2050,6,801,4,711,2,3
top,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-19 09:45:09.215838+00,Finished NER,t,t,f,Senior Data Engineer,Jobs for Humanity,"New York, NY",2024-01-13,Greater London,United States,Data Entry Clerk,Mid senior,Onsite
freq,1,6455,12217,12217,12217,12217,285,732,280,4946,189,10291,1088,10918,12187


In [10]:
from pyspark.sql.functions import col, udf, split, explode, count, lit, when, lower, trim
import re
# Use .distinct() and .exceptAll() to check if there are any unmatched 'job_link' entries between the two DataFrames
different_links = df01_filtered.select("job_link").distinct().exceptAll(
    df02.select("job_link").distinct()
)

# Decide whether to merge based on the presence of unmatched 'job_link' entries
if different_links.count() == 0:
    # If there are no unmatched 'job_link' entries, proceed with merging
    merged_data = df01_filtered.join(df02, "job_link", "inner")
    print("DataFrames merged successfully.")
else:
    # If there are unmatched 'job_link' entries, do not merge
    print("DataFrames' job_link columns are not the same, cannot merge.")

if different_links.count() > 0:
    print("Unmatched job_link entries:")
    different_links.show()


                                                                                

DataFrames merged successfully.


                                                                                

In [11]:
merged_data.show()
merged_data.printSchema()

# Display summary statistics using Pandas
pd_merged_data = merged_data.toPandas()
display(pd_merged_data.describe())

+--------------------+--------------------+------------+-----------+-------+---------------+--------------------+--------------------+--------------------+----------+------------+--------------+--------------------+----------+--------+--------------------+
|            job_link| last_processed_time| last_status|got_summary|got_ner|is_being_worked|           job_title|             company|        job_location|first_seen| search_city|search_country|     search_position| job_level|job_type|          job_skills|
+--------------------+--------------------+------------+-----------+-------+---------------+--------------------+--------------------+--------------------+----------+------------+--------------+--------------------+----------+--------+--------------------+
|https://www.linke...|2024-01-21 08:08:...|Finished NER|          t|      t|              f|Senior Machine Le...|   Jobs for Humanity|       New Haven, CT|2024-01-14|  East Haven| United States|Agricultural-Rese...|Mid senior|  O

  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
                                                                                

Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
count,12217,12217,12217,12217,12217,12217,12217,12216,12215,12216,12216,12216,12216,12216,12216,12212
unique,12217,5763,1,1,1,1,6484,3946,2049,6,801,4,711,2,3,12204
top,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-19 09:45:09.215838+00,Finished NER,t,t,f,Senior Data Engineer,Jobs for Humanity,"New York, NY",2024-01-13,Greater London,United States,Data Entry Clerk,Mid senior,Onsite,"VolunteerMatch, LinkedIn for Good"
freq,1,6455,12217,12217,12217,12217,285,732,280,4946,189,10291,1088,10918,12187,3


In [12]:
df_clean = merged_data.na.drop()
# Display summary statistics using Pandas
pd_df_clean = df_clean.toPandas()
display(pd_df_clean.describe())

  PyArrow >= 1.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,job_link,last_processed_time,last_status,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
count,12210,12210,12210,12210,12210,12210,12210,12210,12210,12210,12210,12210,12210,12210,12210,12210
unique,12210,5756,1,1,1,1,6478,3945,2047,6,801,4,711,2,3,12202
top,https://www.linkedin.com/jobs/view/senior-mach...,2024-01-19 09:45:09.215838+00,Finished NER,t,t,f,Senior Data Engineer,Jobs for Humanity,"New York, NY",2024-01-13,Greater London,United States,Data Entry Clerk,Mid senior,Onsite,"data representation, data analysis, SQL, data ..."
freq,1,6455,12210,12210,12210,12210,285,732,280,4944,189,10286,1088,10913,12181,3


In [13]:
# 删除指定的列
columns_to_drop = ['last_processed_time', 'last_status', 'job_link', 'got_summary', 'got_ner',
                   'is_being_worked', 'company', 'first_seen', 'search_city', 'search_position', 'job_type']
filtered_df = merged_data.drop(*columns_to_drop)

# 显示过滤后的DataFrame
filtered_df.show()

+--------------------+--------------------+--------------+----------+--------------------+
|           job_title|        job_location|search_country| job_level|          job_skills|
+--------------------+--------------------+--------------+----------+--------------------+
|Senior Machine Le...|       New Haven, CT| United States|Mid senior|Machine Learning,...|
|Principal Softwar...|   San Francisco, CA| United States|Mid senior|C++, Python, PyTo...|
|Senior ETL Data W...|        New York, NY| United States| Associate|ETL, Data Integra...|
|Senior Data Wareh...|      Harrisburg, PA| United States|Mid senior|Data Lakes, Data ...|
|  Lead Data Engineer|           Plano, TX| United States|Mid senior|Java, Scala, Pyth...|
|Senior Data Engineer|         Chicago, IL| United States|Mid senior|Data Warehouse (D...|
|Manager, Cyber Ri...|          Boston, MA| United States|Mid senior|Machine Learning,...|
|Principal Associa...|        Scranton, PA| United States|Mid senior|Data Loss Prevent...|

In [14]:
# 定义函数
def split_location(location):
    if location is None:
        return Row(city=None, state=None, country=None)

    parts = [part.strip() for part in str(location).split(',')]
    city = state = country = None

    known_countries = ["United States", "United Kingdom", "Canada", "Australia", "India",
                       "Germany", "France", "Italy", "Spain", "Mexico"]
    us_states = {
        'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
        'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
        'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
        'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
        'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri',
        'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
        'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
        'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
        'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
        'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'Washington'
    }
    # Create a reverse map from full state name to abbreviation
    state_abbrev = {v: k for k, v in us_states.items()}

    # If there is only one part
    if len(parts) == 1:
        if parts[0] in known_countries:
            country = parts[0]
        else:
            state = parts[0]  # Translate to state if not in known countries
    elif len(parts) == 2:
        if parts[1] in known_countries:
            city = parts[0]
            country = parts[1]
            if city in state_abbrev and country == "United States":
                state = state_abbrev[city]
                city = None
            else:
                state = city
                city = None
        else:
            city, state = parts
            if state in us_states.keys():  # If the state name is the abbreviation of the US state
                country = "United States"  # Country name to United States
    elif len(parts) == 3:
        city, state, country = parts
        if state in state_abbrev:  # If state is full name
            state = state_abbrev[state]  # To abbreviation

    return Row(city=city, state=state, country=country)

# 注册UDF
split_location_udf = udf(lambda location: split_location(location), StructType([
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
    StructField("country", StringType(), True)
]))

# 应用UDF
df_with_location = filtered_df.withColumn("location_split", split_location_udf(col("job_location")))
df_with_location = df_with_location.select("*", "location_split.city", "location_split.state", "location_split.country").drop("location_split")

# 显示过滤后的DataFrame
df_with_location.show()

[Stage 100:>                                                        (0 + 1) / 1]

+--------------------+--------------------+--------------+----------+--------------------+----------------+--------------------+--------------+
|           job_title|        job_location|search_country| job_level|          job_skills|            city|               state|       country|
+--------------------+--------------------+--------------+----------+--------------------+----------------+--------------------+--------------+
|Senior Machine Le...|       New Haven, CT| United States|Mid senior|Machine Learning,...|       New Haven|                  CT| United States|
|Principal Softwar...|   San Francisco, CA| United States|Mid senior|C++, Python, PyTo...|   San Francisco|                  CA| United States|
|Senior ETL Data W...|        New York, NY| United States| Associate|ETL, Data Integra...|        New York|                  NY| United States|
|Senior Data Wareh...|      Harrisburg, PA| United States|Mid senior|Data Lakes, Data ...|      Harrisburg|                  PA| United 

                                                                                

In [15]:
from collections import Counter
# 定义技能规范化函数
def normalize_skill_name(skill):
    skill_normalized = skill.strip().lower()

    if skill_normalized in ("go", "golang"):
        return "Go/Golang"
    if skill_normalized == "ruby on rails":
        return "Ruby"
    if skill_normalized in ("ai", "llms", "artificial intelligence", "generative ai", "natural language processing",
                            "nlp", 'machine learning', 'deep learning', 'ml', 'reinforcement learning'):
        return 'ML/AI'
    if 'problem' in skill_normalized and 'solving' in skill_normalized:
        return 'ProblemSolving'
    if 'microsoft' in skill_normalized and ('office' in skill_normalized or 'excel' in skill_normalized):
        return 'MicrosoftOffice'
    if re.search(r'\b(cloud|azure|aws|gcp)\b', skill_normalized, re.IGNORECASE):
        return 'Cloud Computing'
    if 'communication' in skill_normalized:
        return 'Communication'
    if re.search(r'\b(software|programming|developer)\b', skill_normalized, re.IGNORECASE):
        return 'SoftwareDevelopment'
    if re.search(r'\b(data science|data scientist)\b', skill_normalized, re.IGNORECASE):
        return 'DataScience'
    if re.search(r'\b(data analysis|data analytics|data_analysis)\b', skill_normalized, re.IGNORECASE):
        return 'DataAnalysis'
    if 'java' in skill_normalized and 'script' not in skill_normalized:
        return 'Java'
    if 'javascript' in skill_normalized or 'typescript' in skill_normalized:
        return 'JavaScript/TypeScript'
    if ' ' in skill_normalized:
        return ''.join([word.capitalize() for word in skill_normalized.split()])
    else:
        return skill_normalized.capitalize()

# 注册UDF
normalize_skill_name_udf = udf(normalize_skill_name, StringType())

# Split job_skills into multiple columns
df = df_with_location.withColumn("skills", explode(split(col("job_skills"), ",")))
df = df.withColumn("skills", lower(trim(col("skills"))))

# Apply normalization to each skill
df = df.withColumn("skills_normalized", normalize_skill_name_udf(col("skills")))

# Repartition DataFrame to reduce load on each executor
df = df.repartition(200)

# Cache DataFrame to improve performance
df.cache()

# Count skills and sort by frequency
skills_counter = Counter()
skills_list = df.select("skills_normalized").rdd.flatMap(lambda x: x).collect()
skills_counter.update(skills_list)
all_skills_sorted = sorted(skills_counter.items(), key=lambda x: x[1], reverse=True)
all_skills_sorted_ranked = [(i + 1, skill, count) for i, (skill, count) in enumerate(all_skills_sorted)]

# Define programming languages list
programming_languages = [
    "Python", "C++", "Java", "C#", "JavaScript", "SQL", "Go", "Golang",
    "Scratch", "Visual Basic", "Assembly language", "PHP", "MATLAB",
    "Fortran", "Delphi/Object Pascal", "Swift", "Rust", "Ruby",
    "Kotlin", "COBOL"
]

# Normalize programming languages list
programming_languages_normalized = [normalize_skill_name(lang) for lang in programming_languages]

# Find programming languages not in the top 250
top_250_skills = all_skills_sorted[:250]
languages_not_in_top_250 = [lang for lang in programming_languages_normalized if lang not in [skill for skill, _ in top_250_skills]]

# For each language not in the top 250, find its rank
languages_ranks_not_in_top_250 = [(lang, next((i + 1 for i, (skill, _) in enumerate(all_skills_sorted) if skill == lang), None)) for lang in languages_not_in_top_250]

# Filter out any languages that do not have a rank
languages_ranks_not_in_top_250_filtered = [(skill, rank) for skill, rank in languages_ranks_not_in_top_250 if rank is not None]

# Combine the top skills with the remaining languages that need to be appended
combined_skills = top_250_skills + languages_ranks_not_in_top_250_filtered

# Prepare headers_with_rankings
headers_with_rankings = [f"{i + 1}. {skill}" for i, (skill, _) in enumerate(top_250_skills)]
for lang in languages_not_in_top_250:
    rank = next((i + 1 for i, (skill, _) in enumerate(all_skills_sorted) if skill == lang), None)
    if rank is not None:
        headers_with_rankings.append(f"{rank}. {lang}")

# Add headers_with_rankings to the original columns of df, and remove the job_skills column
for header in headers_with_rankings:
    skill_name = header.split('. ', 1)[1]
    df = df.withColumn(f"`{header}`", when(col("skills_normalized") == skill_name, 1).otherwise(0))

# Define job classification dictionary
categories = {
    'Cloud': ['cloud', 'aws', 'azure', 'google cloud', 'cloud engineer', 'data center', 'cloud architect', 'datacenter'],
    'Testing': ['test', 'tester', 'testing', 'quality assurance', 'qa', 'test engineer'],
    'Development': ['developer', 'development', 'software engineer', 'programmer', 'solution architect', 'database developer',
                    'software', 'software development', "C++", "Java", "C#", "JavaScript/TypeScript", "Go/Golang", "Visual Basic",
                    "Assembly language", "PHP", "Delphi/Object Pascal", "Swift", "Rust", "Ruby",
                    "Kotlin", "COBOL"],
    'Data_science': ['data science', 'machine learning', 'ml', 'deep learning', 'ai', 'artificial intelligence', 'data scientist', 'analytics architect', 'data analytics', 'npl'],
    'Data_analysis': ['data analyst', 'data analysis', 'business intelligence', 'data reporting', 'financial data', 'data warehouse', 'data mining', 'data architect'],
    'Devops': ['devops engineer', 'site reliability', 'sre', 'automation engineer', 'infrastructure as code', 'ci/cd', 'release engineer']
}

# Create new columns for each category
for category in categories:
    df = df.withColumn(category, when(df['job_title'].rlike('|'.join(categories[category])), 1).otherwise(0))

# Check other skill-related columns
for category, keywords in categories.items():
    for keyword in keywords:
        formatted_keyword = ''.join(word.capitalize() for word in keyword.split())
        column_pattern = f". {formatted_keyword}".lower()
        matching_columns = [col_name for col_name in df.columns if col_name.lower().endswith(column_pattern)]
        if matching_columns:
            df = df.withColumn(category, when(col(f"`{matching_columns[0]}`") == 1, 1).otherwise(col(category)))

# Drop rows with NaN values in 'job_skills', 'state', 'country'
df = df.na.drop(subset=['job_skills', 'state', 'country'])

# Drop unnecessary columns
df = df.drop('job_title', 'search_country', 'job_location', 'city', 'job_skills')

# Show data
df.summary()

                                                                                

+----------+-------+--------------+--------------------+--------------------+--------------------+------------------+-----------------+----------+-----------+------------------------+--------+-------------------+----------------------+-----------------+-----------------------+---------------------+-------------+-------+------------------+----------+--------------------+---------------------+-----------+---------------------+--------------+----------------------+--------------------+-----------------------+-------------------+------------+---------------------+---------------+-------------+-----------+----------------+----------------+--------------------------+-----------+-----------------+----------------------+----------------+---------------------+---------+---------------+-------------------+-----------+----------------------+-----------+-----------+---------------------+------------------+----------------+--------------------+-------------+----------+-----------------+------------

NameError: name 'StringIndexer' is not defined