<a href="https://colab.research.google.com/github/ted-M-tech/data-science-1.3M-linkedin-jobs-skills/blob/makoto%2Fissue2/eda2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SET UP

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("asaniczka/1-3m-linkedin-jobs-and-skills-2024")

print("Path to dataset files:", path)

Using Colab cache for faster access to the '1-3m-linkedin-jobs-and-skills-2024' dataset.
Path to dataset files: /kaggle/input/1-3m-linkedin-jobs-and-skills-2024


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct, split, explode, trim, lower, regexp_replace, col, length
import os

In [21]:
# 1. Initialize the Session
spark = SparkSession.builder.appName("LinkedInAnalysis").getOrCreate()

# Construct the correct file paths
jobs_file_path = os.path.join(path, "linkedin_job_postings.csv")
skills_file_path = os.path.join(path, "job_skills.csv")
summary_file_path = os.path.join(path, "job_summary.csv")

In [22]:
# 2. Load the datasets
    # header=True to use the first row as column names
    # inferSchema=True so Spark guesses if a column is a Number or a String
df_jobs = spark.read.csv(jobs_file_path, header=True, inferSchema=True)
df_skills = spark.read.csv(skills_file_path, header=True, inferSchema=True)

In [18]:
## Using multiline option as summaries include multiple paragraphs
## Escaping double quotes used in summaries

df_summary = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .option("multiline","true")\
  .option("quote", '"')\
  .option("escape", '\"')\
  .load(summary_file_path)

# Cache df
df_summary.cache()

# Sanity Checks
print("Sanity Check: Total rows are matching - ", df_summary.count() == 1297332) # matching total rows
print("Sanity Check: No unexpected nulls - ", df_summary.count() == df_summary.na.drop().count()) # checking for unexpected nulls

Sanity Check: Total rows are matching -  True
Sanity Check: No unexpected nulls -  True


In [5]:
# 3. Display the Schema
print("--- Jobs Schema ---")
df_jobs.printSchema()

print("--- Skills Schema ---")
df_skills.printSchema()

print("--- Summary Schema ---")
df_summary.printSchema()

--- Jobs Schema ---
root
 |-- job_link: string (nullable = true)
 |-- last_processed_time: string (nullable = true)
 |-- got_summary: string (nullable = true)
 |-- got_ner: string (nullable = true)
 |-- is_being_worked: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- company: string (nullable = true)
 |-- job_location: string (nullable = true)
 |-- first_seen: string (nullable = true)
 |-- search_city: string (nullable = true)
 |-- search_country: string (nullable = true)
 |-- search_position: string (nullable = true)
 |-- job_level: string (nullable = true)
 |-- job_type: string (nullable = true)

--- Skills Schema ---
root
 |-- job_link: string (nullable = true)
 |-- job_skills: string (nullable = true)

--- Summary Schema ---
root
 |-- job_link: string (nullable = true)
 |-- job_summary: string (nullable = true)



In [6]:
# 4. Count Rows and Columns
print(f"Jobs Data: {df_jobs.count()} rows, {len(df_jobs.columns)} columns")
print(f"Skills Data: {df_skills.count()} rows, {len(df_skills.columns)} columns")
print(f"Summary Data: {df_summary.count()} rows, {len(df_summary.columns)} columns")

# 5. Descriptive Statistics
print("--- Jobs Statistics ---")
df_jobs.describe().show()

Jobs Data: 1348488 rows, 14 columns
Skills Data: 1296381 rows, 2 columns
Summary Data: 4107407 rows, 2 columns
--- Jobs Statistics ---
+-------+--------------------+--------------------+--------------------+----------+---------------+--------------------+-----------------+--------------------+-------------+-----------+--------------+----------------+--------------------+---------+
|summary|            job_link| last_processed_time|         got_summary|   got_ner|is_being_worked|           job_title|          company|        job_location|   first_seen|search_city|search_country| search_position|           job_level| job_type|
+-------+--------------------+--------------------+--------------------+----------+---------------+--------------------+-----------------+--------------------+-------------+-----------+--------------+----------------+--------------------+---------+
|  count|             1348488|             1348488|             1348488|   1348488|        1348488|             134848

In [23]:
print('--- Job Postings ---')
df_jobs.show(5,0)
print('\n--- Job Skills ---')
df_skills.show(5,0)
print('\n--- Job Summary ---')
df_summary.show(5,0)

--- Job Postings ---
+------------------------------------------------------------------------------------------------------------------------------+-----------------------------+-----------+-------+---------------+--------------------------------------------------------------------------+----------------------------+--------------------+----------+-----------+--------------+------------------------------------+----------+--------+
|job_link                                                                                                                      |last_processed_time          |got_summary|got_ner|is_being_worked|job_title                                                                 |company                     |job_location        |first_seen|search_city|search_country|search_position                     |job_level |job_type|
+------------------------------------------------------------------------------------------------------------------------------+---------------------

In [20]:
df_summary.show(30)

+--------------------+--------------------+
|            job_link|         job_summary|
+--------------------+--------------------+
|https://www.linke...|Rock N Roll Sushi...|
|https://www.linke...|Schedule\n: PRN i...|
|https://www.linke...|Description\nIntr...|
|https://uk.linked...|Commercial accoun...|
|https://www.linke...|Address:\nUSA-CT-...|
|https://www.linke...|Description\nOur\...|
|https://www.linke...|Company Descripti...|
|https://uk.linked...|An exciting oppor...|
|https://www.linke...|Job Details:\nJob...|
|https://www.linke...|Our\nRestaurant T...|
|https://www.linke...|Our General Manag...|
|https://www.linke...|Earning potential...|
|https://www.linke...|Dollar General Co...|
|https://au.linked...|Restaurant Descri...|
|https://au.linked...|Who We Are\nWe ar...|
|https://www.linke...|A Place Where Peo...|
|https://www.linke...|Description\nThe ...|
|https://www.linke...|Overview\nDescrip...|
|https://www.linke...|Description\nThe ...|
|https://www.linke...|Laboratory

# CLEANING

In [9]:
# 1. Drop duplicates based on the unique 'job_link'
df_jobs_clean = df_jobs.dropDuplicates(['job_link'])

# 2. Drop rows where critical columns are missing
df_jobs_clean = df_jobs_clean.dropna(subset=['job_title', 'job_link'])

print(f"Original Count: {df_jobs.count()}")
print(f"Cleaned Count: {df_jobs_clean.count()}")

Original Count: 1348488
Cleaned Count: 1348463


Jobs Data Cleaning

In [10]:
# 1. Clean Job Titles
    # We remove any character that is NOT a letter, number, space, or standard punctuation
df_jobs_deep_clean = df_jobs_clean.withColumn(
    "job_title_clean",
    regexp_replace(col("job_title"), r"[^a-zA-Z0-9\s\-\/\&]", "")
)

# 2. Clean Company Names
    # Filter out "companies" that are just numbers or extremely short
df_jobs_deep_clean = df_jobs_deep_clean.filter(
    (col("company").isNotNull())
)

# Compare
df_jobs_deep_clean.select("job_title", "job_title_clean", "company").show(5, truncate=False)

+---------------------------+---------------------------+---------------------------------------+
|job_title                  |job_title_clean            |company                                |
+---------------------------+---------------------------+---------------------------------------+
|United Kingdom             |United Kingdom             |Electrician Supervisor Substation      |
|United States              |United States              |Art Director                           |
|Duty Engineer              |Duty Engineer              |Arjaan Hotel Apartments by Rotana      |
|Entertainment Manager - F&B|Entertainment Manager - F&B|Apt Resources | Recruitment Specialists|
|EVS Operator               |EVS Operator               |Sundus                                 |
+---------------------------+---------------------------+---------------------------------------+
only showing top 5 rows


Skills Data Cleaning

In [11]:
# 1. Split the long string into an List based on commas
    # "Skill A, Skill B" -> ["Skill A", "Skill B"]
df_skills_array = df_skills.withColumn("skills_array", split(col("job_skills"), ","))

# 2. EXPLODE the array
    # This creates a new row for EVERY skill in the list.
df_skills_exploded = df_skills_array.select(
    col("job_link"),
    explode(col("skills_array")).alias("skill_raw")
)

# 3. Clean the individual skills
    # Remove leading spaces, convert to lowercase for consistency
df_skills_final = df_skills_exploded.withColumn("skill", trim(lower(col("skill_raw")))) \
                                    .filter(col("skill") != "") # Remove empty strings

# Check the difference
print(f"Original Skills Rows: {df_skills.count()}")
print(f"Deep Cleaned Skills Rows: {df_skills_final.count()}")
df_skills_final.show(10, truncate=False)

Original Skills Rows: 1296381
Deep Cleaned Skills Rows: 26908836
+-----------------------------------------------------------------------------------------------+---------------------------+---------------------------+
|job_link                                                                                       |skill_raw                  |skill                      |
+-----------------------------------------------------------------------------------------------+---------------------------+---------------------------+
|https://www.linkedin.com/jobs/view/housekeeper-i-pt-at-jacksonville-state-university-3802280436|Building Custodial Services|building custodial services|
|https://www.linkedin.com/jobs/view/housekeeper-i-pt-at-jacksonville-state-university-3802280436| Cleaning                  |cleaning                   |
|https://www.linkedin.com/jobs/view/housekeeper-i-pt-at-jacksonville-state-university-3802280436| Janitorial Services       |janitorial services        |
|https://ww

Summary Data Cleaning

In [12]:
# Remove HTML tags using a regex pattern, then remove extra whitespace
df_summary_clean = df_summary.withColumn(
    "job_summary_clean",
    regexp_replace(col("job_summary"), r"<[^>]+>", "")
).withColumn(
    "job_summary_clean",
    regexp_replace(col("job_summary_clean"), r"\s+", " ")
)

df_summary_clean.select("job_link","job_summary_clean").show(10,0)

+--------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Deeper analysis

In [13]:
# Define which columns we want to check
columns_to_check = ['job_title', 'company', 'job_location', 'search_country', 'job_level']

print("--- Unique Value Counts (Cardinality) ---")

for column in columns_to_check:
    unique_count = df_jobs_clean.select(countDistinct(column)).collect()[0][0]
    print(f"Unique {column}s: {unique_count}")

# Check unique skills (from the skills dataframe)
unique_skills = df_skills.select(countDistinct("job_skills")).collect()[0][0]
print(f"\nUnique Skills found: {unique_skills}")

--- Unique Value Counts (Cardinality) ---
Unique job_titles: 584538
Unique companys: 90630
Unique job_locations: 29172
Unique search_countrys: 26
Unique job_levels: 23

Unique Skills found: 1287097


## Deep Dive into "Top Values"

In [14]:
print("--- Top 10 Companies Posting Jobs ---")
df_jobs_clean.groupBy("company") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(10, truncate=False)

print("\n--- Top 10 Job Titles ---")
df_jobs_clean.groupBy("job_title") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(10, truncate=False)

print("\n--- Top 10 Skills Requested ---")
df_skills.groupBy("job_skills") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(10, truncate=False)

--- Top 10 Companies Posting Jobs ---
+----------------------+-----+
|company               |count|
+----------------------+-----+
|Health eCareers       |41597|
|Jobs for Humanity     |27680|
|TravelNurseSource     |16142|
|Dollar General        |14815|
|PracticeLink          |9737 |
|Energy Jobline        |9364 |
|Gotham Enterprises Ltd|8935 |
|Jobot                 |8713 |
|ClearanceJobs         |8599 |
|McDonald's            |8125 |
+----------------------+-----+
only showing top 10 rows

--- Top 10 Job Titles ---
+-------------------------------+-----+
|job_title                      |count|
+-------------------------------+-----+
|LEAD SALES ASSOCIATE-FT        |7325 |
|Shift Manager                  |5818 |
|First Year Tax Professional    |5356 |
|Assistant Manager              |5346 |
|Customer Service Representative|5203 |
|LEAD SALES ASSOCIATE-PT        |4924 |
|Store Manager                  |4791 |
|CUSTOMER SERVICE REPRESENTATIVE|4218 |
|Registered Nurse               |419