In [None]:
from bs4 import BeautifulSoup
import pandas as pd

<h2 align='center'>Parsing the raw HTML</h2>

In [2]:
# Read the saved listings page and parse it into a BeautifulSoup tree.
# The encoding is given explicitly (the document declares charset utf-8) so
# decoding does not depend on the platform's default text encoding.
with open("job_listings.html", encoding="utf-8") as job_listings:
    page = BeautifulSoup(job_listings, 'html.parser')
page

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Job Listings</title>
<style>
      body {
        font-family: Arial, sans-serif;
        margin: 20px;
      }
      .job-listing {
        border: 1px solid #ccc;
        padding: 15px;
        margin-bottom: 10px;
        border-radius: 5px;
      }
      .job-title {
        font-size: 1.2em;
        font-weight: bold;
      }
      .company,
      .location,
      .date,
      .skills {
        margin: 5px 0;
      }
    </style>
</head>
<body>
<h1>Available Job Listings</h1>
<div class="job-listing">
<div class="job-title">Backend Developer</div>
<div class="company">Company: NextGen</div>
<div class="location">Location: Lahore, Pakistan</div>
<div class="date">Posted: 2025-07-30</div>
<div class="skills">Skills: SQL, Excel, PowerBI</div>
</div>
<div class="job-listing">
<div class="job-title">UI/UX Designer</div>
<div class="company">Company: NextGen</div>
<div class="location">Location: Lahore, Pakistan</di

<h2 align='center'>Scraping the HTML file</h2>

In [3]:
# Collect every element whose CSS class is "job-listing"; each one is a
# single job posting container.
job_list = page.find_all(attrs={"class": "job-listing"})
job_list

[<div class="job-listing">
 <div class="job-title">Backend Developer</div>
 <div class="company">Company: NextGen</div>
 <div class="location">Location: Lahore, Pakistan</div>
 <div class="date">Posted: 2025-07-30</div>
 <div class="skills">Skills: SQL, Excel, PowerBI</div>
 </div>,
 <div class="job-listing">
 <div class="job-title">UI/UX Designer</div>
 <div class="company">Company: NextGen</div>
 <div class="location">Location: Lahore, Pakistan</div>
 <div class="date">Posted: 2025-07-29</div>
 <div class="skills">Skills: Flutter, Dart</div>
 </div>,
 <div class="job-listing">
 <div class="job-title">Frontend Developer</div>
 <div class="company">Company: SoftVision</div>
 <div class="location">Location: Karachi, Pakistan</div>
 <div class="date">Posted: 2025-07-28</div>
 <div class="skills">Skills: Selenium, JMeter</div>
 </div>,
 <div class="job-listing">
 <div class="job-title">Data Analyst</div>
 <div class="company">Company: NextGen</div>
 <div class="location">Location: Karachi

<h3 align='center'>Extracting Fields</h3>

In [12]:
# Build one row per job listing. Each row holds the text of the listing's
# child elements in document order; for every field except the job title the
# leading "Label: " prefix is removed.
all_fields = []
for job in job_list:
    fields = []
    # Visit only the direct child tags. This skips all text nodes between the
    # tags (not just an exact '\n'), so differently formatted HTML cannot
    # reach the tag-only calls below.
    for field in job.find_all(True, recursive=False):
        text = field.text.strip()
        # A tag without a class attribute yields None; treat it as "no classes".
        class_names = field.get("class") or []
        if 'job-title' in class_names:
            fields.append(text)
        else:
            # Split on the first ': ' only, so a value that itself contains
            # ': ' is kept whole instead of being cut to its last segment.
            fields.append(text.split(': ', 1)[-1])
    all_fields.append(fields)
all_fields

[['Backend Developer',
  'NextGen',
  'Lahore, Pakistan',
  '2025-07-30',
  'SQL, Excel, PowerBI'],
 ['UI/UX Designer',
  'NextGen',
  'Lahore, Pakistan',
  '2025-07-29',
  'Flutter, Dart'],
 ['Frontend Developer',
  'SoftVision',
  'Karachi, Pakistan',
  '2025-07-28',
  'Selenium, JMeter'],
 ['Data Analyst',
  'NextGen',
  'Karachi, Pakistan',
  '2025-07-25',
  'Selenium, JMeter'],
 ['Data Analyst',
  'DevMasters',
  'Karachi, Pakistan',
  '2025-07-23',
  'Figma, Adobe XD, Sketch'],
 ['QA Engineer',
  'DataInsights',
  'Lahore, Pakistan',
  '2025-07-26',
  'AWS, Docker, Kubernetes, CI/CD'],
 ['Machine Learning Engineer',
  'BetaSoft',
  'Karachi, Pakistan',
  '2025-07-31',
  'HTML, CSS, JavaScript, React'],
 ['Software Engineer',
  'DevMasters',
  'Peshawar, Pakistan',
  '2025-07-25',
  'HTML, CSS, JavaScript, React'],
 ['DevOps Engineer',
  'BetaSoft',
  'Lahore, Pakistan',
  '2025-07-27',
  'Figma, Adobe XD, Sketch'],
 ['Full Stack Developer',
  'DevMasters',
  'Lahore, Pakistan',
 

In [6]:
# Assemble the scraped rows into a table, one named column per extracted field.
df = pd.DataFrame(
    all_fields,
    columns=['Job title', 'Company', 'Location', 'Posted date', 'Skills'],
)

<h2 align='center'>Text Cleaning</h2>

### Splitting Location into City and Country

In [7]:
# Split "City, Country" into two columns. The split is limited to one cut,
# taken from the right, so it never produces more than two parts: a location
# with an extra ', ' keeps the text before the last ', ' as the city instead of
# producing a third column that the two-column assignment would reject.
df[['City', 'Country']] = df['Location'].str.rsplit(', ', n=1, expand=True)

### Dropping the location column

In [9]:
# Remove the original combined column now that City and Country exist.
# Reassigning (rather than inplace=True) and ignoring an already-missing
# column keeps this cell safe to re-run.
df = df.drop(columns=['Location'], errors='ignore')

### Splitting Skills column into list

In [10]:
# Turn each comma-separated skills string into a list of individual skills.
df['Skills'] = df['Skills'].map(lambda skills_text: skills_text.split(', '))

In [11]:
# Write the cleaned table to disk without the row index.
df.to_csv(path_or_buf='job_listings.csv', index=False)