# Fake Job Data Scraping Using Python

### Step 1: Import all the required libraries.

In [1]:
#requests for fetching the raw code from the webpage
import requests

#BeautifulSoup for data scraping
from bs4 import BeautifulSoup

#csv for writing the csv file from the data collected
import csv

### Step 2: Get the URL from the webpage and create a get request using requests.

In [2]:
# Landing page of the fake-jobs site that the rest of the notebook scrapes.
url='https://realpython.github.io/fake-jobs/'
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be parsed as if it were the job listing.
response=requests.get(url,timeout=30)
response.raise_for_status()
# Raw HTML of the page as text; parsed by BeautifulSoup in the next step.
url_req=response.text

### Step 3: Create a soup object by the name fake_job using BeautifulSoup.

In [3]:
# Parse the downloaded HTML text with the 'html.parser' backend; fake_job is
# the soup object that every later search in this notebook runs against.
fake_job=BeautifulSoup(markup=url_req,features='html.parser')

In [4]:
# Render the parsed document as indented HTML so the tag structure (which tags
# and classes hold the job data) can be inspected by eye.
pretty_html=fake_job.prettify()
print(pretty_html)

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Fake Python
  </title>
  <link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
 </head>
 <body>
  <section class="section">
   <div class="container mb-5">
    <h1 class="title is-1">
     Fake Python
    </h1>
    <p class="subtitle is-3">
     Fake Jobs for Your Web Scraping Journey
    </p>
   </div>
   <div class="container">
    <div class="columns is-multiline" id="ResultsContainer">
     <div class="column is-half">
      <div class="card">
       <div class="card-content">
        <div class="media">
         <div class="media-left">
          <figure class="image is-48x48">
           <img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
          </figure>
         </div>
         <div class="media-content">
          <h2 c

### Step 4: Extract all the Job Names from the code of the webpage and pass it into a list.

In [5]:
# Collect every <h2> tag whose class attribute is 'title is-5'; the result
# still holds full tags, not just their text.
job_list=fake_job.find_all(name='h2',attrs={'class':'title is-5'})

***The above step returns the tags and their attributes along with the names, but we only need the names, so we extract just the text from those filtered tags by using the ".string" attribute.***

In [6]:
# Keep only the text of each heading tag (.string), dropping the tag itself
# and its attributes.
jobs=[heading.string for heading in job_list]
jobs

['Senior Python Developer',
 'Energy engineer',
 'Legal executive',
 'Fitness centre manager',
 'Product manager',
 'Medical technical officer',
 'Physiological scientist',
 'Textile designer',
 'Television floor manager',
 'Waste management officer',
 'Software Engineer (Python)',
 'Interpreter',
 'Architect',
 'Meteorologist',
 'Audiological scientist',
 'English as a second language teacher',
 'Surgeon',
 'Equities trader',
 'Newspaper journalist',
 'Materials engineer',
 'Python Programmer (Entry-Level)',
 'Product/process development scientist',
 'Scientist, research (maths)',
 'Ecologist',
 'Materials engineer',
 'Historic buildings inspector/conservation officer',
 'Data scientist',
 'Psychiatrist',
 'Structural engineer',
 'Immigration officer',
 'Python Programmer (Entry-Level)',
 'Neurosurgeon',
 'Broadcast engineer',
 'Make',
 'Nurse, adult',
 'Air broker',
 'Editor, film/video',
 'Production assistant, radio',
 'Engineer, communications',
 'Sales executive',
 'Software Deve

### Step 5: Extract the company names from the code of the webpage and pass the contents into a list.

In [7]:
# Company names are searched for in <h3> tags with class 'subtitle is-6 company'.
job_company=fake_job.find_all('h3',class_='subtitle is-6 company')
# Reduce each matched tag to its text.
company=[heading.string for heading in job_company]
company

['Payne, Roberts and Davis',
 'Vasquez-Davidson',
 'Jackson, Chambers and Levy',
 'Savage-Bradley',
 'Ramirez Inc',
 'Rogers-Yates',
 'Kramer-Klein',
 'Meyers-Johnson',
 'Hughes-Williams',
 'Jones, Williams and Villa',
 'Garcia PLC',
 'Gregory and Sons',
 'Clark, Garcia and Sosa',
 'Bush PLC',
 'Salazar-Meyers',
 'Parker, Murphy and Brooks',
 'Cruz-Brown',
 'Macdonald-Ferguson',
 'Williams, Peterson and Rojas',
 'Smith and Sons',
 'Moss, Duncan and Allen',
 'Gomez-Carroll',
 'Manning, Welch and Herring',
 'Lee, Gutierrez and Brown',
 'Davis, Serrano and Cook',
 'Smith LLC',
 'Thomas Group',
 'Silva-King',
 'Pierce-Long',
 'Walker-Simpson',
 'Cooper and Sons',
 'Donovan, Gonzalez and Figueroa',
 'Morgan, Butler and Bennett',
 'Snyder-Lee',
 'Harris PLC',
 'Washington PLC',
 'Brown, Price and Campbell',
 'Mcgee PLC',
 'Dixon Inc',
 'Thompson, Sheppard and Ward',
 'Adams-Brewer',
 'Schneider-Brady',
 'Gonzales-Frank',
 'Smith-Wong',
 'Pierce-Herrera',
 'Aguilar, Rivera and Quinn',
 'Lowe,

### Step 6: Extract the job location from the code of the webpage and pass the contents into a list.

In [8]:
# Locations are searched for in <p> tags with class 'location'.
job_loc=fake_job.find_all('p',class_='location')
# Text of each tag exactly as it appears in the page.
location=[paragraph.string for paragraph in job_loc]
# strip() removes any leading/trailing whitespace around each location string.
new_location=[raw.strip() for raw in location]
new_location

['Stewartbury, AA',
 'Christopherville, AA',
 'Port Ericaburgh, AA',
 'East Seanview, AP',
 'North Jamieview, AP',
 'Davidville, AP',
 'South Christopher, AE',
 'Port Jonathan, AE',
 'Osbornetown, AE',
 'Scotttown, AP',
 'Ericberg, AE',
 'Ramireztown, AE',
 'Figueroaview, AA',
 'Kelseystad, AA',
 'Williamsburgh, AE',
 'Mitchellburgh, AE',
 'West Jessicabury, AA',
 'Maloneshire, AE',
 'Johnsonton, AA',
 'South Davidtown, AP',
 'Port Sara, AE',
 'Marktown, AA',
 'Laurenland, AE',
 'Lauraton, AP',
 'South Tammyberg, AP',
 'North Brandonville, AP',
 'Port Robertfurt, AA',
 'Burnettbury, AE',
 'Herbertside, AA',
 'Christopherport, AP',
 'West Victor, AE',
 'Port Aaron, AP',
 'Loribury, AA',
 'Angelastad, AP',
 'Larrytown, AE',
 'West Colin, AP',
 'West Stephanie, AP',
 'Laurentown, AP',
 'Wrightberg, AP',
 'Alberttown, AE',
 'Brockburgh, AE',
 'North Jason, AE',
 'Arnoldhaven, AE',
 'Lake Destiny, AP',
 'South Timothyburgh, AP',
 'New Jimmyton, AE',
 'New Lucasbury, AP',
 'Port Cory, AE',
 

### Step 7: Extract Date of job posting from the code of the webpage and pass the contents into a list.

In [9]:
# Posting dates are taken from every <time> tag in the page.
date_time=fake_job.find_all('time')
# Reduce each <time> tag to its text (e.g. '2021-04-08' in the output below).
date=[stamp.string for stamp in date_time]
date

['2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-08',
 '2021-04-

***NOW TO EXTRACT JOB DESCRIPTION WE NEED TO TRAVERSE THROUGH EACH JOB POSTING PAGE AND EXTRACT THE DESCRIPTION FROM EVERY PAGE.***

### Step 8: Traverse through each page using loops and extract the description.

In [10]:
# Base address of the per-job detail pages. It has its own name so that the
# notebook-level `url` set in Step 2 is no longer overwritten by this loop.
detail_base_url='https://realpython.github.io/fake-jobs/jobs'

# Turns a job title into the slug used in the detail-page file name:
# spaces and '/' become '-', while ',', '(' and ')' are dropped.
slug_table=str.maketrans({' ':'-','/':'-',',':None,'(':None,')':None})

job_description=[]
# A single session reuses the HTTP connection across all detail-page requests.
with requests.Session() as session:
    for index,title in enumerate(jobs):
        slug=title.lower().translate(slug_table)
        # Detail pages are named '<slug>-<position in the listing>.html'.
        detail_url=f'{detail_base_url}/{slug}-{index}.html'
        detail_response=session.get(detail_url,timeout=30)
        # Fail loudly if a generated slug does not exist (e.g. HTTP 404)
        # instead of parsing an error page and storing a wrong description.
        detail_response.raise_for_status()
        detail_soup=BeautifulSoup(detail_response.text,'html.parser')
        # NOTE(review): class_='' and id='' appear to select only <p> tags that
        # carry neither a class nor an id (the output shows one bare <p> per
        # page) - confirm against the installed BeautifulSoup version.
        job_description.append(detail_soup.find_all('p',class_='',id=''))
job_description

[[<p>Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.</p>],
 [<p>Party prevent live. Quickly candidate change although. Together type music hospital. Every speech support time operation wear often.</p>],
 [<p>Administration even relate head color. Staff beyond chair recently and off. Own available buy country store build before. Already against which continue. Look road article quickly. International big employee determine positive go Congress. Level others record 

### Step 9: Pass the main content from the extracted description into final_job_des

In [11]:
# Build exactly one description string per job so that final_job_des[i] always
# belongs to jobs[i]. Flattening every <p> into a single list would shift all
# later rows whenever a page yields zero or several matching paragraphs.
final_job_des=[]
for paragraphs in job_description:
    # get_text() also copes with paragraphs that contain nested tags, where
    # .string would be None; several paragraphs are joined with a space and a
    # page without any matching paragraph contributes an empty string.
    final_job_des.append(' '.join(paragraph.get_text() for paragraph in paragraphs))
final_job_des

['Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.',
 'Party prevent live. Quickly candidate change although. Together type music hospital. Every speech support time operation wear often.',
 'Administration even relate head color. Staff beyond chair recently and off. Own available buy country store build before. Already against which continue. Look road article quickly. International big employee determine positive go Congress. Level others record hospital employee

***CHECK IF ALL THE CONTENTS IN THE VARIOUS LISTS ARE RELEVANT TO EACH OTHER***

In [12]:
# Label printed in front of each value, paired with the list the value comes from.
columns=(
    ('Job Profile:',jobs),
    ('Location:',new_location),
    ('Company:',company),
    ('Last Date:',date),
    ('Job Description:',final_job_des),
)
divider='___________________________________________________________________'

# Print the i-th entry of every list together so a reader can check that the
# separately scraped lists line up record by record.
for i in range(len(jobs)):
    for label,values in columns:
        print(label,values[i])
    print(divider)

Job Profile: Senior Python Developer
Location: Stewartbury, AA
Company: Payne, Roberts and Davis
Last Date: 2021-04-08
Job Description: Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.
___________________________________________________________________
Job Profile: Energy engineer
Location: Christopherville, AA
Company: Vasquez-Davidson
Last Date: 2021-04-08
Job Description: Party prevent live. Quickly candidate change although. Together type music hospital. Every 

Location: Port Coryton, AE
Company: Morgan, White and Macdonald
Last Date: 2021-04-08
Job Description: Get poor herself population. Executive will network physical play. Child push it why research wind yard. Return few natural fight stand one significant get. Keep kid assume dinner former heavy.
___________________________________________________________________
Job Profile: Editorial assistant
Location: Amyborough, AA
Company: Robinson-Fitzpatrick
Last Date: 2021-04-08
Job Description: Would truth join issue face key when. City citizen to. List run safe figure. Allow star forward leave art practice.
___________________________________________________________________
Job Profile: Photographer
Location: Reynoldsville, AA
Company: Waters, Wilson and Hoover
Last Date: 2021-04-08
Job Description: Some before according. Around almost amount themselves test. Power one after score career big remember.
___________________________________________________________________
Job Profile: Retail bank

### Step 10: Make a list that groups the relevant data for each job into one element, i.e., [Job Name, Company Name, Location, Date Posting, Job Description].

In [13]:
# One row per job, in the same column order as the CSV header:
# [job name, company, location, date, description].
details=[
    [jobs[i],company[i],new_location[i],date[i],final_job_des[i]]
    for i in range(len(jobs))
]

### Step 11: Open a .csv file in Write mode.

In [14]:
# newline='' is what the csv module documentation asks for on files handed to
# csv.writer: the writer emits its own '\r\n' line endings, and without this
# argument platforms that translate '\n' on write (e.g. Windows) produce
# '\r\r\n', which shows up as a blank row after every record.
# The file stays open until job_file.close() is called in a later cell.
job_file=open('fake_job.csv','w',newline='')

### Step 12: Create a columns field.

In [15]:
# Header row of the CSV; the order matches the rows built in `details`.
fields=[
    'Job Profile',
    'Company',
    'Location',
    'Date',
    'Job Description',
]

### Step 13: Create a writer object for the job_file

In [16]:
# Wrap the open file in a csv writer; 'excel' is the csv module's default
# dialect, spelled out here for the reader.
job_file_writer=csv.writer(job_file,dialect='excel')

### Step 14: Write the fields first, in order to create the columns.

In [17]:
# Write the header row. writerow() passes back the return value of the file's
# write() call, i.e. the number of characters written, which is what the cell
# displays as its output.
header_chars=job_file_writer.writerow(fields)
header_chars

51

### Step 15: Write the data into the rows of the columns created in the Step 14.

In [18]:
# Write one CSV line per job record, in listing order.
for record in details:
    job_file_writer.writerow(record)

### Step 16: Close the file and save the notebook.

In [19]:
# Closing flushes the buffered rows to disk; the guard only makes explicit
# that closing an already-closed file does nothing.
if not job_file.closed:
    job_file.close()

### Step 17: Open the fake_job.csv file, select the Date column and, from Format Cells, change its data type to 'Date'; then select and expand the Job Description column and click Wrap Text.