In [1]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd



In [2]:
def get_job_description(soup):
    """Return the job description text of a parsed posting page.

    Looks up the first ``<div class="job-description-text">`` in ``soup``
    and returns its text, requesting newline-separated, whitespace-stripped
    fragments from ``get_text``.

    Returns an empty string when the page has no such div.
    """
    description_div = soup.find('div', class_='job-description-text')
    if not description_div:
        return ''
    return description_div.get_text(separator='\n', strip=True)

In [3]:
def get_job_tags(soup):
    """Return the list of tag labels shown on a parsed posting page.

    Scans every ``<p>`` in ``soup`` for the first one containing a
    ``<span class="h5">`` and returns the stripped text of each ``<span>``
    nested inside that span.

    Returns an empty list when no ``<p>`` contains such a span (previously
    this path raised ``UnboundLocalError`` because the result variable was
    never assigned).
    """
    for p_tag in soup.find_all('p'):
        h5_span = p_tag.find('span', class_='h5')
        if h5_span:
            # Only the first matching paragraph is used.
            return [span.get_text(strip=True) for span in h5_span.find_all('span')]

    # No tag container on the page: report "no tags" instead of crashing.
    return []

In [4]:
def get_company(soup):
    """Return the company name from a parsed posting page.

    The name is read from the first ``<h2 class="h5">`` inside the first
    ``<div class="flex-grow-1 ms-3">``. Returns an empty string when either
    element is missing.
    """
    container = soup.find('div', class_='flex-grow-1 ms-3')
    heading = container.find('h2', class_='h5') if container else None
    return heading.text if heading else ''

In [5]:
def _text_or_none(soup, tag_name, css_class):
    """Return the text of the first ``tag_name`` with ``css_class``, or None."""
    tag = soup.find(tag_name, class_=css_class)
    return tag.text if tag else None


def extract_job_data(url, timeout=30):
    """Download one job posting page and extract its fields into a dict.

    Parameters
    ----------
    url : str
        Address of the job posting page.
    timeout : float or tuple, optional
        Passed to ``requests.get``; seconds to wait for the server before
        giving up. Without it a stalled connection would block forever.

    Returns
    -------
    dict
        Keys, in order: 'Job Link', 'Job Title', 'Location', 'Company Name',
        'Job Description', 'Salary', 'Employment Type', 'Level', 'Job Tags'.
        Fields read directly from a single tag are None when the tag is
        absent; the remaining fields come from the get_* helpers.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    # Send a GET request to the URL; bound the wait so one dead host
    # cannot hang an entire scraping run.
    response = requests.get(url, timeout=timeout)
    # Parse the HTML content of the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the required data. Insertion order is kept deliberately: it
    # determines the column order when the dicts are loaded into a DataFrame.
    return {
        'Job Link': url,
        'Job Title': _text_or_none(soup, 'h1', 'display-5 mt-4 text-break'),
        'Location': _text_or_none(soup, 'h3', 'lead py-3'),
        'Company Name': get_company(soup),
        'Job Description': get_job_description(soup),
        'Salary': _text_or_none(soup, 'span', 'badge rounded-pill text-bg-success my-1'),
        'Employment Type': _text_or_none(soup, 'span', 'badge rounded-pill text-bg-secondary my-1'),
        'Level': _text_or_none(soup, 'span', 'badge rounded-pill text-bg-info my-1'),
        'Job Tags': get_job_tags(soup),
    }


In [6]:
# Run the scraper on a single posting and display the resulting dict as the
# cell output, as a quick manual check of the extracted fields.
cipa = extract_job_data('https://ai-jobs.net/job/79595-openaigpt3-expert-aiml-developer/')
cipa

{'Job Link': 'https://ai-jobs.net/job/79595-openaigpt3-expert-aiml-developer/',
 'Job Title': 'OpenAI/GPT3 Expert / AI/ML Developer',
 'Location': 'Remote',
 'Company Name': 'ModelDiplomat',
 'Job Description': 'ModelDiplomat:\nBuild a prototype for a new AI coach for high school and college students involved in Model United Nations. Model United Nations is a global organization that helps students learn global leadership skills, by researching and representing the points of views of specific countries in conferences that simulate United Nations sessions. After several months of user and market research, we now are looking to build a prototype to get feedback from Beta testers on the product concept.\nWho are we looking for?\nWe are looking for an agile AI Developer who is willing to build a rapid prototype (1-2 months) of this AI tool, so that we can launch the beta in advance of many global Model UN conferences which usually run from January - March. We believe this will be the best 

In [7]:
# Load the collection of posting links to scrape. The encoding is pinned to
# UTF-8 so the read does not depend on the platform's locale default.
with open('job_posting_data/job_links.json', encoding='utf-8') as json_file:
    links = json.load(json_file)

# One extracted record (dict) per link, in the same order as `links`.
json_list = []

for index, link in enumerate(links):
    print(index)  # progress indicator: position of the link being fetched
    json_list.append(extract_job_data(link))

# Build the table in one step from the list of records and persist it.
df = pd.DataFrame(json_list)
df.to_csv('output.csv', index=True)




0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
