In [4]:
import requests
import math
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

### Extracting Locations using BeautifulSoup

In [4]:
url = 'https://www.themuse.com/developers/api/v2'
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')

In [3]:
# Extracting all locations from the wesbite
location_html = soup.find(class_='form-group expansion',attrs={"data-argument-name":"location"})
all_locations=[]
for loc in location_html.find_all('label'):
    all_locations.append(loc.text)

In [4]:
# Filtering locations in USA using regex
pattern=re.compile('.*, [A-Z]{2}$')
usa_locations=list(filter(pattern.match,all_locations))
usa_locations

['Aberdeen, MD',
 'Aberdeen, SD',
 'Aberdeen, WA',
 'Abilene, TX',
 'Accord, NY',
 'Acton, CA',
 'Ada, OK',
 'Addison, IL',
 'Addison, TX',
 'Adelanto, CA',
 'Adrian, MI',
 'Advance, NC',
 'Agawam, MA',
 'Agoura Hills, CA',
 'Ahwatukee, AZ',
 'Aiken, SC',
 'Akron, OH',
 'Alameda, CA',
 'Alamogordo, NM',
 'Albany, GA',
 'Albany, NY',
 'Albany, OR',
 'Albemarle, NC',
 'Albert Lea, MN',
 'Albuquerque, NM',
 'Alcoa, TN',
 'Alexandria, LA',
 'Alexandria, VA',
 'Alice, TX',
 'Allen Park, MI',
 'Allentown, PA',
 'Allen, TX',
 'Alliance, OH',
 'Alma, MI',
 'Alpena, MI',
 'Alpharetta, GA',
 'Altamont, OR',
 'Alton, IL',
 'Altoona, PA',
 'Altus, OK',
 'Amarillo, TX',
 'Amboy, IL',
 'American Canyon, CA',
 'American Fork, UT',
 'Americus, GA',
 'Ames, IA',
 'Amherst, MA',
 'Amsterdam, NY',
 'Anaheim, CA',
 'Anchorage, AK',
 'Anderson, IN',
 'Anderson, SC',
 'Andover, CT',
 'Andover, MA',
 'Angleton, TX',
 'Anna Maria, FL',
 'Annapolis Junction, MD',
 'Annapolis, MD',
 'Ann Arbor, MI',
 'Ansonia, 

In [5]:
# Storing all the USA locations extracted into file
def write_locations(locations):
    print("Started writing list data into a json file")
    with open("USA_Locations.json", "w") as fp:
        json.dump(locations, fp)
        print("Done writing JSON data into .json file")
write_locations(usa_locations)

Started writing list data into a json file
Done writing JSON data into .json file


In [5]:
df=pd.read_json('USA_Locations.json')
usa_locations=(df[0]).to_list()

In [4]:
def func(item):
    lst=item.split(',')
    state=lst[1]
    return state.strip()
states=[]
for loc in usa_locations:
    state=func(loc)
    if state not in states:
        states.append(state)
states

['MD',
 'SD',
 'WA',
 'TX',
 'NY',
 'CA',
 'OK',
 'IL',
 'MI',
 'NC',
 'MA',
 'AZ',
 'SC',
 'OH',
 'NM',
 'GA',
 'OR',
 'MN',
 'TN',
 'LA',
 'VA',
 'PA',
 'UT',
 'IA',
 'AK',
 'IN',
 'CT',
 'FL',
 'WI',
 'AL',
 'NJ',
 'CO',
 'ME',
 'RI',
 'NH',
 'NE',
 'AR',
 'DE',
 'MT',
 'MS',
 'ND',
 'ID',
 'NV',
 'KY',
 'MO',
 'VT',
 'ON',
 'WY',
 'WV',
 'KS',
 'PR',
 'HI',
 'BC',
 'SK',
 'DC']

In [5]:
actual_states = [ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
           'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
           'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
           'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
           'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
count=0
for state in states:
    if state not in actual_states:
        print(state)
    else:
        count+=1


ON
PR
BC
SK


### Extracting job details

In [46]:
# url = 'https://www.themuse.com/api/public/jobs'
# headers = {"Content-type": "application/json"}


# data=[]
# params = {'category':'Data and Analytics','page': '1', 'descending': 'True'}
# resp = requests.get(url, params=params, headers=headers ).json()
# page_count=resp['page_count']
# for page in range(0,99):
#     params = {'category':'Data and Analytics', 'page': page, 'descending': 'True'}
#     resp = requests.get(url, params=params, headers=headers).json()
#     try:
#         #for num in range(0,len(resp['results'])):
#         data.extend(resp['results'])
#         with open("Result_Data.json", "w") as fp:
#             json.dump(data, fp)
#     except Exception as e:
#             print(e)

In [1]:
def muse_api_call(category, location, sort_order="d", page=0):
    url = 'https://www.themuse.com/api/public/jobs'
    headers = {"Content-type": "application/json"}

    if sort_order in ["d", "desc", "descending"]: sort_value = "true"
    else: sort_value = "false"

    params = {'category':category, 'location': location, 'page': page, 'descending': sort_value}
    resp = requests.get(url, params=params, headers=headers).json()

    return resp

In [2]:
def get_all_muse_pages(category, location):
    results = {}
    loc_lmt = 600
    
    while loc_lmt-600 < len(location):
        resp = muse_api_call(category, location[loc_lmt-600:loc_lmt])
        pages = resp['page_count']
        for page in range(math.ceil(pages/2)):
            order = ['d', 'a']
            for ele in order:
                resp = muse_api_call(category, location[loc_lmt-600:loc_lmt], ele, page)
                for num in range(len(resp['results'])):
                    results[resp['results'][num]['id']] = resp['results'][num]

        loc_lmt += 600

    return results

In [6]:
data_analytics_results = get_all_muse_pages('Data and Analytics', usa_locations)

In [8]:
data_science_results = get_all_muse_pages('Data Science', usa_locations)

In [7]:
with open('data_analytics_results.json', 'w') as new_file:
    new_file.write(json.dumps(data_analytics_results))

In [1]:
with open('data_science_results.json', 'w') as new_file:
    new_file.write(json.dumps(data_science_results))

NameError: name 'json' is not defined

- swathi