Our project will summarize the job market and help users position themselves in it. We will extract key qualifications and desired skills from job posting descriptions, both to identify jobs that a user is qualified for and to aggregate what each industry is looking for as a whole. We also hope to extract information about the top companies in a field and, where possible, match it to a user's data to generate cover letters or other insights.

In [None]:
import requests
import pandas as pd
from google.colab import files
from datetime import datetime
from IPython.display import display

In [None]:
#LinkedIn OAuth 2.0 Access Token AQXbt06hTvIxoWRoO6yB__RML-IlRhxBnk31svPFy-QxFpYm2GyXUTs0jiwb39RELP9vTIBIAyuFs4MKF-vPyEGXLOFer_-fT1VJ002dA_e-QAz1YgG08zg8vl9uYuyJO3UZ5tAwN7bUXzrHTMxBKkFPYNpWliSWuAa0OnVHvZek4WW8K1veINY5tx9UCKeNV18VgjnvH0qRwtUEAjQCjFoKuywI-EbkOZbhalLjHZqqxo132IAahB5_0XeB--S9v0uEhHdVlCuMWstvjN4IOx88g0oSnCTWr2HZVm1A7GM82piI6j3SjKW59G86S9mr4rv9CFSr2QmhYDGj2TG9eGLzjYSipQ
#turns out the only things you can do with this token are post/comment/like, and get your own email/name/location/image
#https://www.linkedin.com/developers/tools/oauth/token-inspector?clientId=86g6obttwtb0k5
#https://www.jcchouinard.com/linkedin-api/

In [None]:
def getVal(d, ks): #can test with d = {1:{2:{3:{4:{5:6}}}}}, ks = [0,3]
  """Walk the nested dict d along the key path ks and return the value found.

  Args:
    d: the outermost dict (anything exposing a dict-style .get works).
    ks: sequence of keys, outermost first.

  Returns:
    The value stored under the last key. "N/A" is returned when any key on
    the path is missing, or when a value reached along the path has no
    .get (for example None or a string), so a partially filled record
    yields the placeholder instead of raising AttributeError.
    An empty ks returns None.
  """
  cur = d
  last = len(ks) - 1
  for i, k in enumerate(ks):
    # A non-mapping value (None, str, list, ...) cannot be descended into.
    if getattr(cur, "get", None) is None:
      return "N/A"
    if i == last:
      return cur.get(k, "N/A")
    cur = cur.get(k, {})
  # Only reached when ks is empty: there is no key to look up.
  return None

def unqFileName(base):
  """Return base + "_" + a timestamp + ".csv" so repeated exports do not collide.

  The timestamp is the current local time formatted as ddmmYYYY_HHMMSS
  (day, month, four-digit year, underscore, hour, minute, second).
  """
  # The separator and extension are literal text inside the strftime pattern.
  suffix = datetime.now().strftime("_%d%m%Y_%H%M%S.csv")
  return base + suffix

def downloadOrDisplay(file, df, head=False):
  """Ask the user whether to download df as a CSV or show it inline.

  Args:
    file: base name for the CSV; unqFileName appends a timestamp and ".csv".
    df: the DataFrame to export or display.
    head: when truthy, only df.head() is displayed instead of the whole frame.

  The prompt repeats until the (upper-cased) answer is exactly "Y" or "N".
  On "Y" the frame is written with to_csv and passed to files.download;
  on "N" it is rendered with display. Returns None.
  """
  answer = None
  while answer not in ("Y", "N"):
    answer = input("download? [y/n] --> ").upper()

  if answer == "N":
    display(df.head() if head else df)
    return

  csv_name = unqFileName(file)
  df.to_csv(csv_name, encoding='utf-8')
  files.download(csv_name)

In [None]:
def getAdzunaJobs(search_field, country_code, n):                                 #returns a pandas dataframe of up to n search_field jobs from country_code
  """Fetch up to n Adzuna job postings matching search_field in country_code.

  Pages through the Adzuna search endpoint starting at page 1 and stores one
  row per posting. Fields that getVal cannot find are stored as "N/A".

  Args:
    search_field: free-text query sent as the 'what' parameter.
    country_code: country segment of the API URL (e.g. "us").
    n: maximum number of postings to collect.

  Returns:
    A pandas DataFrame with the columns title, cat, comp, loc, desc, redURL,
    lon, lat, pstDate and salaryPredicted. It has at most n rows: fewer when
    the API returns a page with no results first, and none when n <= 0.

  Raises:
    requests.HTTPError: when the API answers with an error status.
  """
  #documentation: https://developer.adzuna.com/overview
  columns = ['title', 'cat', 'comp', 'loc', 'desc', 'redURL', 'lon', 'lat', 'pstDate', 'salaryPredicted']
  jobs = {column: [] for column in columns} #if using user picked filtration, can possibly keep track of which ones each job passes/fails and aggregate

  # NOTE(review): API credentials are hard-coded here; consider loading them from configuration instead.
  params = {
    'app_id': 'e25d9d98',
    'app_key': 'd3308f10a71b2ea8b049eafd60834da0',
    'what': search_field
  }
  # Boilerplate lead-ins stripped from descriptions.
  desc_tags = ["Summary: ", "Job Description ", "ROLE DESCRIPTION ", "Application Instructions ", "Position Purpose: "]

  count = 0
  pg = 1
  # "count < n" also covers n <= 0, which previously never terminated.
  while count < n:
    url = 'https://api.adzuna.com/v1/api/jobs/'+ country_code +'/search/' + str(pg)
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    results = resp.json().get('results', [])
    if not results:
      # The API has no more postings; stop instead of requesting pages forever.
      break
    for res in results:
      jobs['title'].append(getVal(res, ['title']))
      jobs['cat'].append(getVal(res, ['category', 'label']))
      jobs['comp'].append(getVal(res, ['company', 'display_name']))
      loc = getVal(res, ['location', 'area'])
      if loc == "N/A":
        # Fall back to the location's own display name when no area list is given.
        loc = [getVal(res, ['location', 'display_name'])]
      # 'area' is ordered broadest-first; reverse it so the most specific place leads.
      jobs['loc'].append(", ".join(reversed(loc)))
      desc = getVal(res, ['description']) #lots of potential for cleaning this up, extracting keywords, find most important parts, use thesaurus APIs, ...
      for tag in desc_tags:
        desc = desc.replace(tag, "")
      jobs['desc'].append(desc)
      jobs['redURL'].append(getVal(res, ['redirect_url']))
      jobs['lon'].append(getVal(res, ['longitude']))
      jobs['lat'].append(getVal(res, ['latitude'])) #can send to google maps, compare to user location
      # Keep only the date part of the timestamp (text before the "T").
      jobs['pstDate'].append(getVal(res, ['created']).split("T")[0])
      jobs['salaryPredicted'].append(getVal(res, ['salary_is_predicted']))
      count += 1
      if count >= n:
        break
    pg += 1
  return pd.DataFrame.from_dict(jobs)

In [None]:
def getAdzunaComps(search_field, country_code, locs=None):                        #returns a pandas dataframe of the top search_field companies in country_code, nationally + for each location in locs
  """Fetch Adzuna's top-company leaderboard for search_field, nationally and per location.

  Args:
    search_field: free-text query sent as the 'what' parameter.
    country_code: country segment of the API URL (e.g. "us").
    locs: list of location paths, each a list ordered broadest-first
      (e.g. ['US', 'Colorado', 'Denver']); an empty list means no location
      filter. Defaults to a national query plus three hard-coded US locations.

  Returns:
    A pandas DataFrame with the columns company, count and location, where
    location is the path joined most-specific-first ("" for the national query).

  Raises:
    requests.HTTPError: when the API answers with an error status.
  """
  #TODO : i don't think i can get more than 5 for national, but i can get different locations
  #documentation: https://developer.adzuna.com/overview
  if locs is None:
    locs = [[], ['US', 'Georgia', 'Madison County'],['US', 'Colorado', 'Denver'], ['US', 'California', 'San Diego County']]   #looks like if specified location, it might give between 1-5

  comps = {field: [] for field in ['company', 'count', 'location']}
  url = 'https://api.adzuna.com/v1/api/jobs/'+ country_code +'/top_companies/'

  for loc in locs:
    locStr = ", ".join(reversed(loc))
    # NOTE(review): API credentials are hard-coded here; consider loading them from configuration instead.
    params = {
      'app_id': 'e25d9d98',
      'app_key': 'd3308f10a71b2ea8b049eafd60834da0',
      'what': search_field
    }
    for i, loci in enumerate(loc):
      params['location'+str(i)] = loci #location0 = loc[0]
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    # A response without a leaderboard contributes no rows for this location.
    for item in resp.json().get('leaderboard', []):
      comps['company'].append(getVal(item, ['canonical_name']))
      comps['count'].append(getVal(item, ['count']))
      comps['location'].append(locStr)
  return pd.DataFrame.from_dict(comps)

In [None]:
# Pull 10 US "data science" postings. Author's timing note: 50 results takes 4-8 seconds, 200 results takes 20-40 seconds.
adzJobsDf = getAdzunaJobs(search_field="data science", country_code="us", n=10)

In [None]:
# Offer the jobs table as a CSV download; otherwise show only its first rows.
downloadOrDisplay(file="adzunaJobs", df=adzJobsDf, head=True)

download? [y/n] --> n


Unnamed: 0,title,cat,comp,loc,desc,redURL,lon,lat,pstDate,salaryPredicted
0,Data Science Manager,IT Jobs,Takeda Pharmaceutical,"Winder, Barrow County, Georgia, US","By clicking the “Apply” button, I understand t...",https://www.adzuna.com/land/ad/3562281429?se=7...,-83.720171,33.99261,2022-10-06,0
1,Data Science Manager,IT Jobs,Takeda Pharmaceutical,"Madison County, Georgia, US","By clicking the “Apply” button, I understand t...",https://www.adzuna.com/land/ad/3562281884?se=7...,-83.196478,34.076123,2022-10-06,0
2,Data Science Manager,IT Jobs,Takeda Pharmaceutical,"Oxford, Newton County, Georgia, US","By clicking the “Apply” button, I understand t...",https://www.adzuna.com/land/ad/3562282016?se=7...,-83.867405,33.619003,2022-10-06,0
3,Data Science Manager,IT Jobs,Takeda Pharmaceutical,"Social Circle, Walton County, Georgia, US","By clicking the “Apply” button, I understand t...",https://www.adzuna.com/land/ad/3562281392?se=7...,-83.718231,33.656228,2022-10-06,0
4,Data Science Manager,IT Jobs,Takeda Pharmaceutical,"Lawrenceville, Gwinnett County, Georgia, US","By clicking the “Apply” button, I understand t...",https://www.adzuna.com/land/ad/3562281947?se=7...,-84.013,34.004652,2022-10-06,0


In [None]:
# Top "data science" companies in the US, as returned by getAdzunaComps.
adzCompsDf = getAdzunaComps(search_field="data science", country_code="us")

In [None]:
# Offer the companies table as a CSV download; otherwise show the whole frame.
downloadOrDisplay(file="adzunaComps", df=adzCompsDf)

download? [y/n] --> n


Unnamed: 0,company,count,location
0,Deloitte,6821,
1,IQVIA,3771,
2,PwC,1853,
3,UnitedHealth Group,1825,
4,KPMG,1794,
5,Takeda Pharmaceutical,1,"Madison County, Georgia, US"
6,Deloitte,64,"Denver, Colorado, US"
7,PwC,52,"Denver, Colorado, US"
8,KPMG,38,"Denver, Colorado, US"
9,salesforce.com,30,"Denver, Colorado, US"


In [None]:
from bs4 import BeautifulSoup
import json
import pandas as pd
import requests

def _museCellValue(key, value):
  """Flatten one field of a Muse job record into a single table cell."""
  if key == 'company':
    # company is read as a nested object; only its name is kept.
    return value.get('name', "N/A") if isinstance(value, dict) else "N/A"
  if key == 'categories':
    # categories is read as a list of objects; keep the first name and
    # tolerate an empty or missing list instead of raising IndexError.
    if isinstance(value, list) and value and isinstance(value[0], dict):
      return value[0].get('name', "N/A")
    return "N/A"
  return value


def theMuseScrape():
  """Fetch page 1 of internship-level jobs from The Muse public API and print them.

  Prints the HTTP response object, then a table with the name,
  publication_date, categories and company columns. Nested company and
  categories values are reduced to a single name ("N/A" when absent).
  Returns None.

  Raises:
    requests.HTTPError: when the API answers with an error status.
  """
  url = "https://www.themuse.com/api/public/jobs"
  params = {
      'page': '1',
      'level': 'Internship'
  }

  response = requests.get(url, params=params)
  print(response)
  response.raise_for_status()

  content = json.loads(response.content.decode('utf-8'))
  results = content['results']
  if not results:
    # Nothing to tabulate; indexing results[0] below would raise IndexError.
    print("No results returned by The Muse API")
    return

  # The first record's keys define the columns. Every row is then built by
  # key, so a record with a different key order cannot shift its values into
  # the wrong column.
  label = list(results[0])
  table = [label]
  for result in results:
    table.append([_museCellValue(key, result.get(key)) for key in label])

  # Row 0 holds the labels: promote it to the header and drop it from the data
  # (this leaves the printed index starting at 1).
  panda = pd.DataFrame(table)
  panda.columns = panda.iloc[0]
  panda = panda.drop(0)

  print(panda[['name','publication_date','categories','company']])

  
  

# Run the scrape; it prints the HTTP response object followed by the summary table.
theMuseScrape()

<Response [200]>
0                                                name      publication_date  \
1   DoD SkillBridge Internship - Client Service Ma...  2022-10-17T23:57:57Z   
2                         US Cybersecurity Internship  2022-10-08T23:43:11Z   
3                        Intern - Sales Administrator  2022-10-17T23:58:56Z   
4                     Admin Support Intern (Surabaya)  2022-11-03T23:37:02Z   
5                      Compensation & Benefits Intern  2022-10-17T23:56:47Z   
6                         Customer Finance Internship  2022-10-15T23:34:30Z   
7       Quality Control Intern (Education & Training)  2022-10-17T23:57:56Z   
8     FY23 Campus DI MR Scientific Intern  (北京/广州/成都)  2022-10-11T23:47:12Z   
9   FY23 Campus (Varian) Clinical Training Intern ...  2022-09-17T00:09:58Z   
10  DoD SkillBridge Internship - Project Managemen...  2022-09-01T23:43:31Z   
11  Operations Leadership Development Program Inte...  2022-10-24T23:36:20Z   
12  DOD SkillBridge Intern - Wareho