In [691]:
import requests
import re
from bs4 import BeautifulSoup
import time
import pandas as pd
import sys
import os
import json

In [470]:
mysession=requests.Session()
html_text=mysession.get('https://www.onetonline.org/link/summary/11-3051.00').text
soup = BeautifulSoup(html_text, 'html.parser') 

In [883]:
class HTMLparser:
    def __init__(self, HTMLdoc):
        self.root = BeautifulSoup(HTMLdoc, 'html.parser')
        for t in self.root(['script', 'style', 'meta', 'link', 'head', 'img']):
            t.extract()
        self.headSection = self.root.find('div', {'id': 'content'})
        self.types = {'A': ['Tasks',
                            'Knowledge',
                            'Skills',
                            'Abilities',
                            'WorkActivities',
                            'DetailedWorkActivities',
                            'WorkContext',
                            'WorkStyles'],
                      'B': ['ToolsTechnology'],
                      'C': ['Job Zone','Wages & Employment Trends'],
                      'D': ['Education'],
                      'E': ['Interests', 'Work Values'],
                      'F': ['Occupation listing']
                      }
                
    def display(self, *tag):
        if not tag:
            tag = self.root
        print(tag.prettify())
        
    def head(self):
        headTag = self.headSection.span.text
        return headTag.split('-')[2].strip()
    
    def jobID(self):
        headTag = self.headSection.span.text
        return headTag.split()[0].strip()
    
    def description(self):
        return self.headSection.find_all('p')[0].text
    
    def titles(self):
        return [i.strip() for i in re.findall('([a-zA-Z0-9 ]+),', self.headSection.find_all('p')[1].text)]
    
    def dictParser(self, text, convertList=False):
        newDict = {}
        text = re.sub('[^\x00-\x7F]', '', text)
        text = re.sub('[\"\']', '', text)
        for i in text.split('\n'):
            if i:
                try:
                    newPair = i.split('â€”')
                    newDict[newPair[0].strip()] = newPair[1].strip()
                except IndexError:
                    newDict[newPair[0].strip()] = None
        if convertList and not any(newDict.values()):
            newDict = list(newDict)
        return newDict
    
    def tableParser(self, trs):
        newDict = {}
        for tr in trs:
            tds = tr.find_all('td')
            try:
                key = re.sub('[^\x00-\x7F]', '', tds[0].text) 
                key = text = re.sub('[\"\']', '', key)
            except IndexError:
                continue
            if key:
                try: 
                    value = re.sub('[^\x00-\x7F]', '', tds[1].text) 
                    value = re.sub('[\"\']', '', value)
                    newDict[key.strip()] = value.strip()        
                except IndexError:
                    value = None
                    newDict[key.strip()] = value  
        return newDict
      
    def typeAParser(self, section):  # list 1 section
        try:
            headTag = self.headSection.find('div', {'class': 'section_' + section}, recursive=False)
            return self.dictParser(headTag.find_all('ul')[0].text)
        except AttributeError:
            return None   
    
    def typeBParser(self, section):  # list 2 sections
        try:
            headTag = self.headSection.find('div', {'class': 'section_' + section}, recursive=False)
            uls = headTag.find_all('ul')
            for ul in uls:
                yield self.dictParser(ul.text)
            yield None
        except AttributeError:
            for i in range(3):
                yield None
    
    def typeCParser(self, section):  # table no headline
        try:
            headTag = self.headSection.find('table', 
                                            {'summary': section + ' information for this occupation'}, 
                                            recursive=True)
            tableItem = headTag.find_all('tr')
            return self.tableParser(tableItem)
        except AttributeError:
            return None
        
    def typeDParser(self, section): # table with headline
        try:
            headTag = self.headSection.find('table', 
                                            {'summary': section + ' information for this occupation'}, 
                                            recursive=True)
            tableItem = headTag.find_all('tr')[1:]
            return self.tableParser(tableItem)
        except AttributeError:
            return None
        
    def typeEParser(self, section):  # find in next closet siblings
        try:
            nextNode = self.headSection.find('h3', text=section, recursive=False)
            while True:
                nextNode = nextNode.nextSibling
                if nextNode.name == 'ul':
                    return self.dictParser(nextNode.text)
        except AttributeError:
            return None
            
    def typeFParser(self, section):  # table no headline
        try:
            headTag = self.headSection.find('table', 
                                            {'summary': section}, 
                                            recursive=True)
            tableItem = headTag.find_all('tr')
            return self.tableParser(tableItem)
        except AttributeError:
            return None
            
    def parse(self):
        self.features = {}
        self.features['id'] = self.jobID()
        self.features['name'] = self.head()
        self.features['description'] = self.description()
        self.features['titles'] = self.titles()
        for oneType, sections in self.types.items():
            if oneType == 'B':
                try:
                    self.features['Tools'], self.features['Technology'] = list(self.typeBParser(sections[0]))[0:2]
                except AttributeError:
                    self.features['Tools'], self.features['Technology'] = (None, None)
            else:
                for oneSection in sections:
                    try:
                        self.features[oneSection] = getattr(self, 'type' + oneType + 'Parser')(oneSection)
                    except AttributeError:
                        self.features[oneSection] = None
        return self.features
        

In [884]:
def readCodes(fname='ONET_code.csv'):
    df = pd.read_csv(fname, header=None)
    return df[1]

df = readCodes()

def readDescrip(df):
    datalist = []
    session = requests.Session()
    mainUrl = 'https://www.onetonline.org/link/summary/'
    for code in df:
        print(code)
        time.sleep(0.1)
        codehtml = session.get(mainUrl + str(code))
        datalist.append(HTMLparser(codehtml.text).parse())
    alljobs = pd.DataFrame(datalist)
    alljobs.to_csv('jobdescription.csv')
        
readDescrip(df)
        

11-1011.00
11-1011.03
11-1021.00
11-2011.00
11-2021.00
11-2022.00
11-2031.00
11-3011.00
11-3021.00
11-3031.01
11-3031.02
11-3051.00
11-3051.01
11-3051.02
11-3051.03
11-3051.04
11-3051.06
11-3061.00
11-3071.01
11-3071.02
11-3071.03
11-3111.00
11-3121.00
11-3131.00
11-9013.01
11-9013.02
11-9013.03
11-9021.00
11-9031.00
11-9032.00
11-9033.00
11-9039.01
11-9039.02
11-9041.00
11-9041.01
11-9051.00
11-9061.00
11-9071.00
11-9081.00
11-9111.00
11-9121.00
11-9121.01
11-9121.02
11-9131.00
11-9141.00
11-9151.00
11-9161.00
11-9199.01
11-9199.02
11-9199.03
11-9199.04
11-9199.07
11-9199.08
11-9199.09
11-9199.10
11-9199.11
13-1011.00
13-1021.00
13-1022.00
13-1023.00
13-1031.01
13-1031.02
13-1032.00
13-1041.01
13-1041.02
13-1041.03
13-1041.04
13-1041.06
13-1041.07
13-1051.00
13-1071.00
13-1074.00
13-1075.00
13-1081.00
13-1081.01
13-1081.02
13-1111.00
13-1121.00
13-1131.00
13-1141.00
13-1151.00
13-1161.00
13-1199.01
13-1199.02
13-1199.03
13-1199.04
13-1199.05
13-1199.06
13-2011.01
13-2011.02
13-2021.01

In [885]:
df = pd.read_csv('jobdescription.csv', index_col=0)

In [886]:
def cellParser(string):
    if isinstance(string, (int, float, complex, list, dict, set)):
        return string
    if str(string) == 'nan':
        return None
    if re.search('^\[.*\]$', string):
        return [i.strip() for i in re.sub('(\'|\")', '', string[1:-1]).split(',')]
    if re.search('^\{.*\}$', string):
        string = re.sub('(?<=\W)\'(?!=\W)', '\"', string)
        string = re.sub('(?<!\W)\'(?=\W)', '\"', string)
        string = re.sub('None', '\"None\"', string)
        string = re.sub('(?<!\W),', ' ', string)
        try:
            return json.loads(string)
        except:
            print(string)
            print()
            print()
    return string

In [887]:
for column in df:
    df[column] = df[column].apply(lambda x: cellParser(x))

In [888]:
df.head()

Unnamed: 0,Abilities,DetailedWorkActivities,Education,Interests,Job Zone,Knowledge,Occupation listing,Skills,Tasks,Technology,Tools,Wages & Employment Trends,Work Values,WorkActivities,WorkContext,WorkStyles,description,id,name,titles
0,{'Speech Recognition The ability to identify ...,"{'Prepare operational budgets.': 'None', 'Advi...","{'26': 'Masters degree', '22': 'Bachelors degr...",{'Conventional Conventional occupations frequ...,{'Job Zone Examples': 'These occupations often...,{'Mathematics Knowledge of arithmetic algebr...,"{'11-3061.00': 'Purchasing Managers', '11-9111...",{'Management of Material Resources Obtaining ...,{'Implement corrective action plans to solve o...,{'Financial analysis software Microsoft FRx':...,"{'Desktop computers': 'None', 'Notebook comput...",{'Median wages (2015)': '$84.19 hourly $175 1...,{'Independence Occupations that satisfy this ...,{'Analyzing Data or Information Identifying t...,{'Deal With Unpleasant or Angry People 41% re...,{'Concern for Others Job requires being sensi...,Determine and formulate policies and provide o...,11-1011.00,Chief Executives,"[Chief Nursing Officer, Executive Director, Op..."
1,{'Speech Recognition The ability to identify ...,{'Identify opportunities for green initiatives...,"{'42': 'Bachelors degree', '31': 'Masters degr...",{'Conventional Conventional occupations frequ...,{'Job Zone Examples': 'These occupations often...,{'Engineering and Technology Knowledge of the...,,{'Coordination Adjusting actions in relation ...,{'Supervise employees or volunteers working on...,{'Calendar and scheduling software Scheduling...,{'Photocopiers Photocopying equipment': 'None...,{'Median wages (2015)': '$84.19 hourly $175 1...,{'Independence Occupations that satisfy this ...,{'Analyzing Data or Information Identifying t...,{'Indoors Environmentally Controlled 77% res...,{'Concern for Others Job requires being sensi...,"Communicate and coordinate with management, sh...",11-1011.03,Chief Sustainability Officers,"[Supply Chain Manager, Vice President]"
2,{'Speech Recognition The ability to identify ...,{'Manage environmental sustainability projects...,"{'29': 'Bachelors degree', '15': 'High school ...",{'Social Social occupations frequently involv...,{'Job Zone Examples': 'Many of these occupatio...,{'Mathematics Knowledge of arithmetic algebr...,"{'11-9051.00': 'Food Service Managers', '11-30...",{'Management of Material Resources Obtaining ...,{'Direct administrative activities directly re...,{'Transaction security and virus protection so...,{'Magnetic stripe readers and encoders Credit...,{'Median wages (2015)': '$46.99 hourly $97 73...,{'Independence Occupations that satisfy this ...,{'Analyzing Data or Information Identifying t...,{'Importance of Repeating Same Tasks 35% resp...,{'Attention to Detail Job requires being care...,"Plan, direct, or coordinate the operations of ...",11-1021.00,General and Operations Managers,"[Business Manager, Facilities Manager, Facilit..."
3,{'Speech Recognition The ability to identify ...,"{'Prepare operational budgets.': 'None', 'Esta...","{'14': 'Some college no degree', '54': 'Bache...",{'Conventional Conventional occupations frequ...,{'Job Zone Examples': 'Many of these occupatio...,{'Communications and Media Knowledge of media...,"{'27-1011.00': 'Art Directors', '13-1121.00': ...",{'Coordination Adjusting actions in relation ...,{'Read trade journals and professional literat...,{'Analytical or scientific software Business ...,"{'Tablet computers': 'None', 'Scanners': 'None...",{'Median wages (2015)': '$46.10 hourly $95 89...,{'Independence Occupations that satisfy this ...,{'Getting Information Observing receiving a...,{'Importance of Being Exact or Accurate 51% r...,{'Concern for Others Job requires being sensi...,"Plan, direct, or coordinate advertising polici...",11-2011.00,Advertising and Promotions Managers,"[Account Executive, Advertising Director, Adve..."
4,{'Speech Recognition The ability to identify ...,{'Develop marketing plans or strategies.': 'No...,"{'56': 'Bachelors degree', '11': 'Professional...",{'Conventional Conventional occupations frequ...,{'Job Zone Examples': 'Many of these occupatio...,{'Mathematics Knowledge of arithmetic algebr...,"{'11-3061.00': 'Purchasing Managers', '11-3131...",{'Operations Analysis Analyzing needs and pro...,{'Compile lists describing product or service ...,{'Expert system software Oracle Beehive': 'No...,"{'Tablet computers': 'None', 'Scanners': 'None...",{'Median wages (2015)': '$61.90 hourly $128 7...,{'Independence Occupations that satisfy this ...,{'Analyzing Data or Information Identifying t...,{'Freedom to Make Decisions 49% responded Som...,{'Concern for Others Job requires being sensi...,"Plan, direct, or coordinate marketing policies...",11-2021.00,Marketing Managers,"[Account Supervisor, Brand Manager, Business D..."


In [896]:
df['Education'].iloc[0]

{'22': 'Bachelors degree', '26': 'Masters degree'}