In [None]:
import requests
import re
from bs4 import BeautifulSoup
import time
import pandas as pd
import sys
import os
import json

In [None]:
# Fetch one sample occupation summary page for interactive exploration.
mysession = requests.Session()
sample_response = mysession.get('https://www.onetonline.org/link/summary/11-3051.00', timeout=30)
# Surface HTTP errors here rather than parsing an error page by mistake.
sample_response.raise_for_status()
html_text = sample_response.text
soup = BeautifulSoup(html_text, 'html.parser')

In [None]:
class HTMLparser:
    """Extract structured fields from one O*NET OnLine occupation summary page.

    The constructor parses the HTML, removes tags that carry no content, and
    locates the ``<div id="content">`` element that every accessor reads from.
    ``parse()`` runs all accessors and returns the results as a flat ``dict``.
    """

    # Characters removed from every extracted string.
    _NON_ASCII = re.compile(r'[^\x00-\x7F]')
    _QUOTES = re.compile('["\']')
    # Separator between an item's name and its description in list sections.
    _EM_DASH = '\u2014'

    def __init__(self, HTMLdoc):
        """Parse ``HTMLdoc`` and set up the section lookup table.

        Args:
            HTMLdoc: HTML source of a summary page, passed straight to
                BeautifulSoup.
        """
        self.root = BeautifulSoup(HTMLdoc, 'html.parser')
        # Remove non-content tags before any text is extracted.
        for t in self.root(['script', 'style', 'meta', 'link', 'head', 'img']):
            t.extract()
        # Every accessor searches inside this element; it is None when the
        # document has no <div id="content">.
        self.headSection = self.root.find('div', {'id': 'content'})
        # Parser kind letter -> section identifiers handled by the matching
        # ``type<letter>Parser`` method (see ``parse``).
        self.types = {'A': ['Tasks',
                            'Knowledge',
                            'Skills',
                            'Abilities',
                            'WorkActivities',
                            'DetailedWorkActivities',
                            'WorkContext',
                            'WorkStyles'],
                      'B': ['ToolsTechnology'],
                      'C': ['Job Zone', 'Wages & Employment Trends'],
                      'D': ['Education'],
                      'E': ['Interests', 'Work Values'],
                      'F': ['Occupation listing']
                      }

    def display(self, *tag):
        """Pretty-print the given tag(s), or the whole document when none are given."""
        # ``*tag`` arrives as a tuple, so each element is printed on its own;
        # calling ``prettify`` on the tuple itself would raise AttributeError.
        for node in (tag or (self.root,)):
            print(node.prettify())

    def head(self):
        """Return the occupation name from the heading text ``<major>-<minor> - <name>``."""
        headTag = self.headSection.span.text
        # Split on the first two hyphens only, so a hyphen inside the name
        # (e.g. "First-Line Supervisors") is kept instead of cutting it short.
        return headTag.split('-', 2)[2].strip()

    def jobID(self):
        """Return the first whitespace-delimited token of the heading (the occupation code)."""
        headTag = self.headSection.span.text
        return headTag.split()[0].strip()

    def description(self):
        """Return the text of the first ``<p>`` inside the content div."""
        return self.headSection.find_all('p')[0].text

    def titles(self):
        """Return the comma-terminated alphanumeric phrases of the second ``<p>``.

        NOTE(review): the pattern requires a trailing comma, so a final title
        that is not followed by one is not captured, and characters outside
        ``[a-zA-Z0-9 ]`` cut a title short - confirm against real pages.
        """
        listing = self.headSection.find_all('p')[1].text
        return [i.strip() for i in re.findall('([a-zA-Z0-9 ]+),', listing)]

    def _clean(self, text):
        """Return ``text`` with non-ASCII characters and quote marks removed."""
        return self._QUOTES.sub('', self._NON_ASCII.sub('', text))

    def dictParser(self, text, convertList=False):
        """Turn the text of a list element into a ``{name: description}`` dict.

        Each non-blank line becomes one entry. A line of the form
        ``name <em dash> description`` is split at the first em dash; a line
        without one maps to ``None``.

        Args:
            text: Newline-separated text of the list.
            convertList: When true and no entry has a description, return the
                list of names instead of a dict.

        Returns:
            dict (or list, see ``convertList``) of cleaned, stripped strings.
        """
        newDict = {}
        for line in text.split('\n'):
            # Skip lines that hold nothing once cleaned (blank or symbol-only).
            if not self._clean(line).strip():
                continue
            # The em dash is itself non-ASCII, so the split has to happen
            # BEFORE the non-ASCII characters are stripped; otherwise the
            # separator is gone and every value ends up None.
            name, sep, detail = line.partition(self._EM_DASH)
            key = self._clean(name).strip()
            newDict[key] = self._clean(detail).strip() if sep else None
        if convertList and not any(newDict.values()):
            newDict = list(newDict)
        return newDict

    def tableParser(self, trs):
        """Turn table rows into a ``{first cell: second cell}`` dict.

        Rows without any ``<td>`` and rows whose first cell is blank after
        cleaning are skipped; a row with a single cell maps to ``None``.

        Args:
            trs: Iterable of ``<tr>`` elements.
        """
        newDict = {}
        for tr in trs:
            tds = tr.find_all('td')
            if not tds:
                continue
            key = self._clean(tds[0].text).strip()
            if not key:
                continue
            newDict[key] = self._clean(tds[1].text).strip() if len(tds) > 1 else None
        return newDict

    def _sectionDiv(self, section):
        """Return the direct-child ``<div class="section_<section>">`` of the content div, or None."""
        return self.headSection.find('div', {'class': 'section_' + section}, recursive=False)

    def _parseTable(self, summary, skipHeader=False):
        """Parse the table whose ``summary`` attribute equals ``summary``; None when it is missing."""
        try:
            table = self.headSection.find('table', {'summary': summary}, recursive=True)
            rows = table.find_all('tr')
            if skipHeader:
                rows = rows[1:]
            return self.tableParser(rows)
        except AttributeError:
            return None

    def typeAParser(self, section):  # list 1 section
        """Parse the first ``<ul>`` of a section div; None when the div or the list is missing."""
        try:
            return self.dictParser(self._sectionDiv(section).find_all('ul')[0].text)
        except (AttributeError, IndexError):
            # IndexError: the section exists but contains no <ul>.
            return None

    def typeBParser(self, section):  # list 2 sections
        """Yield one parsed dict per ``<ul>`` of a section div, followed by ``None``.

        When the section div is missing, three ``None`` values are yielded instead.
        """
        try:
            for ul in self._sectionDiv(section).find_all('ul'):
                yield self.dictParser(ul.text)
            yield None
        except AttributeError:
            for i in range(3):
                yield None

    def typeCParser(self, section):  # table no headline
        """Parse every row of the '<section> information for this occupation' table."""
        return self._parseTable(section + ' information for this occupation')

    def typeDParser(self, section):  # table with headline
        """Parse the '<section> information for this occupation' table, skipping its first row."""
        return self._parseTable(section + ' information for this occupation', skipHeader=True)

    def typeEParser(self, section):  # find in next closest siblings
        """Parse the first ``<ul>`` sibling that follows the ``<h3>`` titled ``section``."""
        try:
            nextNode = self.headSection.find('h3', text=section, recursive=False)
            while True:
                # Walking past the last sibling gives None, whose attribute
                # access raises AttributeError and ends the search.
                nextNode = nextNode.next_sibling
                if nextNode.name == 'ul':
                    return self.dictParser(nextNode.text)
        except AttributeError:
            return None

    def typeFParser(self, section):  # table no headline
        """Parse every row of the table whose ``summary`` attribute is exactly ``section``."""
        return self._parseTable(section)

    def parse(self):
        """Run every accessor and return the collected features.

        Returns:
            dict with ``id``, ``name``, ``description``, ``titles``, ``Tools``,
            ``Technology`` and one key per section listed in ``self.types``
            (kind 'B' excepted); sections that cannot be found map to ``None``.
            The same dict is stored on ``self.features``.
        """
        self.features = {}
        self.features['id'] = self.jobID()
        self.features['name'] = self.head()
        self.features['description'] = self.description()
        self.features['titles'] = self.titles()
        for oneType, sections in self.types.items():
            if oneType == 'B':
                try:
                    parsed = list(self.typeBParser(sections[0]))
                except AttributeError:
                    parsed = []
                # Pad so the two-way unpack cannot fail when the section has
                # fewer than two lists.
                self.features['Tools'], self.features['Technology'] = (parsed + [None, None])[:2]
            else:
                for oneSection in sections:
                    try:
                        self.features[oneSection] = getattr(self, 'type' + oneType + 'Parser')(oneSection)
                    except AttributeError:
                        self.features[oneSection] = None
        return self.features
        

In [None]:
def readCodes(fname='ONET_code.csv'):
    """Read a header-less CSV file and return its second column.

    Args:
        fname: Path of the CSV file to read.

    Returns:
        pandas.Series holding the column labelled ``1``.
    """
    return pd.read_csv(fname, header=None)[1]

# Load the occupation codes from the default file ('ONET_code.csv').
df = readCodes()

def readDescrip(df, outFile='jobdescription.csv', delay=0.1, timeout=30):
    """Download and parse the summary page of every occupation code, then save the results.

    Args:
        df: Iterable of occupation codes; each one is appended to the summary URL.
        outFile: Path of the CSV file the parsed records are written to.
        delay: Seconds to pause before each request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.RequestException: On a connection failure, a timeout, or an
            HTTP error status.
    """
    mainUrl = 'https://www.onetonline.org/link/summary/'
    datalist = []
    # The context manager releases the pooled connections even if a request fails.
    with requests.Session() as session:
        for code in df:
            print(code)
            time.sleep(delay)
            codehtml = session.get(mainUrl + str(code), timeout=timeout)
            # Fail loudly instead of parsing an error page as if it were an occupation.
            codehtml.raise_for_status()
            datalist.append(HTMLparser(codehtml.text).parse())
    alljobs = pd.DataFrame(datalist)
    alljobs.to_csv(outFile)
        
# Scrape the summary page of every loaded code; the results are written to 'jobdescription.csv'.
readDescrip(df)
        

In [None]:
# Reload the saved records from disk; the first CSV column is used as the index.
df = pd.read_csv('jobdescription.csv', index_col=0)

In [None]:
def cellParser(string):
    """Convert one CSV cell back into a Python value.

    Args:
        string: The cell content. Non-string values are returned untouched,
            except NaN, which becomes ``None``.

    Returns:
        * ``None`` for a NaN float or the literal text ``'nan'``;
        * a list of stripped items for text of the form ``[...]`` (quotes are
          removed and the content is split on every comma);
        * a dict for text of the form ``{...}`` that decodes as JSON after the
          quote rewriting below (``None`` values come back as the string
          ``'None'``);
        * otherwise the text itself (the rewritten text if it looked like a
          dict but could not be decoded).
    """
    # NaN is a float, so it must be caught before the generic pass-through
    # below; ``x != x`` is true only for NaN.
    if isinstance(string, float) and string != string:
        return None
    # Anything that is not text (numbers, containers, None, ...) is already a
    # value and cannot be fed to the regular expressions.
    if not isinstance(string, str):
        return string
    if string == 'nan':
        return None
    if re.search(r'^\[.*\]$', string):
        inner = re.sub('[\'"]', '', string[1:-1])
        # An empty list must come back empty rather than as [''].
        if not inner.strip():
            return []
        return [item.strip() for item in inner.split(',')]
    if re.search(r'^\{.*\}$', string):
        # Rewrite the Python-style dict text into JSON: single quotes next to
        # punctuation become double quotes, and None becomes a JSON string.
        # NOTE(review): "(?!=\W)" reads like a typo for a plain lookahead, but
        # it is kept as is because the current form also converts the quotes
        # of empty-string values.
        string = re.sub(r"(?<=\W)'(?!=\W)", '"', string)
        string = re.sub(r"(?<!\W)'(?=\W)", '"', string)
        string = re.sub('None', '"None"', string)
        # Replace commas that directly follow a word character (i.e. commas
        # inside values) with a space.
        string = re.sub(r'(?<!\W),', ' ', string)
        try:
            return json.loads(string)
        except ValueError:
            # Could not be decoded: show the rewritten text for inspection and
            # fall through to return it unchanged.
            print(string)
            print()
            print()
    return string

In [None]:
# Decode every cell of every column from its CSV text form back into Python values.
for column in df.columns:
    df[column] = df[column].apply(cellParser)