In [90]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import quote_plus, urlencode, urlparse, urlunparse
import pandas as pd
import re
import json
import lxml.html

In [144]:
data_path = "CleanedDescription.csv"
df = pd.read_csv(data_path)

In [145]:
df.columns.values

array(['Unnamed: 0', 'IBCSplitCode', 'IBCCodeDescription', 'CleanText'], dtype=object)

# Fetch Wikipedia pages

For every descriptor (leaf) we will fetch wikipedia entries returned by bing

In [162]:
code_url_map = {}

with open('codes_url_map.txt', 'w') as f:
    for code, description in zip(df.IBCSplitCode, df.CleanText.values):
        clean_description = re.sub(r"\b(noc|\n\.o\.c\.|incl|etc\.|_|\(|\)|e\.g\.|other|otherthanpoultry|coc)\b", "", description)
        clean_description = re.sub(r"\bmanufactur\.\b", "manufacturing", clean_description)
        clean_description = re.sub(r"otherthan.*$", "", clean_description)
        
        payload = {"q":description}
        result = urlencode(payload, quote_via=quote_plus)
        r = urlopen("http://www.bing.com/search?" + result).read()
        dom =  lxml.html.fromstring(r)
        url_list = dom.xpath('//a/@href')
        code_url_map[code] = url_list
        
        f.write(code + ":" + str(url_list) + "\n")

In [170]:
import pickle

pickle.(code_url_map, open('code_url_map.pkl', 'wb'))

# Clean url map

## We get the wikipedia links

In [23]:
import pickle

code_url_map = pickle.load(open('code_url_map.pkl', 'rb'))

In [24]:
len(code_url_map)

1203

In [25]:
for key in code_url_map.keys():
    code_url_map[key] = [urlparse(url).netloc + urlparse(url).path for url in code_url_map[key] if "wikipedia" in url]

In [28]:
code_url_map = {k: list(set(v)) for k, v in code_url_map.items() if v}

In [105]:
code_url_map

{'0121a00': ['en.wikipedia.org/wiki/Nut_(fruit)'],
 '0121b00': ['simple.wikipedia.org/wiki/List_of_fruits',
  'en.wikipedia.org/wiki/Fruitarianism'],
 '0121d00': ['en.wikipedia.org/wiki/Potato_farm'],
 '0122a00': ['en.wikipedia.org/wiki/Cover_crop'],
 '0123a00': ['en.wikipedia.org/wiki/Greenhouse'],
 '0130a00': ['en.wikipedia.org/wiki/Dairy',
  'en.wikipedia.org/wiki/Dairy_farming'],
 '0140a00': ['en.wikipedia.org/wiki/Horse_breeding'],
 '0150a20': ['en.wikipedia.org/wiki/Egg_(food)',
  'en.wikipedia.org/wiki/Poultry_farming'],
 '0150z00': ['en.wikipedia.org/wiki/Poultry_farming',
  'en.wikipedia.org/wiki/Poultry'],
 '0160a00': ['en.wikipedia.org/wiki/Beefalo'],
 '0170a00': ['en.wikipedia.org/wiki/Hog_Farm',
  'en.wikipedia.org/wiki/Pig_farming'],
 '0190a00': ['en.wikipedia.org/wiki/Goat-sheep_chimerae',
  'en.wikipedia.org/wiki/Special:Random',
  'en.wikipedia.org/wiki/Wikipedia:Contact_us',
  'en.wikipedia.org/wiki/Sheep%E2%80%93goat_hybrid',
  'en.wikipedia.org/wiki/Sheep–goat_hybri

# Fetch Wikipedia Text

In [99]:
query = "Goat-sheep_chimerae"
req = urlopen("https://en.wikipedia.org/w/api.php?format=json&origin=*&action=query&prop=extracts&exlimit=max&redirects=true&titles="+query)

In [100]:
response = json.loads(req.read().decode("utf-8"))
response['query']['pages']

{'734858': {'extract': '<p>A <b>sheep–goat chimera</b> (sometimes called a <b>geep</b> in popular media) is a chimera produced by combining the embryos of a goat and a sheep; the resulting animal has cells of both sheep and goat origin. A sheep-goat chimera should not be confused with a sheep-goat hybrid, which can result when a goat mates with a sheep.</p>\n<h2><span id="History">History</span></h2>\n<p>The first sheep-goat chimeras were created by researchers at the Institute of Animal Physiology in Cambridge, England by combining sheep embryos with goat embryos. They reported their results in 1984. The successful chimeras were a mosaic of goat and sheep tissue. The parts that grew from the sheep embryo were woolly. Those that grew from the goat embryo were hairy.</p>\n<h2><span id="Characteristics">Characteristics</span></h2>\n<p>In a chimera, each set of cells (germ line) keeps its own species\' identity instead of being intermediate in type between the parental species. Because th

In [57]:
for idx in iter(response['query']['pages']):
    text = response['query']['pages'][idx]['extract']

In [58]:
import html2text

In [67]:
cleaner = re.compile('<.*?>')
cleantext = re.sub(cleaner, '', text)
print(cleantext)

A sheep–goat chimera (sometimes called a geep in popular media) is a chimera produced by combining the embryos of a goat and a sheep; the resulting animal has cells of both sheep and goat origin. A sheep-goat chimera should not be confused with a sheep-goat hybrid, which can result when a goat mates with a sheep.
History
The first sheep-goat chimeras were created by researchers at the Institute of Animal Physiology in Cambridge, England by combining sheep embryos with goat embryos. They reported their results in 1984. The successful chimeras were a mosaic of goat and sheep tissue. The parts that grew from the sheep embryo were woolly. Those that grew from the goat embryo were hairy.
Characteristics
In a chimera, each set of cells (germ line) keeps its own species' identity instead of being intermediate in type between the parental species. Because the chimera contains cells from two different genetic individuals, and each of these arose by normal mating, it has four parents. In contras

In [64]:

h = html2text.HTML2Text()
h.ignore_emphasis = True
h.ignore_images = True
h.ignore_links = True
h.ignore_tables = True
a = h.handle(text)
print(text)

<p>A <b>sheep–goat chimera</b> (sometimes called a <b>geep</b> in popular media) is a chimera produced by combining the embryos of a goat and a sheep; the resulting animal has cells of both sheep and goat origin. A sheep-goat chimera should not be confused with a sheep-goat hybrid, which can result when a goat mates with a sheep.</p>
<h2><span id="History">History</span></h2>
<p>The first sheep-goat chimeras were created by researchers at the Institute of Animal Physiology in Cambridge, England by combining sheep embryos with goat embryos. They reported their results in 1984. The successful chimeras were a mosaic of goat and sheep tissue. The parts that grew from the sheep embryo were woolly. Those that grew from the goat embryo were hairy.</p>
<h2><span id="Characteristics">Characteristics</span></h2>
<p>In a chimera, each set of cells (germ line) keeps its own species' identity instead of being intermediate in type between the parental species. Because the chimera contains cells from

In [63]:
a.replace("\n", "")

"A sheep–goat chimera (sometimes called a geep in popular media) is a chimeraproduced by combining the embryos of a goat and a sheep; the resulting animalhas cells of both sheep and goat origin. A sheep-goat chimera should not beconfused with a sheep-goat hybrid, which can result when a goat mates with asheep.## HistoryThe first sheep-goat chimeras were created by researchers at the Institute ofAnimal Physiology in Cambridge, England by combining sheep embryos with goatembryos. They reported their results in 1984. The successful chimeras were amosaic of goat and sheep tissue. The parts that grew from the sheep embryowere woolly. Those that grew from the goat embryo were hairy.## CharacteristicsIn a chimera, each set of cells (germ line) keeps its own species' identityinstead of being intermediate in type between the parental species. Becausethe chimera contains cells from two different genetic individuals, and each ofthese arose by normal mating, it has four parents. In contrast, a hyb

In [101]:
import os

root = os.path.join(os.path.expanduser('~'), 'part', 'corpus')
if not os.path.exists(root):
    os.makedirs(root)

In [71]:
import random
random.seed(123)

In [84]:
def get_wikipedia_text(response):
    text = []
    for idx in iter(response['query']['pages']):
        if 'extract' in response['query']['pages'][idx]:
            text.append(response['query']['pages'][idx]['extract'])
    
    return text

In [91]:
def urlEncodeNonAscii(b):
    return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)

def iriToUri(iri):
    parts= urlparse(iri)
    return urlunparse(
        part.encode('idna') if parti==1 else urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts)
    )


In [103]:
cleaner = re.compile('<.*?>')
blacklist = ['Help:', 'Wikipedia:', 'Special:', 'Portal:']
for code, urls in code_url_map.items():
    code_path = os.path.join(root, code)
    os.makedirs(code_path, exist_ok=True)
    
    with open(os.path.join(code_path, 'linkmap.tsv'), 'w') as linkmap:
        for url in urls:
            query = quote_plus(url.split('/')[-1])
            req = urlopen("https://en.wikipedia.org/w/api.php?format=json&origin=*&action=query&prop=extracts&exlimit=max&redirects=true&titles="+query)
            response = json.loads(req.read().decode("utf-8"))
            text = get_wikipedia_text(response)
            
            if not text:
                continue
              
            cleantext = re.sub(cleaner, '', text[0])
            
            file_name = query + "-" + str(random.randint(0,30000))
            linkmap.write(url + '\t' + str(file_name) + '.txt\n')
            
            with open(os.path.join(code_path, str(file_name) + '.html'), 'w') as o:
                o.write(text[0])
            with open(os.path.join(code_path, str(file_name) + '.txt'), 'w') as o:
                o.write(cleantext)

In [106]:
path = os.path.join(os.path.expanduser('~'), 'part', 'corpus')
os.listdir(path)

['7925a00',
 '8083a00',
 '1525b00',
 '7294a00',
 '8943a00',
 '3982a00',
 '7391a00',
 '0190z10',
 '7408h00',
 '2001a00',
 '7982a00',
 '7937a00',
 '7408p00',
 '5991b00',
 '0190d00',
 '6650a10',
 '5998a00',
 '8673a00',
 '1713b00',
 '5421a00',
 '3590f00',
 '7054a00',
 '7693a00',
 '8111b00',
 '1403a00',
 '5814a00',
 '1731x00',
 '4951a00',
 '8080g00',
 '6412a00',
 '7312c00',
 '7541a00',
 '7928a00',
 '0735a00',
 '1764a00',
 '8650a00',
 '4891a00',
 '5122a00',
 '5993b00',
 '3496a00',
 '1511a00',
 '0140a00',
 '5055a00',
 '2331a00',
 '5259a00',
 '0121d00',
 '1394a60',
 '8026a00',
 '5212a00',
 '5045a00',
 '1534a00',
 '7041b00',
 '7923b00',
 '8062z00',
 '3073c00',
 '6390a00',
 '1770a00',
 '8085z00',
 '7408b10',
 '3983a00',
 '8947a00',
 '1101a00',
 '8944a00',
 '6580a00',
 '4812a00',
 '3821a00',
 '1812a00',
 '2990a00',
 '2003a00',
 '8014a00',
 '3590b00',
 '7408c20',
 '4113a00',
 '7999c00',
 '1715b00',
 '1394h10',
 '1771a00',
 '1397b10',
 '1395c10',
 '6150a00',
 '1624a00',
 '1631x00',
 '1524a00',
 '52