In [1]:
import pandas as pd
import re
from lxml import etree

In [2]:
%%time
# get all <text> element
tree = etree.parse("enwiki-20190520-pages-articles-multistream1.xml")
root = tree.getroot()
text_elem = root.findall('.//page//text', root.nsmap)

Wall time: 4.04 s


In [3]:
len(text_elem)

19821

Query `declare default element namespace "http://www.mediawiki.org/xml/export-0.10/";//page//text` in BaseX, got:
```
Result:
- Hit(s): 19821 Items
```
Count checked.

In [4]:
%%time
lines_with_infobox = [e.text.splitlines() for e in text_elem if '{{Infobox' in e.text]
len(lines_with_infobox)

Wall time: 2.62 s


In [5]:
# extract info
def extract_infoboxes(lines, infobox_type_filter=[]):
    infoboxes = []
    infobox = []
    bracket_stack = 0
    cur_line_is_infobox = False
    
    for l in lines:
        l = l.strip()
        if not cur_line_is_infobox and l.strip()[:9] == '{{Infobox':
            if infobox_type_filter:
                if l[2:].strip() in infobox_type_filter:
                    cur_line_is_infobox = True
            else:
                cur_line_is_infobox = True
        if cur_line_is_infobox:
            bracket_stack = bracket_stack + len(re.findall('{{', l)) - len(re.findall('}}', l))
            infobox.append(l)
            if bracket_stack <= 0:
                cur_line_is_infobox = False
                infoboxes.append(infobox)
                infobox = []
                bracket_stack = 0
    return infoboxes

In [6]:
infoboxes = [extract_infoboxes(lines) for lines in lines_with_infobox]

In [7]:
ifbs = []
for b in infoboxes:
    ifbs += b
print(len(infoboxes), len(ifbs))

6602 7141


In [133]:
ifbs_text = []
for ifb in ifbs:
    ifbs_text.append(''.join(ifb))

In [134]:
def infobox_bracket_pair_check(text, pairs=['{}','[]','()','<>']):
    for p in pairs:
        if text.count(p[0]) != text.count(p[1]):
            return False
    return True

In [135]:
cleaned_ifbs_text = [t for t in ifbs_text if infobox_bracket_pair_check(t)]

In [136]:
print(len(ifbs_text) - len(cleaned_ifbs_text))

102


In [140]:
dic = {}
dic[''] = ''

In [141]:
dic

{'': ''}

In [285]:
def parse_infobox(text):
    text = '{{|infobox_type=' + text[2:] # convert infobox info format
    kvs = {}
    
    benchmark_bracket = [2,0,0,0] # {}, [], (), <>
    cur_bracket = [2,0,0,0]
    
    cur_state = 'tk' # k: key, v: value, tk; to key, tv: to value
    key_cache = ''
    val_cache = ''
    # state for c: keyword, key, value
#     print(text)
    for c in text[2:]:
#         print(c, cur_bracket)
        if c in '|{}[]()<>=':
            # handle {} [] () <>
            if c == '|':
                if cur_bracket > benchmark_bracket:
                    if cur_state == 'v':
                        val_cache += c
                    else:
                        print('e1', cur_bracket, c)
                        return kvs
                elif cur_bracket == benchmark_bracket:
                    key_cache = key_cache.strip()
                    if key_cache:
                        kvs[key_cache] = val_cache.strip()
                    key_cache = val_cache = ''                    
                    cur_state = 'tk'
                else:
                    print('e2')
                    return
            elif c == '=':
#                 print('got =')
                if cur_bracket > benchmark_bracket:
#                     print('got = 1')
                    if cur_state == 'v':
                        val_cache += c
                    else:
                        print('e3')
                        return
                elif cur_bracket == benchmark_bracket:
#                     print('got = 2')
                    cur_state = 'tv'
                else:
                    print('e4')
                    return
            else:
                if c == '{':
                    cur_bracket[0] += 1
                elif c == '}':
                    cur_bracket[0] -= 1
                elif c == '[':
                    cur_bracket[1] += 1
                elif c == ']':
                    cur_bracket[1] -= 1
                elif c == '(':
                    cur_bracket[2] += 1
                elif c == ')':
                    cur_bracket[2] -= 1
                elif c == '<':
                    cur_bracket[3] += 1
                elif c == '>':
                    cur_bracket[3] -= 1
                if cur_bracket >= benchmark_bracket:
                    val_cache += c
                    if cur_bracket > benchmark_bracket:
                        cur_state = 'v'
        else:
            if cur_state == 'k' or cur_state == 'tk':
                key_cache += c
            elif cur_state == 'v' or cur_state == 'tv':
                val_cache += c
#             elif cur_state == 'tk':

#                 key_cache = val_cache = ''
        if cur_bracket < benchmark_bracket:
            key_cache = key_cache.strip()
            if key_cache:
                kvs[key_cache] = val_cache.strip()
            return kvs
    return kvs

In [286]:
t1 = '{{|infobox_type=Infobox sports conference|title=American Football Conference|league=[[National Football League]]|logo=American Football Conference logo.svg|caption=American Football Conference logo (2010–present)|pixels=150 px|formerly=[[American Football League]] (AFL)|sport=[[American football]]|founded=1970|teams=16|most_champs=[[New England Patriots]] (11 titles)|champion=[[New England Patriots]] (11th title)}}'
t = '{{t|a={<<b>>}|c=d  [[2]}}'

In [287]:
parse_infobox(t)

{'infobox_type': 't', 'a': '{<<b>>}', 'c': 'd  [[2]'}

In [288]:
'  a   b c '.strip()

'a   b c'

In [299]:
test_text = cleaned_ifbs_text[13]
print(test_text)

{{Infobox scientist|name              = Alain Connes|image             = Alain_Connes.jpg|alt               = Photo of the upper body of Alain Connes with vegetation, blue sky, and clouds in the background|caption           = Alain Connes in 2004|birth_date        = {{birth date and age|1947|04|01|df=y}}|birth_place       = [[Draguignan]], France|death_date        =|death_place       =|nationality       = French|field             = [[Mathematics]]|work_institutions = [[IHÉS]], France|alma_mater        = [[École Normale Supérieure]] <br> [[Pierre and Marie Curie University]]|doctoral_advisor  = [[Jacques Dixmier]]|doctoral_students = [[Georges Skandalis]]|known_for         = [[Baum–Connes conjecture]]<br>[[Noncommutative geometry]]<br>[[Operator algebra]]s<br>[[Thermal time hypothesis]]|prizes            = [[CNRS]] Silver Medal (1977)<br/> [[Prize Ampère]] (1980)<br/> [[Fields Medal]] (1982)<br/> [[Clay Research Award]] (2000)<br/> [[Crafoord Prize]] (2001)<br/> [[CNRS Gold medal]] (200

In [301]:
parse_infobox(test_text)

18

In [304]:
%%time
parsed = [parse_infobox(t) for t in cleaned_ifbs_text]

Wall time: 4.77 s


In [305]:
parsed

[{'infobox_type': 'Infobox medical condition (new)',
  'name': 'Autism',
  'image': 'Autism-stacking-cans 2nd edit.jpg',
  'alt': 'Boy stacking cans',
  'caption': 'Repetitively stacking or lining up objects is associated with autism.',
  'field': '[[Psychiatry]]',
  'symptoms': 'Trouble with [[Interpersonal relationship|social interaction]], impaired [[communication]], restricted interests, repetitive behavior<ref name=Land2008/>',
  'complications': '',
  'onset': 'By age two or three<ref name=NIH2016>{{cite web |title= NIMH " Autism Spectrum Disorder |url= https://www.nimh.nih.gov/health/topics/autism-spectrum-disorders-asd/index.shtml |website= nimh.nih.gov |accessdate= 20 April 2017 |language=en |date= October 2016}}</ref><ref name=DSM5/>',
  'duration': 'Long-term<ref name=NIH2016/>',
  'causes': '[[Heritability of autism|Genetic]] and environmental factors<ref name=Ch2012/>',
  'risks': '',
  'diagnosis': 'Based on behavior and developmental history<ref name=NIH2016/>',
  'diffe

## Special condition:
### Infobox in one line:
```
{{Infobox sports conference|title=American Football Conference|league=[[National Football League]]|logo=American Football Conference logo.svg|caption=American Football Conference logo (2010–present)|pixels=150 px|formerly=[[American Football League]] (AFL)|sport=[[American football]]|founded=1970|teams=16|most_champs=[[New England Patriots]] (11 titles)|champion=[[New England Patriots]] (11th title)}}
```

### nested Infobox:
```
{{Infobox president
| name          = Ruhollah Khomeini
| native_name   = {{lang|fa|سید روح‌الله موسوی خمینی}}
...
| signature     = Ruhollah Khomeini signature.png
| module        =
    {{Infobox religious biography
    | title = [[Grand Ayatollah]]
    | embed             = yes
    | religion         = [[Islam]]
...
```

In [147]:
import numpy as np

In [148]:
np.add([1],[2])

array([3])

In [144]:
[2,1,3,4] + [1,2,1,4]

[2, 1, 3, 4, 1, 2, 1, 4]

In [62]:
for i in ifbs:
    for ii in i:
        if '\'' in ii:
            print(i)
            break
            break

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




['{{Infobox fictional location', '| name       = Glorantha', '| colour     = #C0C0C0', '| image      = Glorantha.png', '| imagesize  = 250px', '| caption    = Map of Glorantha by Christophe Dang Ngoc Chan', "| source     = ''[[RuneQuest]]'', etc.", '| creator    = [[Greg Stafford]]', '| genre      = [[Role-playing game]]', '| type       = [[Fantasy world]]', '| locations  = [[Genertela]], Pamaltela', '| people     =', '}}']
['{{Infobox ice hockey player', '| position = [[Winger (ice hockey)|Left Wing]]', '| shoots = Left', '| height_ft = 5', '| height_in = 11', '| weight_lb = 180', "| played_for = '''[[National Hockey League|NHL]]'''<br>[[Boston Bruins]]<br>[[Detroit Red Wings]]<br>[[St. Louis Blues]]<br>[[Washington Capitals]]<br>'''[[World Hockey Association|WHA]]'''<br>[[Edmonton Oilers]]", '| birth_date = {{birth date|1948|6|13|mf=y}}', '| birth_place = [[Lloydminster]], [[Saskatchewan]], Canada', '| career_start = 1968', '| career_end = 1979', '| draft = 13th overall', '| draft_y

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [50]:
inv_ifbs = []
for i in ifbs:
    if i[0][:2] != '{{':
        inv_ifbs.append(i)

In [51]:
inv_ifbs

[]

In [44]:
inv_ifbs = []
for i in ifbs:
    for ii in i:
        if '|||' in ii:
            inv_ifbs.append(i)
            break

In [46]:
inv_ifbs

[['{{Infobox royalty',
  '|name= Bilgä Qaǧan <br> [[Old Turkic]]: 𐰋𐰃𐰠𐰏𐰀:𐰴𐰍𐰣 <br> [[Chinese language|Chinese]]: 毗伽可汗',
  '|title= Fourth [[Khagan|Qaghan]] of the [[Second Turkic Qaghanate]]',
  '|image=',
  '|caption=',
  '|succession=',
  '|birth_name=Ashina Mojilian <br> 阿史那默棘連|birth_date= 683',
  '|birth_place=',
  '|death_date= {{death date and age|734|11|25|683|||df=yes}}',
  '|death_place= [[Otukan]]',
  '|full name=',
  '|father=[[Ilterish Qaghan|Ilterish Khagan]]',
  '|mother= El Bilga Khatun',
  '|spouse= [[Po Beg|El Etmish Bilge Khatun]]',
  '|issue=[[Yollıg Khagan|Yollıg Tigin]]<br>[[Bilge Kutluk Khaghan|Tengri Tigin]] <br> Princess Daluo <br> Two unnamed sons',
  '|reign= February 717 – 25 November 734',
  '|coronation=',
  '|othertitles=',
  '|predecessor=[[Inel Qaghan]]',
  '|regent=[[Tonyukuk]]|successor=[[Yollıg Khagan]]',
  '|house=[[Ashina tribe|House of Ashina]]',
  '|religion= [[Tengrism]]',
  '}}'],
 ['{{Infobox writer',
  '| name         = Patrick White',
  '| imag

## Idea:
change all root '|' to '||' and use '||' to split.
### Modified:
change root '|' to '|-split_tag-|'. There are '||'s in Infobox text.

In [38]:
inv_ifbs[6][-1]

'{{Infobox antimony}}'

In [8]:
str_ifbs = ['| infobox_type = ' + '\n'.join(b)[2:-2] for b in ifbs]

In [9]:
len(str_ifbs)

7141

In [88]:
s = str_ifbs[0]

In [89]:
print(s)

| infobox_type = Infobox medical condition (new)
| name            = Autism
| image           = Autism-stacking-cans 2nd edit.jpg
| alt             = Boy stacking cans
| caption         = Repetitively stacking or lining up objects is associated with autism.
| field           = [[Psychiatry]]
| symptoms        = Trouble with [[Interpersonal relationship|social interaction]], impaired [[communication]], restricted interests, repetitive behavior<ref name=Land2008/>
| complications   =
| onset           = By age two or three<ref name=NIH2016>{{cite web |title= NIMH " Autism Spectrum Disorder |url= https://www.nimh.nih.gov/health/topics/autism-spectrum-disorders-asd/index.shtml |website= nimh.nih.gov |accessdate= 20 April 2017 |language=en |date= October 2016}}</ref><ref name=DSM5/>
| duration        = Long-term<ref name=NIH2016/>
| causes          = [[Heritability of autism|Genetic]] and environmental factors<ref name=Ch2012/>
| risks           =
| diagnosis       = Based on behavior and d

In [90]:
ss = s.splitlines()
ss

['| infobox_type = Infobox medical condition (new)',
 '| name            = Autism',
 '| image           = Autism-stacking-cans 2nd edit.jpg',
 '| alt             = Boy stacking cans',
 '| caption         = Repetitively stacking or lining up objects is associated with autism.',
 '| field           = [[Psychiatry]]',
 '| symptoms        = Trouble with [[Interpersonal relationship|social interaction]], impaired [[communication]], restricted interests, repetitive behavior<ref name=Land2008/>',
 '| complications   =',
 '| onset           = By age two or three<ref name=NIH2016>{{cite web |title= NIMH " Autism Spectrum Disorder |url= https://www.nimh.nih.gov/health/topics/autism-spectrum-disorders-asd/index.shtml |website= nimh.nih.gov |accessdate= 20 April 2017 |language=en |date= October 2016}}</ref><ref name=DSM5/>',
 '| duration        = Long-term<ref name=NIH2016/>',
 '| causes          = [[Heritability of autism|Genetic]] and environmental factors<ref name=Ch2012/>',
 '| risks          

In [94]:
last_line_ends = True
kvs = {} # key-value pairs
for i, t in enumerate(ss):
    if last_line_ends:
        pass
    else:
        pass

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [93]:
a.split('=')

['| symptoms        ',
 ' Trouble with [[Interpersonal relationship|social interaction]], impaired [[communication]], restricted interests, repetitive behavior<ref name',
 'Land2008/>']

In [None]:
def parse(s, open='{[<(', close='}]>)', ):
    

In [None]:
class Infobox:
    def __init__(self, text):
        self.text = text
    def parse(self):
        pass

In [68]:
'abcdefg'[2:-2]

'cde'

In [179]:
df_infobox = pd.read_csv('infobox.csv')
all_infobox_types = list(df_infobox.infobox_type)
del df_infobox
all_infobox_types

['Infobox Buddha',
 'Infobox Christian leader',
 'Infobox clergy',
 'Infobox Jewish leader',
 'Infobox Latter Day Saint biography',
 'Infobox rebbe',
 'Infobox religious biography',
 'Infobox saint',
 'Infobox Egyptian dignitary',
 'Infobox noble',
 'Infobox peer',
 'Infobox pharaoh',
 'Infobox pretender',
 'Infobox royalty',
 'Infobox college football player',
 'Infobox CFL biography',
 'Infobox NFL biography',
 'Infobox baseball biography',
 'Infobox basketball biography',
 'Infobox Champ Car driver',
 'Infobox F1 driver',
 'Infobox Le Mans driver',
 'Infobox Motocross rider',
 'Infobox motorcycle rider',
 'Infobox NASCAR driver',
 'Infobox racing driver',
 'Infobox racing driver series section',
 'Infobox speedway rider',
 'Infobox WRC driver',
 'Infobox sportsperson',
 'Infobox biathlete',
 'Infobox boxer (amateur)',
 'Infobox climber',
 'Infobox professional bowler',
 'Infobox sailor',
 'Infobox speed skater',
 'Infobox sport wrestler',
 'Infobox swimmer',
 'Infobox bullfighting c

In [2]:
%%time

# with open('test.xml', 'r', encoding='utf-8') as f:
with open('enwiki-20190520-pages-articles-multistream1.xml', 'r', encoding='utf-8') as f:
    lines = f.readlines()

Wall time: 3.7 s


In [3]:
len(lines)

4597155

In [4]:
lines[:10]

['<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n',
 '  <siteinfo>\n',
 '    <sitename>Wikipedia</sitename>\n',
 '    <dbname>enwiki</dbname>\n',
 '    <base>https://en.wikipedia.org/wiki/Main_Page</base>\n',
 '    <generator>MediaWiki 1.34.0-wmf.5</generator>\n',
 '    <case>first-letter</case>\n',
 '    <namespaces>\n',
 '      <namespace key="-2" case="first-letter">Media</namespace>\n',
 '      <namespace key="-1" case="first-letter">Special</namespace>\n']

In [208]:
# def get_content_within_paired(s, open_bracket='([{', close_bracket='}])'):
    
    
def extract_infoboxes(lines):
    infoboxes = []
    infobox = []
    bracket_stack = 0
    cur_line_is_infobox = False
    
    for l in lines:
        l = l.strip()
        if not cur_line_is_infobox and l.strip()[:9] == '{{Infobox' and l[2:].strip() in all_infobox_types:
            cur_line_is_infobox = True
        if cur_line_is_infobox:
            bracket_stack = bracket_stack + len(re.findall('{{', l)) - len(re.findall('}}', l))
            infobox.append(l)
            if bracket_stack <= 0:
                cur_line_is_infobox = False
                infoboxes.append(infobox)
                infobox = []
                bracket_stack = 0
    return infoboxes

In [209]:
%%time
infoboxes = extract_infoboxes(lines)

Wall time: 2.17 s


In [213]:
test_line = lines[3976260:3976325]

In [214]:
test_ifbs = extract_infoboxes(test_line)

In [216]:
test_ifbs

[['{{Infobox religious biography',
  '| title = [[Grand Ayatollah]]',
  '| embed             = yes',
  '| religion         = [[Islam]]',
  '| denomination      = [[Twelver]] [[Shia Islam|Shīʿā]]&lt;ref&gt;{{cite book|editor1-last=Bowering|editor1-first=Gerhard|editor2-last=Crone|editor2-first=Patricia|editor3-last=Kadi|editor3-first=Wadad|editor4-last=Stewart|editor4-first=Devin J.|editor5-last=Zaman|editor5-first=Muhammad Qasim|editor6-last=Mirza|editor6-first=Mahan|title=The Princeton Encyclopedia of Islamic Political Thought|date=28 November 2012|publisher=Princeton University Press|isbn=9781400838554|page=518}}&lt;/ref&gt;&lt;ref&gt;{{cite book|author1=Malise Ruthven|title=Fundamentalism: The Search For Meaning: The Search For Meaning|date=8 April 2004|publisher=Oxford University Press|isbn=9780191517389|page=29|edition=reprint}}&lt;/ref&gt;&lt;ref&gt;{{cite book|editor1-last=Jebnoun|editor1-first=Noureddine|editor2-last=Kia|editor2-first=Mehrdad|editor3-last=Kirk|editor3-first=Mim

# Conditions to handle
- 1.

```
{{Infobox president
| name          = Ruhollah Khomeini
| native_name   = {{lang|fa|سید روح‌الله موسوی خمینی}}
...
| signature     = Ruhollah Khomeini signature.png
| module        =
    {{Infobox religious biography
    | title = [[Grand Ayatollah]]
    | embed             = yes
    | religion         = [[Islam]]
...
```


In [194]:
%%time

bios = []
bio = []

bracket_stack = 0
cur_line_is_infobox = False
for l in lines:
    if not curl.strip()[:9] == '{{Infobox' and l.strip()[2:].strip() in all_infobox_types:
        cur_line_is_infobox = True
    if cur_line_is_infobox:
        bracket_stack = bracket_stack + len(re.findall('{{', l)) - len(re.findall('}}', l))
        bio.append(l.replace(' |', '|'))
        if bracket_stack <= 0:
            cur_line_is_infobox = False
            bios.append(bio)
            bio = []

Wall time: 2.15 s


In [196]:
for l in lines:
    if l.strip()[:9] == '{{Infobox' and l.strip()[2:].strip() in all_infobox_types and not l[2:].strip() in all_infobox_types:
        print(l)

 {{Infobox writer 

  {{Infobox musical artist

    {{Infobox religious biography



In [199]:
for i, l in enumerate(lines):
    if '{{Infobox religious biography' in l and l[:4] == '    ':
        print(i, l)

3976296     {{Infobox religious biography



In [217]:
print(''.join(lines[3976250:3976325]))

    <revision>
      <id>898020855</id>
      <parentid>898020756</parentid>
      <timestamp>2019-05-20T21:20:23Z</timestamp>
      <contributor>
        <username>GeneralizationsAreBad</username>
        <id>24080262</id>
      </contributor>
      <comment>Reverted 1 pending edit by [[Special:Contributions/67.226.221.181|67.226.221.181]] to revision 897942996 by HistoryofIran: Wrong place and not clear</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">{{pp-pc|small=yes}}
{{short description|20th-century Iranian religious leader and politician}}
{{Distinguish|Ali Khamenei}}
{{Other people|Khomeini|Khomeini (name)}}
{{Use dmy dates|date=October 2012}}
{{Infobox president
| name          = Ruhollah Khomeini
| native_name   = {{lang|fa|سید روح‌الله موسوی خمینی}}
| honorific-prefix = [[Grand Ayatollah]]
| image         = File:Ruhollah Khomeinii.jpg
| caption       = 
| birth_date    = {{birth date|1902|9|24|df=y}}&lt;ref name=&quo

In [195]:
len(bios)

1631

In [210]:
len(infoboxes)

1631

In [166]:
import html

In [170]:
s = 'Alberto de Acha &lt;br&gt;&lt;small&gt;(maternal grandfather)&lt;/small&gt;&lt;ref name=&quot;Arnaz&quot;&gt;Arnaz, Desi. ''A Book''. New York: William Morrow, 1976. {{ISBN|0688003427}}&lt;/ref&gt;'

In [171]:
html.unescape(s)

'Alberto de Acha <br><small>(maternal grandfather)</small><ref name="Arnaz">Arnaz, Desi. A Book. New York: William Morrow, 1976. {{ISBN|0688003427}}</ref>'

In [14]:
del lines

In [17]:
%%time

cleaned_bios = []
for b in bios:
    kvs = ''.join(b).split('\n|')
    kvs[0] = kvs[0].replace('{{', 'infobox_type = ')
    kvs[-1] = kvs[-1].replace('}}', '')
    cleaned_kvs = [kv.strip() for kv in kvs]
    cleaned_bios.append(cleaned_kvs)

Wall time: 75.8 ms


In [42]:
_ = []
for i, bio in enumerate(cleaned_bios):
    for item in bio:
        if "Russell's theory of causal lines&lt;ref&gt;{{cite book|url" in item:
            _ = bio

In [43]:
_

['infobox_type = Infobox philosopher',
 'name             = &lt;small&gt;[[The Right Honourable]]&lt;/small&gt;&lt;br /&gt;The Earl Russell',
 'honorific_suffix = {{post-nominals|country=GBR|OM|FRS|size=100%}}',
 'image            = Bertrand Russell transparent bg.png',
 'birth_name       = Bertrand Arthur William Russell',
 'birth_date       = {{birth date|df=yes|1872|5|18}}',
 "birth_place      = [[Trellech]], [[Monmouthshire (historic)|Monmouthshire]], United Kingdom{{efn|Monmouthshire's Welsh status was  ambiguous at this time, and was considered by some to be part of England. See [[Monmouthshire (historic)#Ambiguity over status]].}}&lt;!--Whether Monmouthshire was in Wales in 1872 is debatable. Please leave this alone; this page is not the place for this debate--&gt;",
 'death_date       = {{Death date and age|df=yes|1970|2|2|1872|5|18}}',
 'death_place      = [[Penrhyndeudraeth]], [[Caernarfonshire]], Wales, United Kingdom',
 'nationality      = British &lt;!-- This is the consen

In [20]:
'|'.isalpha()

False

In [21]:
len(cleaned_bios)

1629

In [110]:
%%time

k = v = ''
lst_dic = []
for i, bio in enumerate(cleaned_bios):
#     if i % 200 == 0:
#         print(i)
    dic = {}
    for item in bio:
        kv = item.split('=')
        #certainly this can be done with just a simple regular expression, but that will use a lot more memory. I'm using a laptop...
        if not item \
        or not item[0].isalpha() \
        or not '=' in item \
        or not kv[0].strip().replace('_', '').replace('-', '').replace(' ', '').isalpha():
            v += item
            dic[k] = v
            continue
        k, v = kv[0].strip(), kv[1].strip()
        dic[k] = v
    lst_dic.append(dic)

Wall time: 135 ms


In [111]:
len(lst_dic)

1629

In [112]:
all_keys = []
for dic in lst_dic:
    all_keys += list(dic.keys())

In [113]:
all_keys_set = set(all_keys)

In [114]:
[_ for _ in all_keys_set if len(_) > 20]

['type_species_authority',
 'resting_place_coordinates',
 'FrenchOpenDoublesresult',
 'ru_sevensnationalteam',
 'GDP_nominal_per_capita_rank',
 'laurenceolivierawards',
 'pushpin_label_position',
 'afterburning thrust main',
 'highestdoublesranking',
 'along with the sultan',
 'max takeoff weight alt',
 'WTAChampionshipsDoublesresult',
 'population_density_rank',
 'significant_buildings',
 'OlympicMixedDoublesresult',
 'GDP_nominal_per_capita',
 'currentdoublesranking',
 'population_census_year',
 'identification_symbol_label',
 'elimination_half-life',
 'AustralianOpenDoublesresult',
 'WimbledonDoublesresult',
 'GDP_PPP_per_capita_rank',
 'goldenraspberryawards',
 'identification_symbol',
 'FrenchOpenMixedresult',
 'grandparent_authority',
 'CoachTournamentRecord',
 'conventional_long_name',
 'author_abbreviation_zoo',
 'medaltemplates-expand',
 'restingplacecoordinates',
 'colonel_of_the_regiment',
 'max takeoff weight main',
 'population_density_sq_mi',
 'never exceed speed main',
 

In [115]:
df = pd.DataFrame(lst_dic)

In [116]:
df.shape

(1629, 1478)

In [117]:
df_origin = df.copy()

In [62]:
df_origin.to_excel('tmp.xlsx', 'data', index=False)

In [118]:
null_count = df.isnull().sum()
dic_null_count = null_count.to_dict()

In [119]:
limit_null_rate = .95
limit_null_count = int(df.shape[0] * limit_null_rate)

In [120]:
limit_null_count

1547

In [121]:
for k, v in dic_null_count.items():
    if v > limit_null_count:
        df.drop(k, axis=1, inplace=True)

In [122]:
df.shape

(1629, 77)

In [123]:
df.to_excel('tmp1.xlsx', 'data', index=False)

In [125]:
df_origin.module.value_counts()

{{Infobox musical artist|embed                                                                     20
{{Infobox military person|embed                                                                    15
{{Infobox musical artist| embed                                                                     6
{{Infobox musical artist                                                                            6
                                                                                                    6
{{Infobox scientist| embed                                                                          3
{{Infobox philosopher|embed                                                                         3
{{Listen                                                                                            3
{{Infobox football biography                                                                        2
{{Infobox Arabic name|embed                                                       