In [1]:
import numpy as np
import pandas as pd

In [5]:
# read in headers file by chunks of 100000 to conserve memory
# https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas
tp = pd.read_csv('enwiki_20161001_headings_2.tsv', sep='\t', header=0, dtype={'page_id': np.int32, 'page_title': object, 'page_ns': np.int16, 'heading_level': np.int8, 'heading_text': object}, error_bad_lines=False)

b'Skipping line 9014122: expected 5 fields, saw 8\n'
b'Skipping line 24159274: expected 5 fields, saw 8\n'


In [6]:
tp.head()

Unnamed: 0,page_id,page_title,page_ns,heading_level,heading_text
0,3046517,Articles for deletion/Domotic maid,4,3,[[Domotic maid]]
1,3046527,Bernard Fisher,0,2,People
2,3046527,Bernard Fisher,0,2,Other
3,3046529,Gunpowder Incident,0,2,Background
4,3046529,Gunpowder Incident,0,2,Removing the gunpowder


In [7]:
# create a new dataframe of only rows where page_ns == 0
en_DF = tp[tp['page_ns'] == 0]

In [8]:
en_DF.head()

Unnamed: 0,page_id,page_title,page_ns,heading_level,heading_text
1,3046527,Bernard Fisher,0,2,People
2,3046527,Bernard Fisher,0,2,Other
3,3046529,Gunpowder Incident,0,2,Background
4,3046529,Gunpowder Incident,0,2,Removing the gunpowder
5,3046529,Gunpowder Incident,0,2,Aftermath


In [9]:
en_DF.page_ns.unique()

array([0])

In [10]:
# determine number of unique articles
len(en_DF.page_title.unique())

4925071

In [11]:
# remove leading and trailing whitespace from heading_text column
en_DF['heading_text'] = pd.core.strings.str_strip(en_DF['heading_text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [12]:
en_DF.head()

Unnamed: 0,page_id,page_title,page_ns,heading_level,heading_text
1,3046527,Bernard Fisher,0,2,People
2,3046527,Bernard Fisher,0,2,Other
3,3046529,Gunpowder Incident,0,2,Background
4,3046529,Gunpowder Incident,0,2,Removing the gunpowder
5,3046529,Gunpowder Incident,0,2,Aftermath


In [14]:
# groupby heading_text and count the number of unique page_titles each heading appears in
# sort in descencing order
# this returns a pandas series object
article_count = en_DF.groupby('heading_text')['page_title'].apply(lambda x: len(x.unique())).sort_values(ascending=False)

In [15]:
# turn pandas series object into pandas dataframe
en_article_count_DF = pd.DataFrame({'section_title':article_count.index, 'number_of_articles':article_count.values})

In [16]:
en_article_count_DF.head()

Unnamed: 0,number_of_articles,section_title
0,4100059,References
1,2328065,External links
2,1128337,See also
3,530148,History
4,281307,Notes


In [17]:
# add a column for the percentage of articles that header appears in
en_article_count_DF['article_percentage'] = (en_article_count_DF['number_of_articles']/4925071)*100

In [18]:
# set pandas options to display 100 rows
# round percentage to 2 decimal places and show top 100 results
pd.options.display.max_rows = 100
en_article_count_DF.round({'article_percentage': 2}).head(100)

Unnamed: 0,number_of_articles,section_title,article_percentage
0,4100059,References,83.25
1,2328065,External links,47.27
2,1128337,See also,22.91
3,530148,History,10.76
4,281307,Notes,5.71
5,174740,Career,3.55
6,151578,Biography,3.08
7,147026,Further reading,2.99
8,144583,Track listing,2.94
9,121110,Bibliography,2.46
