In [1]:
import numpy as np
import pandas as pd

In [2]:
# Open the headings TSV lazily: the reader yields 100000-row chunks so the
# whole file never has to sit in memory at once.
# https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas
# NOTE(review): pandas' default NA parsing turns literal strings such as
# 'NA', 'NaN' or 'null' into missing values; if such page titles/headings
# exist in the dump they will be read as NaN - confirm this is acceptable.
tp = pd.read_csv(
    'frwiki_20161001_headings_3.tsv',
    sep='\t',
    header=0,
    dtype={
        'page_id': np.int32,
        'page_title': object,
        'page_ns': np.int16,
        'heading_level': np.int8,
        'heading_text': object,
    },
    iterator=True,
    chunksize=100000,
)

In [3]:
# Walk the chunked reader, keep only the rows in namespace 0 (the article
# namespace) from each chunk, and stitch the survivors into one dataframe.
# The list of filtered chunks is deliberately left unnamed so it can be
# released as soon as the concatenation finishes.
fr_DF = pd.concat([
    part.loc[part['page_ns'].eq(0)]
    for part in tp
])

In [4]:
# peek at the first five rows to sanity-check the columns that were loaded
fr_DF.head(n=5)

Unnamed: 0,page_id,page_title,page_ns,heading_level,heading_text
0,412304,Dragon Head (film),0,2,Synopsis
1,412304,Dragon Head (film),0,2,Fiche technique
2,412304,Dragon Head (film),0,2,Distribution
3,412304,Dragon Head (film),0,2,Distinctions
4,412304,Dragon Head (film),0,2,Voir aussi


In [5]:
# confirm the namespace filter worked: only the value 0 should remain
fr_DF['page_ns'].unique()

array([0])

In [6]:
# count the distinct article titles; dropna=False keeps a missing title
# counted as one value, exactly as len(Series.unique()) would
fr_DF['page_title'].nunique(dropna=False)

1651669

In [7]:
# remove leading and trailing whitespace from heading_text column
# Uses the public `.str` accessor: `pd.core.strings.str_strip` is a private
# pandas internal that is not available in current pandas releases.
# Missing headings (NaN) pass through unchanged, and re-running the cell is
# harmless because stripping already-stripped text is a no-op.
fr_DF['heading_text'] = fr_DF['heading_text'].str.strip()

In [8]:
# group by heading_text and count the number of unique page_titles each heading appears in
# sort in descending order
# this returns a pandas series object (index: heading_text, values: article counts)
# The built-in groupby nunique replaces a per-group Python lambda, which is
# very slow when there are many distinct headings. dropna=False keeps the
# counting rule of len(x.unique()), where a missing title counts as one value.
article_count = (
    fr_DF.groupby('heading_text')['page_title']
    .nunique(dropna=False)
    .sort_values(ascending=False)
)

In [9]:
# Reshape the counts series into a two-column dataframe: the series index
# (the heading text) becomes 'section_title' and the series values become
# 'number_of_articles'.
fr_article_count_DF = pd.DataFrame(
    dict(
        section_title=article_count.index,
        number_of_articles=article_count.values,
    )
)

In [10]:
# add a column for the percentage of articles that header appears in
# The denominator is computed from the data instead of being hard-coded, so
# it stays correct if the input file or the namespace filter changes.
# dropna=False matches len(fr_DF['page_title'].unique()).
total_articles = fr_DF['page_title'].nunique(dropna=False)
fr_article_count_DF['article_percentage'] = (fr_article_count_DF['number_of_articles'] / total_articles) * 100

In [11]:
# allow pandas to render up to 100 rows without truncating the table
pd.set_option('display.max_rows', 100)
# take the 100 most common section titles, then round the percentage
# column to 2 decimal places for display (the stored dataframe is untouched)
fr_article_count_DF.head(100).round({'article_percentage': 2})

Unnamed: 0,number_of_articles,section_title,article_percentage
0,784431,Notes et références,47.49
1,726584,Liens externes,43.99
2,425307,Voir aussi,25.75
3,333309,Références,20.18
4,326686,Articles connexes,19.78
5,286614,Biographie,17.35
6,230732,Histoire,13.97
7,228681,Bibliographie,13.85
8,152555,Lien externe,9.24
9,119083,Géographie,7.21
