In [1]:
import os 
import sys
import pandas as pd
import seaborn 
import pathlib
import dateutil
import pytz
import json

%matplotlib inline

# Data descriptions and storage

All data that were collected come from en.wikipedia. A description of the data and how it is stored is detailed below:

**1. Revision history of pages with metadata from ~June 2015 - ~July 2018**

* Description of data
* Metadata includes: 'page_title', 'revid', 'parentid', 'user', 'userid', 'timestamp', 'comment', 'character_count',   'word_count', 'external_link_count', 'heading_count', 'wikifile_count', 'wikilink_count'
* Data are stored in individual CSV files for each page - with each row corresponding registered revision
* There is a unique CSV for 
    
**2. Historical wiki link data for each page**
    
* Storage: single layer dictionary stored as a .json for each page
* Storage Description: Keys are the title of a wikipage this linked to by the titular wikipage, value is a list of all revision IDs of the titular page which contain that wikilink
    
**3. Daily page views for each page from ~June 2015 - ~July 2018**

* Data consists of 'page_title', 'timestamp', 'page_views'
* Data is stored in a single CSV file
    
**4. Backlinks for each wikipedia page from a single day ~July 2018**

* Description of data: This data enumarates all of the pages on wikpedia which point to a specific page
* Storage: Single .json file 
* Storage description: .json dicitionary key is the wikipage and value is a list of tuples (page_ID, page_title) which each describe a single page which points to to this page
    


# Importing and Cleaning Data

## Revision history
### Importing and datetime conversion

In [2]:
path_to_revisions = pathlib.Path('../data/test15/revisions/')

Some revisions which were collected were of old/dead/defunct pages. This means that they have very few versions. To excude these we only consider files which are larger than 200 bytes. 

In [4]:
df_from_each_file = (pd.read_csv(f, encoding='UTF-16') for f in path_to_revisions.iterdir() if f.lstat().st_size>200)
concatenated_df   = pd.concat(df_from_each_file)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [5]:
# Parsing out junk headers

concatenated_df = concatenated_df[['character_count',
                                   'comment', 'external_link_count', 'heading_count', 'page_title',
                                   'parentid', 'revid', 'timestamp', 'user', 'userid', 'wikifile_count',
                                   'wikilink_count', 'word_count']]

In [6]:
# consolidating each revision entry together

concatenated_df = concatenated_df[-concatenated_df.timestamp.apply(lambda x: type(x) == float)]

In [7]:
# Converting timestamp data into the correct form

concatenated_df = concatenated_df.reset_index(drop=True)
concatenated_df.timestamp = concatenated_df.timestamp.apply(lambda x: dateutil.parser.DEFAULTPARSER.parse(x))
concatenated_df.timestamp = pd.to_datetime(concatenated_df.timestamp)

In [8]:
concatenated_df.sort_values(by=['page_title', 'timestamp'], inplace=True)
concatenated_df = concatenated_df.set_index('timestamp')
concatenated_df['index'] = [r for r in range(len(concatenated_df))]

In [9]:
len(concatenated_df)

2792456

### Removing all but the latest edit a user made in a single day

There are many cases where many revisions are being done by a single editor in a single day. These will skew pages as having much higher numbers of edits. Here we'll reduce those numbers

In [10]:
test = (df['index'][:-1] for (page, date, user), df in concatenated_df.groupby(['page_title', pd.Grouper(freq='D'), 'user'])
                    if len(df) > 1)

concatenated_df.reset_index(inplace=True)
concatenated_df = concatenated_df.set_index('index').drop(pd.concat(test))

In [11]:
concatenated_df.to_csv('../data/test15/cleaned/revision_data.csv', chunksize= 1000000)

## Pageview data

In [12]:
pageview = pd.read_csv('../data/test15/pageview_data.csv')

# Converting the timestamp data that was provided by wikipeida into a pandas friendly format

test = pageview['Unnamed: 0'].apply(lambda x: dateutil.parser.DEFAULTPARSER.parse(x))
test = test.apply(lambda x: x.replace(tzinfo=pytz.utc))
pageview['datetime'] = test
pageview = pageview.drop(['Unnamed: 0'], axis=1)

In [13]:
# Reshaping pandas dataframe

pageview = pd.melt(pageview, id_vars='datetime', value_vars=pageview.columns[:-1])
pageview.columns = ['datetime', 'page_title', 'views']
pageview.page_title = pageview.page_title.str.replace('_', ' ')


In [14]:
print('Number of pages pre cleaning: {0}'.format(len(pageview.page_title.unique())))

Number of pages pre cleaning: 12619


In [15]:
# We'll only consider pages which have more than 1/2 year worth of page view entries
# If there is a large set of NaN entries the page is probably fake

pageview = pageview.groupby('page_title').filter(lambda x: sum(x['views'].isna()) < len(x['views']) - 183)

In [16]:
print('Number of pages post cleaning: {0}'.format(len(pageview.page_title.unique())))

Number of pages post cleaning: 9788


In [17]:
pageview.columns = ['timestamp', 'page_title', 'page_views']

In [19]:
# Dumping cleaned data

pageview.to_csv('../data/test15/cleaned/pageview.csv', chunksize= 1000000)

## Combining pageview and revision history data frames

In [20]:
print('Number of pages in revision data: {0}'.format(len(concatenated_df.page_title.unique())))
print('Number of pages pageview data: {0}'.format(len(pageview.page_title.unique())))

list_of_pages = set(pageview.page_title) & set(concatenated_df.page_title)
print('Number of pages common to both: {0}'.format(len(list_of_pages)))

Number of pages in revision data: 10771
Number of pages pageview data: 9788
Number of pages common to both: 8722


In [21]:
# Making sure these datetimes are in the right position

concatenated_df.timestamp = pd.to_datetime(concatenated_df.timestamp)
pageview.timestamp = pd.to_datetime(pageview.timestamp)

In [22]:
# Subsetting datasets to combine

concatenated_df = concatenated_df[concatenated_df.page_title.isin(list_of_pages)]
pageview = pageview[pageview.page_title.isin(list_of_pages)]

In [23]:
concatenated_df.set_index(['page_title', 'timestamp'], inplace=True)
pageview.set_index(['page_title', 'timestamp'], inplace=True)

In [24]:
len(concatenated_df)

1517283

In [25]:
len(pageview)

9629088

In [27]:
consolidated_historical_data = pd.concat([pageview, concatenated_df])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [28]:
consolidated_historical_data = consolidated_historical_data.sort_index(level=1).sort_index(level=0)

In [29]:
len(consolidated_historical_data)

11146371

In [30]:
consolidated_historical_data.to_csv('../data/test15/cleaned/combined.csv', chunksize=1000000)

## Backlink data

In [31]:
path_to_pagelinks = pathlib.Path('../data/test15/links_pointing_to_pages.json')
with open(path_to_pagelinks, 'r') as json_to_read:
    path_to_page_links = json.load(json_to_read)

In [3]:
# There are many kinds of d

count_dictionary = dict()
for key, pointers in path_to_page_links.items():
    count_dictionary[key] = {'talk':0,
                             'wiki_links': 0, 
                             'wikipedia' : 0,
                             'user' : 0}
    for point in pointers:
        if 'talk:' in point[1]:
            count_dictionary[key]['talk'] += 1
        elif 'wikipedia:' in point[1]:
            count_dictionary[key]['wikipedia'] += 1
        elif 'user:' in point[1]:
            count_dictionary[key]['user'] += 1
        else:
            count_dictionary[key]['wiki_links'] += 1
    

In [4]:
pointers_df = pd.DataFrame.from_dict(count_dictionary, orient='index')
pointers_df.reset_index(inplace=True)
pointers_df.columns = ['page_title', 'talk_link_count', 'wiki_link_count', 'wikipeida_count', 'user_count']


In [6]:
pointers_df.to_csv(pathlib.Path('../data/test15/cleaned/pointers.csv'))

# Getting categories

In [43]:
revision_data = pd.read_csv('../data/test15/cleaned/revision_data.csv')

In [44]:
revision_data = revision_data.set_index('timestamp')

In [45]:
current_revisions = dict((page, {'revid' :df.iloc[0]['revid'],
                                 'category': []}) for page, df in revision_data.groupby('page_title'))

In [46]:
# This goes through every .json file of wikilinks iterates through each page that was 
# at one point pointed to. Determines if it is a Category: link (easy as each category link has that string in it).
# And then only pulls out those which are pointed to by the current revision of the article

path_to_json = pathlib.Path('../data/test15/wikilinks/')
for path in path_to_json.iterdir():
    page_name = path.name.split('.json')[0]
    try:
        with open(path, 'r') as to_open:
            temp_file = json.load(to_open)
    except:
        continue
    try:
        revision = current_revisions[page_name]['revid']
    except:
        continue
    keys = (key for key in temp_file.keys() if 'Category:' in key)
    for key in keys:
        if int(revision) in temp_file[key]:
            current_revisions[page_name]['category'].append(key.split('gory:')[1])
#             print(key)
#     links = temp_file[revision]


In [47]:
# Converting the dictionary into a dataframe

categories = pd.DataFrame.from_dict(current_revisions, orient='index')
categories.reset_index(inplace=True)
categories.columns = ['page_title', 'revid', 'categories']

In [48]:
# Dumping the dataframe

categories.to_csv('../data/test15/cleaned/category_data.csv')