# DATA COLLECTION

## Import 

In [1]:
import json
import pandas as pd
import random
import numpy as np
import wikipedia
wikipedia.set_lang('en')
wikipedia.set_rate_limiting(rate_limit=True)

from tqdm import tqdm_notebook
from time import sleep
import re
import warnings
warnings.filterwarnings('ignore')

### Wikipedia articles 
The lists of Wikipedia articles were downloaded using a web service called PetScan (https://en.wikipedia.org/wiki/Wikipedia:PetScan). The two categories chosen for this assignment are `Software engineering` and `Sports`.

In [2]:
# Load the two category listings (one CSV per category) from the local ./data folder.
softeng = pd.read_csv('./data/software_engineering_depth1.csv')
sports = pd.read_csv('./data/sports_depth1.csv')

## Selecting 1000 articles randomly
Since the PetScan service does not let us choose the number of articles (it returns every article in the chosen category), we randomly select 1000 articles from each category for the following exercises.

In [3]:
# Seed the RNG so that a fresh "Restart & Run All" always draws the same
# articles; without it the exported CSVs differ on every run.
random.seed(42)

# Draw 1000 distinct row positions per category (sampling without replacement).
# random.sample raises ValueError if a listing has fewer than 1000 rows.
se_rdm_idx = random.sample(range(len(softeng)), 1000)
sp_rdm_idx = random.sample(range(len(sports)), 1000)
print(len(se_rdm_idx))
print(len(sp_rdm_idx))

1000
1000


In [4]:
# Keep only the sampled rows, discard the 'number', 'namespace' and 'touched'
# columns, and renumber the remaining rows from 0.
se_1000 = (
    softeng
    .loc[softeng.index.isin(se_rdm_idx)]
    .drop(['number', 'namespace', 'touched'], axis='columns')
    .reset_index(drop=True)
)
print(se_1000.shape)

(1000, 3)


In [5]:
# Keep only the sampled rows, discard the 'namespace', 'touched' and 'number'
# columns, and renumber the remaining rows from 0.
sp_1000 = (
    sports
    .loc[sports.index.isin(sp_rdm_idx)]
    .drop(['namespace', 'touched', 'number'], axis='columns')
    .reset_index(drop=True)
)
print(sp_1000.shape)

(1000, 3)


In [66]:
def normalize_name(name):
    '''
    Normalize an article title.

    The parenthesis characters "(" and ")" are dropped (the text between them
    is kept), every underscore becomes a space, and the result is lower-cased
    with leading/trailing whitespace stripped.
    '''
    cleaned = name
    # Apply the substitutions in order: parentheses first, then underscores.
    for pattern, replacement in (('[()]', ''), ('[_]', ' ')):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower().strip()

In [7]:
# Normalize the titles of both samples with the same helper.
for articles in (sp_1000, se_1000):
    articles['title'] = articles['title'].apply(normalize_name)

## Retrieving content for each article

### Category:Sports articles

In [45]:
# Download the plain-text content of every sampled sports article.
# lst_contents ends up with exactly one entry per row of sp_1000, in row order.
lst_contents = []
# iterrows() is a generator with no length, so pass `total` for a real progress bar.
for idx, row in tqdm_notebook(sp_1000.iterrows(), total=len(sp_1000), leave=False):
    try:
        pagewiki = wikipedia.page(title=row['title'])
        content = pagewiki.content

    except wikipedia.exceptions.DisambiguationError as e:
        # Ambiguous title: use the first suggested option. The text must be
        # stored in `content`; otherwise the previous article's text would be
        # appended again (or a NameError raised on the very first row).
        pagewiki = wikipedia.page(e.options[0])
        content = pagewiki.content

    except Exception:
        # Any other lookup failure: retry by page id. `Exception` rather than
        # a bare `except` so that Ctrl-C / kernel interrupt still stops the loop.
        pagewiki = wikipedia.page(pageid=row['pageid'])
        content = pagewiki.content
    lst_contents.append(content)
    sleep(0.01)  # small pause between requests



Exception in thread Thread-5:
Traceback (most recent call last):
  File "/Users/yuyamashita/anaconda/envs/ada/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/Users/yuyamashita/anaconda/envs/ada/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/yuyamashita/anaconda/envs/ada/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))




In [46]:
# Attach the downloaded texts as a new 'content' column (lst_contents holds one
# entry per row of sp_1000), then preview the first rows.
sp_1000['content'] = lst_contents
sp_1000.head()

Unnamed: 0,title,pageid,length,content
0,cycling,5931,54887,"Cycling, also called bicycling or biking, is t..."
1,cheerleading,6749,73813,"Cheerleading ranges from chanting, to intense ..."
2,lacrosse,18080,62729,Lacrosse is a team sport played between two te...
3,physical therapy,24022,60686,"Physical therapy (PT), mostly known as Physiot..."
4,shooting sports,28498,50932,Shooting sports is a collective group of compe...


### Category:Software engineering articles

In [58]:
# Download the plain-text content of every sampled software-engineering article.
# lst_contents ends up with exactly one entry per row of se_1000, in row order.
lst_contents = []
# iterrows() is a generator with no length, so pass `total` for a real progress bar.
for idx, row in tqdm_notebook(se_1000.iterrows(), total=len(se_1000), leave=False):
    try:
        pagewiki = wikipedia.page(title=row['title'])
        content = pagewiki.content

    except wikipedia.exceptions.DisambiguationError as e:
        # Ambiguous title. In both branches the text must be stored in
        # `content`; otherwise the previous article's text would be appended
        # again (or a NameError raised on the very first row).
        if "Boro" in e.options[0]:
            # Special case: resolve this disambiguation to the 'boro method' page.
            pagewiki = wikipedia.page('boro method')
        else:
            # Otherwise use the first suggested option.
            pagewiki = wikipedia.page(e.options[0])
        content = pagewiki.content

    except Exception:
        # Any other lookup failure: retry by page id. `Exception` rather than
        # a bare `except` so that Ctrl-C / kernel interrupt still stops the loop.
        pagewiki = wikipedia.page(pageid=row['pageid'])
        content = pagewiki.content
    lst_contents.append(content)
    sleep(0.01)  # small pause between requests



Exception in thread Thread-7:
Traceback (most recent call last):
  File "/Users/yuyamashita/anaconda/envs/ada/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/Users/yuyamashita/anaconda/envs/ada/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/yuyamashita/anaconda/envs/ada/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration





In [59]:
# Attach the downloaded texts as a new 'content' column (lst_contents holds one
# entry per row of se_1000).
se_1000['content'] = lst_contents

In [60]:
# Preview the first rows of the software-engineering sample.
se_1000.head()

Unnamed: 0,title,pageid,length,content
0,acceptance testing,3233,16735,"In engineering and its various subdisciplines,..."
1,computer programming,5311,21691,Computer programming (often shortened to progr...
2,computer program,5783,28740,A computer program is a collection of instruct...
3,context-free grammar,6759,39421,"In formal language theory, a context-free gram..."
4,code coverage,7030,16777,"In computer science, code coverage is a measur..."


## Export in csv

In [61]:
# Write each sample to its own CSV file in the current working directory.
for frame, csv_path in ((se_1000, 'softeng1000.csv'), (sp_1000, 'sports1000.csv')):
    frame.to_csv(csv_path)