In [None]:
from bs4 import BeautifulSoup
import json
import re
import urllib2
import datetime
from os import walk

### Example of URL we have to find:
https://en.wikipedia.org/wiki/Talk:Antifa_(United_States)/Archive_4#Request_for_comment

### The URL we will use:

https://en.wikipedia.org/w/index.php?search=RfC%3A_Should_%7B%7Binfobox_criminal%7D%7D_be_used_in_this_article%3F+prefix%3ATalk%3AAloysius+Stepinac&title=Special:Search&profile=default&fulltext=1&searchToken=mhp1jccsq7yxe23x5n6z1r2w

In [None]:
# Collect the name of every file found anywhere below the diff folder
# (names only, without their directory part).
diff_folder_dir = "/Users/jane/rfc-analysis/diffs/"
diff_files = [
    name
    for _dirpath, _dirnames, names in walk(diff_folder_dir)
    for name in names
]

In [None]:
def find_rfc_url(rfc_info):
    """Look up the URL candidates for a single RfC record.

    rfc_info is a mapping that must contain "parsed_rfc_title" (used as the
    section anchor to look for) and "page_title" (the page to search under).
    Both values are handed to find_rfc_url_from_archive and its result is
    returned unchanged.
    """
    return find_rfc_url_from_archive(
        rfc_info["parsed_rfc_title"],
        rfc_info["page_title"]
    )

In [None]:
def find_rfc_url_from_archive(rfc_anchor, rfc_page_title):
    """Return the first search candidate whose anchor equals rfc_anchor.

    The candidates come from get_rfc_url_candidates. For each one, the text
    after the last "#" is compared (ignoring surrounding whitespace) with
    rfc_anchor; only an exact match is accepted.

    Returns a one-element list holding the stripped URL of the first exact
    match, or None when no candidate matches.

    NOTE(review): because the function returns on the first match, the result
    never holds more than one URL, so callers cannot see duplicate section
    titles spread over several archive pages. Confirm whether that is intended.
    """
    for candidate in get_rfc_url_candidates(rfc_anchor, rfc_page_title):
        fragment = candidate.rsplit("#", 1)[-1]
        if fragment.strip() == rfc_anchor.strip():
            # exact anchor match: stop searching and report this URL
            return [candidate.strip()]
    return None

In [None]:
def get_rfc_url_candidates(rfc_anchor, rfc_page_title):
    """Query Wikipedia's Special:Search and collect candidate section links.

    The search term is rfc_anchor (stripped) and the search is restricted
    with the "prefix" parameter set to rfc_page_title. Spaces in the URL are
    replaced by underscores and any "[[" / "]]" wiki-link brackets are
    removed before the request is sent.

    Returns a list with the stripped href of the link inside every
    "searchalttitle" element found under a "mw-search-result-heading"
    element (presumably the section-level hit of each search result --
    confirm against the live markup). Results without such a link are
    skipped. The list is empty when nothing qualifies.

    Raises whatever urllib2.urlopen raises (e.g. urllib2.HTTPError).
    """
    archive_search_url = (
        "https://en.wikipedia.org/w/index.php?search=" + rfc_anchor.strip()
        + "&prefix=" + rfc_page_title
        + "&title=Special:Search&profile=default&fulltext=1"
    )
    # have to do this because of the page title
    archive_search_url = archive_search_url.replace(" ", "_").encode('utf-8')
    archive_search_url = get_rid_of_links(archive_search_url)

    archive_search_page = urllib2.urlopen(archive_search_url)
    try:
        # BeautifulSoup reads the whole response while parsing, so the
        # connection can be released as soon as the soup is built.
        soup = BeautifulSoup(archive_search_page, 'lxml')
    finally:
        archive_search_page.close()

    candidates = []
    for heading in soup.find_all(class_='mw-search-result-heading'):
        alt_title = heading.find(class_='searchalttitle')
        if alt_title is None:
            continue
        link = alt_title.a
        href = link.get('href') if link is not None else None
        if href is None:
            # Malformed result: skip it instead of throwing away the
            # candidates that were already collected.
            continue
        candidates.append(href.strip())
    return candidates

In [None]:
def get_rid_of_links(keyword):
    """Strip wiki-link brackets ("[[" and "]]") from keyword and return it.

    Needed for section titles that contain links, such as
    ==Proposed redirect to [[Occupy Wall Street#Anthony Bologna]]==, whose URL is
    https://en.wikipedia.org/wiki/Talk:Pepper_spraying_of_the_Occupy_Wall_Street_demonstrators#Proposed_redirect_to_Occupy_Wall_Street#Anthony_Bologna
    """
    for bracket in ("[[", "]]"):
        keyword = keyword.replace(bracket, "")
    return keyword

In [None]:
# Sanity check: the "[[" and "]]" brackets should be stripped, leaving
# "==Proposed redirect to Occupy Wall Street#Anthony Bologna==".
get_rid_of_links("==Proposed redirect to [[Occupy Wall Street#Anthony Bologna]]==")

Function that iterates over the files and tries to find the corresponding URL for each one.

In [None]:
def find_batch_urls(rfc_files, folder_dir, single_url_found, multiple_url_found, url_unfound, file_name_to_replace):
    """Resolve the URL of every RfC JSON file in rfc_files.

    Parameters:
        rfc_files: iterable of file names; names without ".json" are ignored.
        folder_dir: directory prefix, concatenated directly with each file
            name (so it must end with a path separator).
        single_url_found: dict, RfC id -> URL, for RfCs with exactly one URL.
        multiple_url_found: dict, RfC id -> list of URLs, for RfCs with
            several candidate URLs.
        url_unfound: set of RfC ids for which no URL was found or the lookup
            raised urllib2.HTTPError.
        file_name_to_replace: file-name prefix removed (together with
            ".json") to derive the id used for the "already resolved" check.

    The three containers are updated in place and also returned as a tuple
    (single_url_found, multiple_url_found, url_unfound).

    Files whose derived id is already a key of single_url_found or
    multiple_url_found are skipped, as are files whose JSON has no
    "parsed_rfc_title" entry.
    """
    for file_name in rfc_files:
        if ".json" not in file_name:
            continue

        file_rfc_id = file_name.replace(file_name_to_replace, "").replace(".json", "")
        if file_rfc_id in single_url_found or file_rfc_id in multiple_url_found:
            continue

        with open(folder_dir + file_name) as rfc_file:
            rfc_info = json.load(rfc_file)
        if "parsed_rfc_title" not in rfc_info:
            continue

        # Results are keyed by the id stored inside the JSON, not by the id
        # derived from the file name. NOTE(review): the skip check above only
        # works if both are equal (same value and type) -- confirm.
        rfc_id = rfc_info["id"]
        try:
            urls = find_rfc_url(rfc_info)
        except urllib2.HTTPError:
            url_unfound.add(rfc_id)
            continue

        if not urls:
            # None or an empty list both mean that nothing was found.
            url_unfound.add(rfc_id)
        elif len(urls) > 1:
            multiple_url_found[rfc_id] = urls
        else:
            single_url_found[rfc_id] = urls[0]
    return single_url_found, multiple_url_found, url_unfound

### DIFFS

In [None]:
# Resolve the URLs for every diff file collected above.
# NOTE(review): single_url_found, multiple_url_found and url_unfound are not
# created in any cell visible in this notebook; they must already exist in the
# kernel (two dicts and a set, judging by how find_batch_urls uses them) --
# confirm where they come from so the notebook survives Restart & Run All.
single_url_found, multiple_url_found, url_unfound = find_batch_urls(diff_files, diff_folder_dir, single_url_found, multiple_url_found, url_unfound, "diff_added_")

## Unfound retry

In [None]:
# Load the previously saved list of RfC ids that could not be resolved.
with open("/Users/jane/rfc-analysis/diff_url_unfound.json") as unfound_json:
    unfound_retry = json.load(unfound_json)

In [None]:
# Fresh result containers for the retry pass: single-URL hits, multi-URL hits
# and ids that are still unresolved.
sf = {}
mf = {}
uf = set()

# Rebuild the diff file name of every unresolved id. The loop variable is not
# called `id`, so the builtin id() is not overwritten for the rest of the session.
unfound_retry_files = [
    "diff_added_" + retry_id + ".json" for retry_id in unfound_retry
]

In [None]:
# Second attempt: run the same lookup only on the files of the unresolved ids,
# collecting the outcome in the separate sf / mf / uf containers.
sf, mf, uf = find_batch_urls(unfound_retry_files, diff_folder_dir, sf, mf, uf, "diff_added_")

In [None]:
# Number of RfCs that resolved to a single URL during the retry pass.
len(sf)

### Results:
#### There were no issues caused by identical section titles.
#### In total, 6,712 new URLs were found in stage 1.

In [None]:
# Number of distinct URLs among the single-match results; a value smaller than
# len(single_url_found) would mean several RfC ids share the same URL.
len(set(single_url_found.values()))

In [None]:
# Number of distinct candidate-URL groups among the multi-match results.
# find_batch_urls stores a list of URLs per RfC id, and lists are unhashable,
# so each value is turned into a tuple before de-duplicating.
len(set(tuple(urls) for urls in multiple_url_found.values()))

In [None]:
# Persist the single-match results (RfC id -> URL) as JSON.
with open("/Users/jane/rfc-analysis/diff_url_single.json", "w") as single_out:
    json.dump(single_url_found, single_out)

In [None]:
# Persist the unresolved ids; the set is converted to a list because JSON has
# no set type. This overwrites the file that the retry section reads.
with open("/Users/jane/rfc-analysis/diff_url_unfound.json", "w") as unfound_out:
    json.dump(list(url_unfound), unfound_out)