In [61]:
from bs4 import BeautifulSoup
import json
import re
import urllib2
import datetime
from os import walk
import csv

In [2]:
# Matches an RfC template such as {{rfctag|hist}} or {{rfcid|78126B8}}, ignoring case.
_RFC_ID = re.compile("{{rfc.*?}}", flags=re.IGNORECASE)
# Matches a single-line wiki heading wrapped in two or more "=" on each side,
# e.g. "== Title =="; the text between the "=" runs is captured as group 1.
_RFC_HEADER = re.compile(r"={2,}?([^=\t\n\r\f\v]+?)={2,}")

In [3]:
def extract_title(diff):
    """Return the heading that ends closest before the first RfC template.

    Args:
        diff: wiki text to scan.

    Returns:
        The full heading match, including its "=" delimiters, whose end is
        nearest to (and strictly before) the start of the first ``_RFC_ID``
        match; ``None`` when there is no RfC template or no heading before it.
    """
    tag_match = _RFC_ID.search(diff)
    if not tag_match:
        return None

    tag_pos = tag_match.start()
    best_gap = None
    best_heading = None
    for heading in _RFC_HEADER.finditer(diff):
        gap = tag_pos - heading.end()
        # The title is assumed to be written before the template, so a
        # heading only qualifies if it ends strictly before the tag starts.
        if gap <= 0:
            continue
        if best_gap is None or gap < best_gap:
            best_gap = gap
            best_heading = heading.group(0)
    return best_heading

In [113]:
def extract_rfc_titles(files, folder_dir, erase_folder_name, content_name, output_file_path):
    """Extract a title from each JSON file and write the hits to a CSV file.

    Args:
        files: iterable of file names (not full paths) inside ``folder_dir``.
            Names that do not end in ".json" are skipped.
        folder_dir: directory that contains the files.
        erase_folder_name: text removed from a file name (together with
            ".json") to obtain the id, for example "diff_added_".
        content_name: key of the wiki text inside each JSON object,
            for example "revision_content".
        output_file_path: CSV file to create or overwrite; it receives an
            ``id,title`` header and one row per extracted title.

    Returns:
        The set of ids for which ``extract_title`` returned ``None``.
    """
    # Local import: the notebook's import cell only pulls ``walk`` from os.
    import os

    unextracted_rfcs = set()
    fieldnames = ['id', 'title']

    # Binary mode plus utf-8 encoded values is the Python 2 csv convention
    # that this notebook relies on.
    with open(output_file_path, 'wb') as csvfile:
        title_csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        title_csv_writer.writeheader()
        for file_name in files:
            # Test the extension itself; a substring test would also accept
            # names such as "x.json.bak".
            if not file_name.endswith(".json"):
                continue

            # os.path.join works whether or not folder_dir ends with a separator.
            path = os.path.join(folder_dir, file_name)
            rfc_id = file_name.replace(erase_folder_name, "").replace(".json", "").strip()

            with open(path) as json_file:
                rfc_info = json.load(json_file)

            title = extract_title(rfc_info[content_name])
            if title is None:
                unextracted_rfcs.add(rfc_id)
            else:
                title_csv_writer.writerow({"id": rfc_id, "title": title.encode("utf-8")})

    return unextracted_rfcs

In [114]:
# Directory for the generated title CSVs. Keep the trailing slash: output file
# names are appended to this value by plain string concatenation.
_OUTPUT_FILE_PATH = "/Users/jane/rfc-analysis/titles/"

In [115]:
# Sample inputs for extract_title: t1 is a heading directly followed by an RfC
# tag; t2 is a talk-page excerpt with unrelated text (including a stray "<==")
# ahead of the heading and its RfC tags.
t1 = "== Bloody Sunday (1972): Did the Bloody Sunday Inquiry find that all those shot were \"unarmed\"? ==\n\n{{rfctag|hist}}"
t2 = " \"dubious\" tag on it, is the opposite of \"facilitating discussion\". [[User:Quigley|Quigley]] ([[User talk:Quigley|talk]]) 22:07, 11 June 2011 (UTC) \n:::: Please modify only what you disagree. Don't undo anything irrelevant to Hong Kong or Macau. [http://en.wikipedia.org/w/index.php?title=Template:List_of_Asian_capitals_by_region&dir=prev&offset=20110611084749&limit=20&action=history] <== The multiple reverts were indeed disruptive. [[Special:Contributions/119.236.250.27|119.236.250.27]] ([[User talk:119.236.250.27|talk]]) 22:16, 11 June 2011 (UTC)\n\n::: I don't know what banned user are you talking about. If you disagree with the inclusion of Hong Kong and Macau, please proceed to propose to remove them. Don't break the 3RR rule by undoing all my edits (including those unrelated to Hong Kong or Macau) and restoring to your own version while I was working to improve this template. [[Special:Contributions/119.236.250.27|119.236.250.27]] ([[User talk:119.236.250.27|talk]]) 22:09, 11 June 2011 (UTC)\n\n== Hong Kong and Macau ==\n{{rfctag|pol|hist}}\n{{rfcid|78126B8}}\nThis section is started to facilitate discussion regarding the inclusion of Hong Kong and Macau. SchmuckyTheCat has made multiple disruptive reverts to undo all my edits, including those unrelated to Hong Kong or Macau, wit"

In [116]:
# Try the extractor on the noisy sample t2.
extract_title(t2)

'== Hong Kong and Macau =='

### Keep track of the ids of revisions whose titles could not be extracted

In [117]:
# NOTE(review): no later visible cell reads this module-level set;
# extract_rfc_titles builds and returns its own. Confirm it is still needed.
unextracted_rfcs = set()

### 1. Extract titles from diffs

In [118]:
# Collect the name of every file found anywhere under the diff folder.
diff_folder_dir = "/Users/jane/rfc-analysis/diffs/"
diff_files = [
    name
    for _root, _subdirs, names in walk(diff_folder_dir)
    for name in names
]

In [119]:
# Number of files found under the diff folder.
len(diff_files)

10456

In [121]:
# Pass 1: extract titles from the "diff" field of each diff file, writing hits
# to diff_based.csv and keeping the ids that yielded no title.
diff_unextracted_rfcs = extract_rfc_titles(
    diff_files,
    diff_folder_dir,
    "diff_added_",
    "diff",
    _OUTPUT_FILE_PATH + "diff_based.csv",
)

In [122]:
# Number of ids for which the diff pass found no title.
len(diff_unextracted_rfcs)

2157

#### In total, the titles of 2157 RfCs could not be extracted from the diffs

### 2. Extract titles from the full revision content

In [123]:
# get the file names
# Collect the name of every file found anywhere under the all_content folder.
_all_rev_folder_dir = "/Users/jane/rfc-analysis/all_content/"
_all_rev_files = [
    name
    for _root, _subdirs, names in walk(_all_rev_folder_dir)
    for name in names
]

#### Only process the ids whose titles were not extracted from the diffs

In [124]:
# first get the ids
# Build the all_content file name for every id the diff pass could not title.
# The loop variable is deliberately not named `id`, so the builtin id() is not
# shadowed for the cells that follow.
left_files = {
    "all_content_" + str(rfc_id) + ".json"
    for rfc_id in diff_unextracted_rfcs
}

In [125]:
# Number of all_content files queued for the second pass.
len(left_files)

2157

In [126]:
# Pass 2: extract titles from the "revision_content" field of the remaining
# files, writing hits to revcontent_based.csv and keeping the ids that still
# yielded no title.
all_content_unextracted_rfcs = extract_rfc_titles(
    left_files,
    _all_rev_folder_dir,
    "all_content_",
    "revision_content",
    _OUTPUT_FILE_PATH + "revcontent_based.csv",
)

In [127]:
# Number of ids for which the second pass also found no title.
len(all_content_unextracted_rfcs)

1217

### The RfCs whose titles remain unextracted are mostly ones on the Request for Comment pages (found through manual inspection).