In [3]:
import re
import os
import arrow
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook
from IPython.display import display, HTML, FileLink, clear_output

In [6]:
# Root directory of the downloaded XML files.
# get_texts() builds paths of the form <data_dir>/<house>/<year>/<id>.xml.
# NOTE(review): this is an absolute, machine-specific path — change it to
# wherever the XML files live before running the notebook.
data_dir = '/Volumes/bigdata/mydata/Hansard/xml/'

def get_ids_from_csv(csv_file):
    '''
    Extract the unique speech ids from the `speech_url` column of a CSV file.

    An id is the url path segment shaped like
    `<8 digits>_<reps|senate>_<digits>_<word characters>`,
    for example `19010521_reps_1_1`.

    Parameters:
        csv_file: anything `pandas.read_csv` accepts; the file must contain
            a `speech_url` column.

    Returns:
        A list of unique ids, in order of first appearance in the file, so
        repeated runs process files in the same order.

    Rows whose url is empty are ignored. Rows whose url does not contain a
    recognisable id are skipped with a warning instead of aborting the run.
    '''
    import warnings

    # Only the url column is used, so don't load (or date-parse) anything else.
    df = pd.read_csv(csv_file, usecols=['speech_url'])
    id_pattern = re.compile(r'\/(\d{8}_(reps|senate|REPS|SENATE)_\d+_\w+)\/')

    ids = []
    seen = set()
    for url in df['speech_url'].dropna():
        match = id_pattern.search(str(url))
        if match is None:
            warnings.warn('Skipping url with no recognisable speech id: {!r}'.format(url))
            continue
        speech_id = match.group(1)
        # De-duplicate while keeping first-seen order (a plain set would
        # return the ids in an arbitrary order).
        if speech_id not in seen:
            seen.add(speech_id)
            ids.append(speech_id)
    return ids

def get_texts(csv_file):
    '''
    Yield the expected XML file path for each unique speech id in a CSV file.

    Paths are built as `<data_dir>/<house>/<year>/<id>.xml`, where `year` is
    the first four characters of the id and `house` is the lower-cased house
    name from the id, with 'reps' mapped to the 'hofreps' directory.

    Parameters:
        csv_file: passed straight to `get_ids_from_csv`.

    Yields:
        One file path per id. The paths are not checked for existence.
    '''
    house_pattern = re.compile(r'\d{8}_((reps|senate|REPS|SENATE))_\w+')
    for speech_id in get_ids_from_csv(csv_file):
        year = speech_id[:4]
        house = house_pattern.search(speech_id).group(1).lower()
        # The House of Representatives files live in a 'hofreps' directory.
        if house == 'reps':
            house = 'hofreps'
        yield os.path.join(data_dir, house, year, '{}.xml'.format(speech_id))
        
def get_total_files(csv_file):
    '''
    Count the unique speech ids listed in the CSV file.

    This is the number of file paths `get_texts` will yield for the same
    CSV, which makes it suitable as a progress-bar total.
    '''
    unique_ids = get_ids_from_csv(csv_file)
    return len(unique_ids)
        
def get_words(string, direction, window):
    '''
    Return up to `window` words from one end of a string.

    Words are runs of `\\w` characters, so punctuation and whitespace are
    dropped.

    Parameters:
        string: the text to tokenise.
        direction: 'before' returns the LAST `window` words (the words
            leading up to a term); 'after' returns the FIRST `window` words
            (the words following a term).
        window: maximum number of words to return. Zero or a negative value
            returns an empty list.

    Returns:
        A list of words in their original order; shorter than `window` when
        the string has fewer words.

    Raises:
        ValueError: if `direction` is neither 'before' nor 'after'.
    '''
    if direction not in ('before', 'after'):
        raise ValueError("direction must be 'before' or 'after', got {!r}".format(direction))
    # Guard explicitly: word_list[-0:] would be the whole list, not an empty one.
    if window <= 0:
        return []
    word_list = re.findall(r'\w+', string)
    if direction == 'before':
        return word_list[-window:]
    return word_list[:window]

def get_contexts(filename, term, window, context_chars=50):
    '''
    Find every occurrence of a term in the <para> elements of an XML file
    and return its surrounding context.

    The approach is to match the term first, cut a fixed number of
    characters either side, and only then tokenise those short strings;
    this seemed much quicker than using a regex to capture a specific
    number of words.

    Parameters:
        filename: path of the XML file to read.
        term: the term to look for, matched case-insensitively on word
            boundaries.
            NOTE(review): the term is inserted into the regex unescaped, so
            any regex metacharacters in it are treated as regex syntax.
        window: maximum number of words to return before and after the term.
        context_chars: number of characters kept either side of the term.

    Returns:
        A list of `(kwic, before, after)` tuples, one per match, where
        `kwic` is the term with up to `context_chars` characters either
        side, and `before` / `after` are lists of up to `window` words taken
        from the text preceding / following the term.
    '''
    # Compile once and reuse for both the paragraph filter and the matching.
    pattern = re.compile(r'\b{}\b'.format(term), re.IGNORECASE)
    contexts = []
    with open(filename, 'r') as text_file:
        soup = BeautifulSoup(text_file.read(), 'lxml')
    # NOTE(review): BeautifulSoup's `string=` filter only looks at tags that
    # have a single string child, so paragraphs with nested markup are
    # presumably skipped here — confirm this is intended.
    for para in soup.find_all('para', string=pattern):
        text = para.get_text()
        for match in pattern.finditer(text):
            start, end = match.start(), match.end()
            # Clamp at zero: a negative slice start counts from the END of
            # the string, which produced empty or wrong contexts for matches
            # near the start of a paragraph.
            left = max(0, start - context_chars)
            right = end + context_chars
            # Get KWIC string
            kwic = text[left:right]
            # Slice the text either side of the match directly, so the term
            # itself never leaks into the word lists when the paragraph has
            # fewer than context_chars characters on that side.
            before = get_words(text[left:start], 'before', window)
            after = get_words(text[end:right], 'after', window)
            contexts.append((kwic, before, after))
    return contexts

def save_as_csv(df, term, window):
    '''
    Write the contexts to a CSV file and display a link to it.

    The `before` and `after` word lists are flattened into pipe-separated
    strings, and only the id, date, kwic, before and after columns are
    written. The file is named after the term and window size. The input
    dataframe is left unmodified.
    '''
    output_columns = ['id', 'date', 'kwic', 'before', 'after']
    # assign() returns a new frame, so the caller's dataframe is untouched.
    export = df.assign(
        before=df['before'].str.join(sep='|'),
        after=df['after'].str.join(sep='|'),
    )[output_columns]
    csv_name = 'hansard-{}-words-{}.csv'.format(term, window)
    export.to_csv(csv_name, index=False)
    display(FileLink(csv_name))

def get_all_contexts(csv_file, term, window=5):
    '''
    Collect the contexts of a term across every XML file referenced in a
    CSV file, then save them with `save_as_csv`.

    Parameters:
        csv_file: CSV file listing the speeches to search (see
            `get_ids_from_csv`).
        term: the term to look for.
        window: maximum number of words to keep before and after the term.

    One row is recorded per match, holding the file name (`id`), the date
    and house taken from the file name, the KWIC string, and the lists of
    words before and after the term. Nothing is returned.
    '''
    columns = ['id', 'date', 'house', 'kwic', 'before', 'after']
    house_pattern = re.compile(r'\d{8}_((reps|senate|REPS|SENATE))_\w+')
    all_contexts = []
    total = get_total_files(csv_file)
    for text in tqdm_notebook(get_texts(csv_file), total=total):
        # Date and house are both encoded in the file name; use the base
        # name so that directory names can never be matched by mistake.
        file_name = os.path.basename(text)
        date = arrow.get(file_name[:8], 'YYYYMMDD').format('YYYY-MM-DD')
        house = house_pattern.search(file_name).group(1).lower()
        for kwic, before, after in get_contexts(text, term, window):
            all_contexts.append({
                'id': file_name,
                'date': date,
                'house': house,
                'kwic': kwic,
                'before': before,
                'after': after,
            })
    # Name the columns explicitly so that a search with no matches still
    # produces a frame with the expected columns (and a header-only CSV)
    # rather than a KeyError when saving.
    df = pd.DataFrame(all_contexts, columns=columns)
    save_as_csv(df, term, window)

In [7]:
# Collect the contexts of 'aliens' (up to 5 words either side) and save them to hansard-aliens-words-5.csv
get_all_contexts('hansard_aliens.csv', 'aliens', window=5)

HBox(children=(IntProgress(value=0, max=1237), HTML(value='')))




In [8]:
# Collect the contexts of 'immigrants' (up to 5 words either side) and save them to hansard-immigrants-words-5.csv
get_all_contexts('hansard_immigrants.csv', 'immigrants', window=5)

HBox(children=(IntProgress(value=0, max=2751), HTML(value='')))


