In [1]:
import re
import os
import arrow
import pandas as pd
from tqdm import tqdm_notebook
from IPython.display import display, HTML, FileLink, clear_output

In [2]:
def get_texts(harvest):
    '''
    Lazily yield the path of each '.txt' file in a harvest.

    Files are looked up in data/<harvest>/text (relative to the current
    working directory) and yielded in whatever order os.listdir returns them.
    '''
    text_dir = os.path.join('data', harvest, 'text')
    for name in os.listdir(text_dir):
        # Skip anything that is not a plain-text file
        if not name.endswith('.txt'):
            continue
        yield os.path.join(text_dir, name)
            
def get_total_files(harvest):
    '''
    Count the '.txt' files in data/<harvest>/text.

    Returns an int; used to give the progress bar a total.
    '''
    text_dir = os.path.join('data', harvest, 'text')
    return sum(1 for name in os.listdir(text_dir) if name.endswith('.txt'))


def get_words(string, direction, window):
    '''
    Tokenise a string and return the words nearest to a search term.

    Parameters:
        string: text lying immediately before or after the term.
        direction: 'before' returns the LAST `window` words of the string,
            'after' returns the FIRST `window` words.
        window: maximum number of words to return. Fewer are returned if
            the string contains fewer words; zero or less returns [].

    Returns:
        A list of words (runs of \\w characters) in their original order.

    Raises:
        ValueError: if direction is neither 'before' nor 'after'.
    '''
    if direction not in ('before', 'after'):
        raise ValueError("direction must be 'before' or 'after', not {!r}".format(direction))
    # Guard explicitly: word_list[-0:] is word_list[0:], i.e. the WHOLE list,
    # so a zero window would otherwise return every word for 'before'.
    if window <= 0:
        return []
    word_list = re.findall(r'\w+', string)
    # Slices never raise IndexError; a short list is simply returned whole.
    if direction == 'before':
        return word_list[-window:]
    return word_list[:window]

def get_contexts(filename, term, window, context_chars=50):
    '''
    Find every occurrence of a term in a text file and collect its context.

    Matching first, then tokenising only the surrounding characters, seemed
    much quicker than using a regular expression to capture a specific
    number of words on each side.

    Parameters:
        filename: path of the text file to search.
        term: word to look for, matched case-insensitively on word
            boundaries. It is inserted into the pattern as-is, so regex
            metacharacters in it are treated as pattern syntax.
        window: maximum number of words to keep on each side of the term.
        context_chars: number of characters of context taken on each side
            of the match (default 50). Words are drawn from these
            characters only, so the outermost word may be cut short and
            fewer than `window` words may be available.

    Returns:
        A list of (kwic, before, after) tuples, one per match, where kwic
        is the keyword-in-context string and before/after are lists of words.
    '''
    contexts = []
    with open(filename, 'r') as text_file:
        content = text_file.read().replace('\n', ' ')
    for match in re.finditer(r'\b{}\b'.format(term), content, re.IGNORECASE):
        start, end = match.span()
        # Clamp at zero: a negative slice start counts from the END of the
        # string, which would give a wrong (usually empty) KWIC for matches
        # near the beginning of the file.
        left = max(0, start - context_chars)
        right = end + context_chars
        # Get KWIC string
        kwic = content[left:right]
        # Slice the text on either side of the match directly, so the term
        # itself can never leak into the word lists when the match sits
        # close to the start or end of the file.
        before = get_words(content[left:start], 'before', window)
        after = get_words(content[end:right], 'after', window)
        contexts.append((kwic, before, after))
    return contexts

def save_as_csv(df, term, window, harvest):
    '''
    Write the collected contexts to a CSV file and display a link to it.

    The 'before' and 'after' columns hold lists of words; each list is
    flattened into a single pipe-separated string before saving. The input
    dataframe is left untouched. The file is named from the harvest, term
    and window, and is written relative to the current working directory.
    '''
    column_order = ['article_id', 'newspaper_id', 'date', 'kwic', 'before', 'after']
    export = df.assign(
        before=df['before'].str.join(sep='|'),
        after=df['after'].str.join(sep='|'),
    )[column_order]
    csv_name = 'newspapers-{}-{}-words-{}.csv'.format(harvest, term, window)
    export.to_csv(csv_name, index=False)
    display(FileLink(csv_name))

def get_all_contexts(harvest, term, window=2):
    '''
    Collect the context of a term across every text file in a harvest
    and save the results as a CSV.

    Parameters:
        harvest: name of the harvest directory under data/.
        term: word to search for (see get_contexts for matching rules).
        window: maximum number of words to keep on each side of the term.

    One row is produced per match, with the article id, newspaper id and
    date taken from the file name. Nothing is returned; the results are
    passed to save_as_csv.
    '''
    columns = ['article_id', 'newspaper_id', 'date', 'kwic', 'before', 'after']
    all_contexts = []
    total = get_total_files(harvest)
    for filename in tqdm_notebook(get_texts(harvest), total=total):
        # The file name (minus '.txt') must split on '-' into exactly three
        # parts: a YYYYMMDD date, the newspaper id and the article id.
        date, newspaper_id, article_id = os.path.basename(filename)[:-4].split('-')
        date = arrow.get(date, 'YYYYMMDD').format('YYYY-MM-DD')
        for kwic, before, after in get_contexts(filename, term, window):
            all_contexts.append({
                'article_id': article_id,
                'newspaper_id': newspaper_id,
                'date': date,
                'kwic': kwic,
                'before': before,
                'after': after,
            })
    # Name the columns explicitly: with no matches, DataFrame([]) would have
    # no columns at all and save_as_csv would fail with a KeyError.
    df = pd.DataFrame(all_contexts, columns=columns)
    save_as_csv(df, term, window, harvest)

In [3]:
# Search the texts in data/1548377003/text for 'immigrants', keeping up to five
# words either side of each match. Saves newspapers-1548377003-immigrants-words-5.csv.
get_all_contexts('1548377003', 'immigrants', window=5)

HBox(children=(IntProgress(value=0, max=505296), HTML(value='')))




In [4]:
# Search the texts in data/1548372464/text for 'aliens', keeping up to five
# words either side of each match. Saves newspapers-1548372464-aliens-words-5.csv.
get_all_contexts('1548372464', 'aliens', window=5)

HBox(children=(IntProgress(value=0, max=187954), HTML(value='')))


