## Scripts here are used for tokenization and preparing the train/dev/test sets

#### Import libraries

In [9]:
from os import listdir, getcwd
from os.path import isfile, join
from nltk.tokenize import RegexpTokenizer
import numpy as np
import re
import os

In [10]:
# These are titles, punctuations, and contractions that need to be replaced.

replacing_titles = ['Dr.','Esq.','Hon.','Jr.','Mr.','Mrs.','Ms.','Messrs.','Mmes.','Msgr.','Prof.','Rev.','Rt. Hon.','Sr.','St.']
punctuation_marks = ['"', "'", "!", "@", "#", "$", "%", "&", "*", "(", ")", "-", "?", ",", "."]
contractions_dict = {"ain't": "am not","aren't": "are not","can't": "cannot","can't've": "cannot have","'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have","hasn't": "has not","haven't": "have not","he'd": "he had","he'd've": "he would have","he'll": "he will","he'll've": "he will have","he's": "he is","how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is","I'd": "I had","I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not","it'd": "it had","it'd've": "it would have","it'll": "it will","it'll've": "iit will have","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have","needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not","oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have","she'd": "she had","she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is","should've": "should have","shouldn't": "should not","shouldn't've": "should not have","so've": "so have","so's": "so is","that'd": "that had","that'd've": "that would have","that's": "that is","there'd": "there had","there'd've": "there would have","there's": "there is","they'd": "they had","they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we had","we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have","weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have","when's": "when is","when've": "when have","where'd": "where did","where's": "where is","where've": "where have","who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have","will've": "will have","won't": "will not","won't've": "will not have","would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you will","you'll've": "you will have","you're": "you are","you've": "you have"}
country_acronyms = {"U.S": "United States", "U.S.A": "United States of America", "U.A.E": "United Arab Emirates", "U.S.S.R": "Union of Soviet Socialist Republics"}

#### Defining the tokenizer

In [11]:
def tokenize_words(input_file):
    """
    This function accomplishes four purposes:
    1. Remove the title and date (the first two rows) from the input file
    2. Remove all special characters except for . and ,
    3. Convert all characters to lower case
    4. Tokenize words
    
    Args:
        input_file (str): input file
        
    Returns:
        output_file (str): tokenized strings separated by space
    """
    # Read the file
    input_file = open(input_file, 'r').read()
    
    # Remove the title and date (the first two rows)
    
    startChar = [word.end() for word in re.finditer("\n", input_file)][1]
    input2 = input_file[startChar:]
    
    # Remove things in angle quotes which are added to account for crowd reactions
    input2 = re.sub(r"\<[^\>]*\>", '', input2)
    
    # lowercase everything so that we have less tokens to predict. i.e., no need to distinguish a vs. A
    #input2 = input2.lower()
    
    # Standardize contractions
    for k, v in contractions_dict.items():
        input2 = input2.replace(k, v) 
        
        k_caps = k[:1].upper() + k[1:]
        v_caps = v[:1].upper() + v[1:]
        
        input2 = input2.replace(k_caps, v_caps)
        
    # Replace country acronyms
    for k, v in country_acronyms.items():
        input2 = input2.replace(k, v)
        
    # Remove middle initial
    input2 = re.sub(r"([A-Z])\W ", '', input2)
    
    # Keep all the words and digitis
    # Keep only two special characters: . and ,
    # If we want to keep carriage return, add |\n
    tokenizer = RegexpTokenizer(r'\w+|[\.\,]')
    tokens = tokenizer.tokenize(input2)
    output_file = " ".join(tokens)
    output_file = output_file + "<speech_sep>"
    
    return output_file

#### Preparing the datasets

In [8]:
# Select a president to build models on
pres = ['obama']

# split_pct = [training_pct, validation_pct, test_pct]
split_pct = [.4, .4, .3]

# Set sed number
np.random.seed(266)

# Get current directory
cwd = getcwd()

for pre in pres:
    
    dir_output = f"1.DataPreparationResults/{pre}"
    
    if not os.path.exists(dir_output):
        os.makedirs(dir_output)
    
    out_train = open(f"{dir_output}/train.txt","w+")
    out_val = open(f"{dir_output}/val.txt","w+")
    out_test = open(f"{dir_output}/test.txt","w+")
    dir_president = f"CorpusOfPresidentialSpeeches/{pre}"
    
    # onlyfiles contains a list of files (not directories) under path_president
    # Reference: https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
    onlyfiles_lst = [f for f in listdir(dir_president) if isfile(join(dir_president, f))]
    num_of_files = len(onlyfiles_lst)

    # Reference: https://stackoverflow.com/questions/15511349/select-50-items-from-list-at-random-to-write-to-file/39585770
    files_train_arr = np.random.choice(onlyfiles_lst, round(num_of_files*split_pct[0]), replace=False)

    # Set substraction: https://stackoverflow.com/questions/3428536/python-list-subtraction-operation
    files_val_test_lst = list(set(onlyfiles_lst) - set(files_train_arr))
    files_val_arr = np.random.choice(files_val_test_lst, round(len(files_val_test_lst)*split_pct[1]/(split_pct[1]+split_pct[2])), replace=False)
    files_test_arr = np.array(list((set(files_val_test_lst) - set(files_val_arr))))
    
    for root, dirs, files in os.walk(dir_president, topdown=False):
        for file in files:
            path = f"{root}/{file}"
            out_text = tokenize_words(path)
            
            if file in files_train_arr:
                out_train.write(out_text)
            elif file in files_val_arr:
                out_val.write(out_text)
            elif file in files_test_arr:
                out_test.write(out_text)
    


    print('Training set:')
    print(files_train_arr)
    print('Validation set:')
    print(files_val_arr)
    print('Test set:')
    print(files_test_arr)
    print("Done")

Training set:
['obama_speeches_028.txt' 'obama_speeches_015.txt'
 'obama_speeches_018.txt' 'obama_speeches_045.txt'
 'obama_speeches_035.txt' 'obama_speeches_000.txt'
 'obama_speeches_041.txt' 'obama_speeches_039.txt'
 'obama_speeches_019.txt' 'obama_speeches_001.txt'
 'obama_speeches_003.txt' 'obama_speeches_032.txt'
 'obama_speeches_040.txt' 'obama_speeches_029.txt'
 'obama_speeches_002.txt' 'obama_speeches_014.txt'
 'obama_speeches_005.txt' 'obama_speeches_033.txt'
 'obama_speeches_016.txt']
Validation set:
['obama_speeches_004.txt' 'obama_speeches_031.txt'
 'obama_speeches_027.txt' 'obama_speeches_043.txt'
 'obama_speeches_047.txt' 'obama_speeches_017.txt'
 'obama_speeches_037.txt' 'obama_speeches_011.txt'
 'obama_speeches_007.txt' 'obama_speeches_022.txt'
 'obama_speeches_042.txt' 'obama_speeches_044.txt'
 'obama_speeches_034.txt' 'obama_speeches_048.txt'
 'obama_speeches_021.txt' 'obama_speeches_049.txt'
 'obama_speeches_010.txt']
Test set:
['obama_speeches_046.txt' 'obama_speech

In [13]:
tokenize_words("CorpusOfPresidentialSpeeches/obama/obama_speeches_001.txt")

'I stand here today humbled by the task before us , grateful for the trust you have bestowed , mindful of the sacrifices borne by our ancestors . I thank President Bush for his service to our nation , as well as the generosity and cooperation he has shown throughout this transition . Forty four Americans have now taken the presidential oath . The words have been spoken during rising tides of prosperity and the still waters of peace . Yet , every so often the oath is taken amidst gathering clouds and raging storms . At these moments , America has carried on not simply because of the skill or vision of those in high office , but because We the People have remained faithful to the ideals of our forbearers , and true to our founding documents . So it has been . So it must be with this generation of Americans . That we are in the midst of crisis is now well understood . Our nation is at war , against a far reaching network of violence and hatred . Our economy is badly weakened , a consequen

In [77]:
# It's possible that digits in the validation/test sets are not training set
# To make sure every character/digit can be converted to a number 
#     and subsequently scored appropriately for validation/test sets,
# We define chars_lst as all possible characters/digits we can observe from training/validaiton/test sets
# The code below only captures characters/digits in the training set and thus inappropriate
#     chars_lst = sorted(list(set(tokenized_file)))
# Reference: https://stackoverflow.com/questions/16060899/alphabet-range-on-python

chars_lst = [' ',',','.'] + [str(i) for i in range(10)] + [chr(i) for i in range(ord('a'),ord('z')+1)]