# Compile text-analysis matrix

In [4]:
import sys, os, json, string
import text_analysis_util as TAU
import pandas as pd

sys.path.append('..')

import CMUSounds
import LTS_util

In [58]:
# Key of the passage to analyse; used to index item_to_rough_tokens.
ITEM_NUMBER = '330'
# Prefix prepended to the output file name (presumably a run date -- confirm).
OUTPUT_PREFIX = '20200820_'
# Directory part of the output location; note the trailing slash.
OUTPUT_PATH = '../output/text-analysis-matrix/'

# Full output location: <OUTPUT_PATH><OUTPUT_PREFIX><ITEM_NUMBER>.tsv
OUTPUT_FILE_PATH = ''.join((OUTPUT_PATH, OUTPUT_PREFIX, ITEM_NUMBER, '.tsv'))

In [5]:
# TSV read below as two tab-separated columns per line: item key and a
# quote-wrapped passage.
PASSAGES_WITH_LINE_BREAKS = '../data/moby-passages-36/passages-with-line-breaks.tsv'
# TSV read below as five tab-separated columns per line (first line skipped);
# the third column is the item key and the fifth the reference recstring.
PASSAGES_WITH_RECSTRING = '../data/moby-passages-36/passages-with-line-break-and-recstring.tsv'

# Shared lookup objects used by the FEATURE_FN_* functions via .get(word).
CMU = CMUSounds.CMUSounds('../data/general-resources/cmudict-0.7b.txt')
LTS = LTS_util.LTS_util()

In [6]:
# Reference recstrings (as token lists) keyed by item, taken from the
# pre-computed TSV. The recstring column is not checked for wrapping quotes.
TEST_item_to_recstring = {}
with open(PASSAGES_WITH_RECSTRING) as f:
    next(f)  # the first line is skipped rather than parsed
    for line in f:
        # Exactly five tab-separated fields; only the 3rd and 5th are used.
        _, _, item, _, recstring = line.strip().split('\t')
        TEST_item_to_recstring[item] = recstring.split()

# Tokenize every passage and check that both recstring conversions offered by
# TAU (whole-list and token-by-token) reproduce the reference recstring.
item_to_rough_tokens = {}
with open(PASSAGES_WITH_LINE_BREAKS) as f:
    for line in f:
        item, full_passage = line.strip().split('\t')

        # The passage must be wrapped in a matching pair of quote characters.
        opening, closing = full_passage[0], full_passage[-1]
        assert opening == closing and opening in ("'", '"'), 'Ill-formed passage'
        full_passage = full_passage[1:-1]

        tokenized_passage = TAU.text_analysis_tokenize(full_passage)
        item_to_rough_tokens[item] = tokenized_passage

        recstring = TAU.turn_tokens_to_recstring(tokenized_passage)
        expected = TEST_item_to_recstring[item]

        # Per-token conversion; tokens that convert to a falsy value are dropped.
        expected2 = [
            converted
            for converted in map(TAU.turn_token_to_recstring_format, tokenized_passage)
            if converted
        ]

        assert expected == recstring, (recstring, expected)
        assert expected2 == recstring, (recstring, expected2)
    print('Works!')

Works!


In [7]:
# for item, rough_tokens in item_to_rough_tokens.items():
#     for token in rough_tokens:
#         if TAU.given_word_test_if_word_is_end_of_sentence(token):
#             print(token)

## Step I.

Collect source passage-dependent text-analysis features first (line break? paragraph break? preceded/followed by punctuation?)

In [8]:
def get_source_passage_dependent_features(source_passage):
    """Derive position features for every word token of a tokenized passage.

    Tokens equal to ``'*'`` or ``'$'`` are treated as markers, not words, and
    produce no output row. A word directly next to ``'*'`` is flagged as both
    a paragraph and a line boundary; a word directly next to ``'$'`` is
    flagged as a line boundary only. When the first (last) token of the
    passage is a word, it is flagged as the start (end) of a line, paragraph
    and sentence.

    A word starts a sentence when the nearest preceding *word* token is
    judged sentence-final by
    ``TAU.given_word_test_if_word_is_end_of_sentence``, or when no word
    precedes it at all.

    Args:
        source_passage: Indexable sequence of token strings.

    Returns:
        A list with one ``(token, features)`` tuple per word token, in
        passage order. ``token`` is the word passed through
        ``TAU.turn_token_to_recstring_format``; ``features`` is a dict of
        seven booleans keyed ``word_is_start_of_line``,
        ``word_is_end_of_line``, ``word_is_start_of_paragraph``,
        ``word_is_end_of_paragraph``, ``word_contains_punctuation``,
        ``word_is_start_of_sentence`` and ``word_is_end_of_sentence``.
        An empty or marker-only passage yields ``[]``.
    """
    MARKERS = ('*', '$')
    PUNCTUATION = set(string.punctuation)  # built once, not once per token
    LAST_INDEX = len(source_passage) - 1
    rv = []
    for idx, token in enumerate(source_passage):
        if token in MARKERS:
            continue

        # features
        word_is_start_of_line      = False
        word_is_end_of_line        = False
        word_is_start_of_paragraph = False
        word_is_end_of_paragraph   = False

        word_contains_punctuation  = False
        word_is_start_of_sentence  = False
        word_is_end_of_sentence    = False

        if idx == 0:
            word_is_start_of_line      = True
            word_is_start_of_paragraph = True
            word_is_start_of_sentence  = True

        if idx == LAST_INDEX:
            word_is_end_of_line        = True
            word_is_end_of_paragraph   = True
            word_is_end_of_sentence    = True

        if idx != 0:
            # Line/paragraph flags depend only on the immediately preceding
            # token.
            prev_token = source_passage[idx - 1]
            if prev_token == '*':
                word_is_start_of_paragraph = True
                word_is_start_of_line = True
            elif prev_token == '$':
                word_is_start_of_line = True

            # The sentence test needs the nearest preceding *word*. Walk back
            # over any markers and stop at the start of the passage, so the
            # lookup never reaches a negative index (which would silently
            # wrap around to the end of the passage).
            look_back = idx - 1
            while look_back >= 0 and source_passage[look_back] in MARKERS:
                look_back -= 1
            if look_back < 0:
                # Only markers precede this word: it opens the passage.
                word_is_start_of_sentence = True
            elif TAU.given_word_test_if_word_is_end_of_sentence(
                    source_passage[look_back]):
                word_is_start_of_sentence = True

        if idx != LAST_INDEX:
            next_token = source_passage[idx + 1]
            if next_token == '*':
                word_is_end_of_paragraph = True
                word_is_end_of_line = True
            if next_token == '$':
                word_is_end_of_line = True

        if TAU.given_word_test_if_word_is_end_of_sentence(token):
            word_is_end_of_sentence = True

        if PUNCTUATION & set(token):
            word_contains_punctuation = True

        features = {
            'word_is_start_of_line': word_is_start_of_line,
            'word_is_end_of_line': word_is_end_of_line,
            'word_is_start_of_paragraph': word_is_start_of_paragraph,
            'word_is_end_of_paragraph': word_is_end_of_paragraph,
            'word_contains_punctuation': word_contains_punctuation,
            'word_is_start_of_sentence': word_is_start_of_sentence,
            'word_is_end_of_sentence': word_is_end_of_sentence,
        }
        token_processed = TAU.turn_token_to_recstring_format(token)
        rv.append((token_processed, features))
    return rv

In [9]:
# Smoke run: extract the passage-dependent features for item '330' and keep
# the intermediate results under TEST_* names for inspection.
TEST_passage_rough_tokens = item_to_rough_tokens['330']
TEST_passage_feature_list = get_source_passage_dependent_features(TEST_passage_rough_tokens)

In [10]:
def add_feature(data, feature_function, feature_name):
    """Attach one computed feature to every ``(word, features)`` entry.

    ``feature_function`` is called with each word and its result is stored
    in that entry's feature dict under ``feature_name``. The update happens
    in place: the dicts are mutated, each list slot is re-assigned, and the
    very same list object is returned.
    """
    for position, entry in enumerate(data):
        token, feature_map = entry
        feature_map[feature_name] = feature_function(token)
        data[position] = (token, feature_map)
    return data

In [11]:
def FEATURE_FN_WORD_LENGTH(word):
    """Return how many characters of ``word`` are alphabetic."""
    return sum(1 for character in word if character.isalpha())

In [12]:
def FEATURE_FN_CMU_LENGTH(word):
    """Count the whitespace-separated items in ``CMU.get(word)``.

    Presumably one item per phoneme of the pronunciation -- confirm against
    CMUSounds.
    """
    pronunciation = CMU.get(word)
    return len(pronunciation.split())

In [24]:
def _lts_value_present(word, value):
    """Report whether ``value`` occurs in the ``'lts'`` entry for ``word``."""
    return value in LTS.get(word)['lts']

def FEATURE_FN_LTS_1(word):
    """True when 1 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 1)

def FEATURE_FN_LTS_2(word):
    """True when 2 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 2)

def FEATURE_FN_LTS_3(word):
    """True when 3 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 3)

def FEATURE_FN_LTS_4(word):
    """True when 4 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 4)

def FEATURE_FN_LTS_5(word):
    """True when 5 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 5)

def FEATURE_FN_LTS_6(word):
    """True when 6 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 6)

def FEATURE_FN_LTS_7(word):
    """True when 7 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 7)

def FEATURE_FN_LTS_8(word):
    """True when 8 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 8)

def FEATURE_FN_LTS_9(word):
    """True when 9 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 9)

def FEATURE_FN_LTS_10(word):
    """True when 10 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 10)

def FEATURE_FN_LTS_11(word):
    """True when 11 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 11)

def FEATURE_FN_LTS_12(word):
    """True when 12 is among the ``'lts'`` values looked up for ``word``."""
    return _lts_value_present(word, 12)

In [26]:
def FEATURE_FN_MORPHS(word):
    """Return the ``'n_morphs'`` field of the LTS lookup for ``word``."""
    lts_entry = LTS.get(word)
    return lts_entry['n_morphs']
    
def FEATURE_FN_DECODABLE(word):
    """Return the ``'decodable'`` field of the LTS lookup for ``word``."""
    lts_entry = LTS.get(word)
    return lts_entry['decodable']

In [54]:
def FEATURE_FN_SIGHTWORD_PP(word):
    """True when -1 occurs in the ``'sight_word'`` entry for ``word``."""
    sight_word_values = LTS.get(word)['sight_word']
    return -1 in sight_word_values

def FEATURE_FN_SIGHTWORD_P(word):
    """True when 0 or -1 occurs in the ``'sight_word'`` entry for ``word``."""
    sight_word_values = LTS.get(word)['sight_word']
    if 0 in sight_word_values:
        return True
    return -1 in sight_word_values

def FEATURE_FN_SIGHTWORD_1(word):
    """True when the ``'sight_word'`` entry for ``word`` differs from ``[]``."""
    sight_word_values = LTS.get(word)['sight_word']
    return sight_word_values != []

In [55]:
# Start from the passage-dependent features of the selected item; each entry
# is a (token, features-dict) tuple.
PASSAGE_ROUGH_TOKENS = item_to_rough_tokens[ITEM_NUMBER]
PASSAGE_TOKENS_AND_FEATURE_LISTS = get_source_passage_dependent_features(PASSAGE_ROUGH_TOKENS)

# Word-level features to append, as (function of the word, feature key) pairs.
# They are applied in list order.
FEATURES_TO_ADD = [
    (FEATURE_FN_WORD_LENGTH, 'word_length'),
    (FEATURE_FN_CMU_LENGTH, 'CMU_length'),
    (FEATURE_FN_LTS_1, 'lts_1'),
    (FEATURE_FN_LTS_2, 'lts_2'),
    (FEATURE_FN_LTS_3, 'lts_3'),
    (FEATURE_FN_LTS_4, 'lts_4'),
    (FEATURE_FN_LTS_5, 'lts_5'),
    (FEATURE_FN_LTS_6, 'lts_6'),
    (FEATURE_FN_LTS_7, 'lts_7'),
    (FEATURE_FN_LTS_8, 'lts_8'),
    (FEATURE_FN_LTS_9, 'lts_9'),
    (FEATURE_FN_LTS_10, 'lts_10'),
    (FEATURE_FN_LTS_11, 'lts_11'),
    (FEATURE_FN_LTS_12, 'lts_12'),
    (FEATURE_FN_MORPHS, 'n_morphs'),
    (FEATURE_FN_DECODABLE, 'is_decodable'),
    (FEATURE_FN_SIGHTWORD_PP, 'sightword_pp'),
    (FEATURE_FN_SIGHTWORD_P, 'sightword_p'),
    (FEATURE_FN_SIGHTWORD_1, 'sightword_1')
]

# add_feature updates the list in place and returns that same list, so the
# re-assignment keeps the name bound to the one accumulating object.
for feature_fn, feature_fn_name in FEATURES_TO_ADD:
    PASSAGE_TOKENS_AND_FEATURE_LISTS = add_feature(PASSAGE_TOKENS_AND_FEATURE_LISTS, feature_fn, feature_fn_name)

In [56]:
# Split the (token, features) pairs into two parallel lists, then build a
# frame with one row per token and one column per feature key; rows are
# labelled with the tokens themselves (labels may repeat).
PASSAGE_TOKENS_ONLY = [token for token, _ in PASSAGE_TOKENS_AND_FEATURE_LISTS]
PASSAGE_FEATURES = [features for _, features in PASSAGE_TOKENS_AND_FEATURE_LISTS]
PASSAGE_DF = pd.DataFrame(PASSAGE_FEATURES, index=PASSAGE_TOKENS_ONLY)

In [60]:
# Make sure the destination directory exists before writing: to_csv does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_FILE_PATH) or '.', exist_ok=True)
# Write the matrix as tab-separated text; the token index is the first column.
PASSAGE_DF.to_csv(OUTPUT_FILE_PATH, sep='\t')