In [3]:
import json
import re
from collections import Counter


with open('../data/pitt_cookie_dataset.json', 'r') as file:
    data = json.load(file)

In [4]:
def clean_string(s):
    # 1. Remove [...] and (...)
    s = re.sub(r'\[.*?\]', '', s)
    s = re.sub(r'\(.*?\)', '', s)


    # 2. Remove all occurrences of "<" and ">", even if they're next to other characters
    s = s.replace('+<','').replace('<', '').replace('>', '')

    # 3. Remove "&=", "&-", "&+"
    s = re.sub(r'&[-\+]', '', s)
    s = re.sub(r'&=\S*\b\s*', '', s)


    # 4. Remove "+/" and "+//"
    s = s.replace('+//?','?').replace('+//.', '.').replace('+/.', '.').replace('/.', '.').replace('+/?', '?')

    # 5. Remove "‡"
    s = s.replace('‡', '')

    # 6. Replace "+..." with "." and "+..?" with "?"
    s = s.replace('+...', '.').replace('+..?', '?')

    # 7. Remove "+\"" and "+,"
    s = s.replace('+"', '').replace('+,','')

    # 8. Remove all phrases containing "@"
    s = re.sub(r'\b\w*@\w*\b\s*', '', s)
    s = re.sub(r'\b(kɜ˞@u|mɪndə˞@u|mɪdə˞@u|mɪdnə˞@u|kɪtʃə˞@u)\b\s*', '', s)
    
    # 9. Remove all ':'
    s = s.replace(':', '')

    # Remove period
    # s = re.sub(r'\.', '', s)

    return s


In [171]:
# data.keys()


In [5]:
sentences_list = []

for key in list(data.keys()):
    # print(key)
    if key == "dementia":
        label = 1
    else:
        label = 0
    group = data[key]
    for file_key in data[key]:
        # print(file_key)
        file = group[file_key]
        result = ''
        lines = file['lines']
        for line in lines:
            if line['speaker'] == "PAR":
                utter = clean_string(line['utterance'])
                # utter = line['utterance']
                if result == '':
                    result = utter
                      #first line in a file
                else:
                    result = result + "\n " + utter #separator token
        # print(result)
        sentences_list.append({"line":result, "label": label})
# sentences_list

In [197]:
# for i in sentences_list:
#     a = i["line"].split("\n")
#     for line in a:
#         if 's s s' in line:
#             print(i, "\n")

{'line': "(.) what do ?  \n mhm  mhm .  \n do you want them crossed out or just say it out loud ?  \n  okay  uh the little boy's climbing the cri cookie jar . \n uh he has  the little boy has  has cookies  a cookie in his hand that he got by climbing the step ladder which is ready to fall . \n the sister is asking for uh something to eat . \n she has started little and wants some more . \n uh let's see now .  \n the mother  the mother has s s a small mess in the kitchen lucky it's  lucky it's small . \n uh the mother is now washing and dr no yeah  she's washing and drying the dishes in the kitchen . \n her water has spilled over terribly bad . \n and looks like sister's back to try for some more cookies . \n okay .  ", 'label': 1} 

{'line': "well  the boy is in the cookie jar . \n and the  da and his sister is trying to uh take it from him . \n and he's uh on a stool going like this . \n and uh then there's uh l uh s a female over here with a plate in her hand that she's dryin(g) dish

In [6]:
len(sentences_list) == len(data["dementia"]) + len(data["control"])

True

In [7]:
def find_and_count_special_tokens(sentences_list):
    # special_tokens_dict = []
    all_special_tokens = []
    # Updated regex pattern to match phonetic symbols, <, &, ‡, phrases between (), and other special tokens
    special_token_pattern = re.compile(
        r'\[.*?\]|\&-\w+|\+|\:|\=.*|[:;=]|\(\.\)|\(\.\.\)|\/\/|\/|<.*?>|\(.*?\)|\(g\)|\n|‡|<|&|'
        r'\bdʌ@u\b|\bkɪtʃə˞@u\b|\bmɪndə˞@u\b|\bmɪdə˞@u\b|\bmɪdnə˞@u\b'
    )
    
    for sentence in sentences_list:
        line = sentence['line']
        
        # Find all special tokens
        special_tokens = re.findall(special_token_pattern, line)
        all_special_tokens.extend(special_tokens)  # Collect all tokens for frequency counting
        
        
        # Append results to list
        # special_tokens_dict.append({
        #     'original_line': line,
        #     'special_tokens': special_tokens,
        #     'cleaned_line': cleaned_line.strip()  # Strip extra whitespace
        # })
    
    # Count frequency of each special token
    special_token_counts = Counter(all_special_tokens)
    sorted_special_token_counts = dict(sorted(special_token_counts.items(), key=lambda item: item[1], reverse=True))
    
    return sorted_special_token_counts



In [8]:
sorted_special_token_counts = find_and_count_special_tokens(sentences_list)

print("Frequency of each special token:")
for token, count in sorted_special_token_counts.items():
    print(f"{token}: {count}")

Frequency of each special token:

: 6530


In [45]:
# Output the results
# for item in special_tokens:
#     print(f"Original Line: {item['original_line']}")
#     print(f"Special Tokens: {item['special_tokens']}")
#     print(f"Cleaned Line: {item['cleaned_line']}\n")

In [24]:
with open('../data/cleaned_second_try.json', 'w', encoding='utf-8') as f:
    json.dump(sentences_list, f, ensure_ascii=False, indent=2)