In [1]:
import json
import re
from collections import Counter


# Load the raw transcript dataset. The encoding is given explicitly because
# the transcripts contain non-ASCII characters (phonetic symbols, '‡') and the
# cleaned output is written back as UTF-8; relying on the platform default
# encoding could fail or mis-decode the text on non-UTF-8 locales.
with open('../data/pitt_cookie_dataset.json', 'r', encoding='utf-8') as dataset_file:
    data = json.load(dataset_file)

In [2]:
def clean_string(s):
    """Strip transcription markup from a single utterance string.

    The rewrites run in a fixed order, because later steps operate on the
    output of earlier ones:

    1. drop ``[...]`` and ``(...)`` spans;
    2. drop ``+<``, ``<`` and ``>``;
    3. drop the ``&-`` / ``&+`` prefixes (the word after them is kept) and
       whole ``&=...`` tokens;
    4. collapse ``+//?``, ``+//.``, ``+/.``, ``/.`` and ``+/?`` to the bare
       terminator;
    5. drop ``‡``;
    6. turn ``+...`` into ``.`` and ``+..?`` into ``?``;
    7. drop ``+"`` and ``+,``;
    8. drop every token containing ``@``;
    9. drop ``:``.

    Periods are kept, and no whitespace is normalised, so removed spans can
    leave double spaces behind.

    Parameters
    ----------
    s : str
        Raw utterance text.

    Returns
    -------
    str
        The utterance with the markup above removed.
    """
    # Steps 1 and 3 (regex) with step 2 (literal) in between, in original order.
    for span_pattern in (r'\[.*?\]', r'\(.*?\)'):
        s = re.sub(span_pattern, '', s)

    for marker in ('+<', '<', '>'):
        s = s.replace(marker, '')

    for ampersand_pattern in (r'&[-\+]', r'&=\S*\b\s*'):
        s = re.sub(ampersand_pattern, '', s)

    # Steps 4-7: plain substring rewrites, applied strictly top to bottom.
    literal_rewrites = (
        ('+//?', '?'),
        ('+//.', '.'),
        ('+/.', '.'),
        ('/.', '.'),
        ('+/?', '?'),
        ('‡', ''),
        ('+...', '.'),
        ('+..?', '?'),
        ('+"', ''),
        ('+,', ''),
    )
    for old, new in literal_rewrites:
        s = s.replace(old, new)

    # Step 8: the generic pattern handles tokens made only of word characters
    # around '@'; the explicit alternation lists phonetic forms spelled out
    # individually.
    at_token_patterns = (
        r'\b\w*@\w*\b\s*',
        r'\b(kɜ˞@u|mɪndə˞@u|mɪdə˞@u|mɪdnə˞@u|kɪtʃə˞@u)\b\s*',
    )
    for at_pattern in at_token_patterns:
        s = re.sub(at_pattern, '', s)

    # Step 9.
    return s.replace(':', '')


In [3]:
sentences_list = []

# One record per transcript file: its index, a binary label (1 only for the
# "dementia" group) and the cleaned "PAR" utterances merged into one string.
for key, group in data.items():
    label = 1 if key == "dementia" else 0
    for file_key, file in group.items():
        lines = file['lines']
        index = file['index']

        # Keep only the "PAR" speaker's utterances (presumably the
        # participant - confirm against the dataset description).
        par_utterances = [
            clean_string(line['utterance'])
            for line in lines
            if line['speaker'] == "PAR"
        ]

        # Join with "\n " as the separator token. While the accumulated text
        # is still empty the next utterance replaces it outright, so leading
        # utterances that clean down to '' add no separator.
        result = ''
        for utter in par_utterances:
            result = utter if result == '' else result + "\n " + utter

        sentences_list.append({"index": index, "label": label, "line": result})

In [11]:
# Persist the cleaned records as pretty-printed JSON; ensure_ascii=False keeps
# non-ASCII characters readable instead of \u-escaping them.
with open('../data/clean_v1.json', mode='w', encoding='utf-8') as out_file:
    json.dump(obj=sentences_list, fp=out_file, ensure_ascii=False, indent=2)

## Train Test Split

In [4]:
from Split_data import split_dataset

In [27]:
# Split the cleaned records into train and test partitions. The exact strategy
# (including what specific=True selects) is defined by split_dataset in
# Split_data - see that module.
train, test = split_dataset(file_path = "../data/clean_v1.json", label_column="label", random_state=42, specific=True)

# Index both partitions by the record 'index' and sort them so the saved CSVs
# have a stable row order. The sorted test frame used to be bound to a
# misspelled name (`text`), which left `test` unsorted.
train = train.set_index('index').sort_index()
test = test.set_index('index').sort_index()

# Class balance of each partition.
print(train.label.value_counts())
print(test.label.value_counts())

label
1    250
0    200
Name: count, dtype: int64
label
1    56
0    43
Name: count, dtype: int64


In [28]:
# Write each partition to its own CSV file.
for split_frame, csv_path in (
    (train, "../data/train_v1_450.csv"),
    (test, "../data/test_v1_99.csv"),
):
    split_frame.to_csv(csv_path)

## Data Cleaning Testing

In [10]:
# Spot check: show every record that still has an utterance containing the
# stutter-like run 's s s'. A record is printed once per matching utterance.
for record in sentences_list:
    for utterance in record["line"].split("\n"):
        if 's s s' in utterance:
            print(record, "\n")

{'index': 11, 'label': 1, 'line': "kids are trying to get a s s s s s s . \n it's full of  it's full of uh mistakes . \n it's full o mistakes . \n it's full of mistakes . \n he's changin   taking cookie jar .  \n that's all .  \n the mother's just drying the dishes . \n n n n n n n s s n s xxx from the from xxx . \n this is uh . "} 

{'index': 72, 'label': 1, 'line': " and I will tell you what's g .  \n oh boy .  \n well  the little boy is reaching for a cookie . \n and his s stool is fallin over . \n and the little girl is beggin him to give her one . \n and she's pointin to her mouth . \n she wants to eat it . \n uh their mama is doin the dishes . \n the water's runnin over the sink . \n that's a mess . \n and then she's not even lookin at them . \n dryin dishes .  \n I think she's lookin out the window . \n it's a nice yard out there . \n two cups and  and a dish finished .  \n xxx anything else ?  "} 

{'index': 151, 'label': 1, 'line': "the uh young fellow is s standing on the ste

In [5]:
# Sanity check: one output record per transcript across both groups.
sum(len(data[group_name]) for group_name in ("dementia", "control")) == len(sentences_list)

True

In [7]:
def find_and_count_special_tokens(sentences_list):
    """Tally the special tokens found in each record's 'line' text.

    Parameters
    ----------
    sentences_list : iterable of dict
        Records whose text to scan is stored under the 'line' key.

    Returns
    -------
    dict
        Maps each matched token to its number of occurrences, ordered from
        most to least frequent; tokens with equal counts keep first-seen order.
    """
    # Matches bracketed / parenthesised / angle-bracketed spans, '&-' fillers,
    # '+', ':', ';', '=' (with the rest of the line after it), slashes,
    # newlines, '‡', stray '<' and '&', and a few phonetic '@u' forms.
    special_token_pattern = re.compile(
        r'\[.*?\]|\&-\w+|\+|\:|\=.*|[:;=]|\(\.\)|\(\.\.\)|\/\/|\/|<.*?>|\(.*?\)|\(g\)|\n|‡|<|&|'
        r'\bdʌ@u\b|\bkɪtʃə˞@u\b|\bmɪndə˞@u\b|\bmɪdə˞@u\b|\bmɪdnə˞@u\b'
    )

    # Feed matches straight into the counter, record by record, so tokens are
    # registered in the order they are first encountered.
    token_tally = Counter()
    for record in sentences_list:
        token_tally.update(special_token_pattern.findall(record['line']))

    # most_common() is a stable sort by descending count.
    return dict(token_tally.most_common())



In [8]:
# Count the special tokens that survive cleaning, most frequent first.
sorted_special_token_counts = find_and_count_special_tokens(sentences_list)

print("Frequency of each special token:")
report_lines = [
    f"{token}: {count}"
    for token, count in sorted_special_token_counts.items()
]
for report_line in report_lines:
    print(report_line)

Frequency of each special token:

: 6530
