In [12]:
import json
import re
from collections import Counter


# Load the raw dataset. The encoding is stated explicitly so the non-ASCII
# characters in the transcripts decode identically on every platform, matching
# the explicit UTF-8 used when the cleaned data is written back out.
with open('../data/pitt_cookie_dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [60]:
def clean_string(s):
    """Remove transcription markup from a single utterance.

    The steps run in a fixed order: markers are deleted first, and spacing is
    tidied only at the end so that gaps left by deleted markers are closed.

    Args:
        s: Raw utterance text.

    Returns:
        The utterance without bracketed annotations, angle brackets, "&"
        prefixes, "+/" style markers, "‡", "@"-tagged words, colons and
        parenthesised letters. Parenthesised dots such as "(.)" are left in
        place, and leading/trailing whitespace is not stripped.
    """
    # 1. Remove [...]; the first pass also takes one trailing whitespace
    #    character so the removal does not leave a doubled gap.
    s = re.sub(r'\[.*?\]\s', '', s)
    s = re.sub(r'\[.*?\]', '', s)

    # 2. Remove all occurrences of "<" and ">" (optionally prefixed with "+"),
    #    even if they're next to other characters.
    s = re.sub(r'\+\>\s|\+\<\s|\<\s|\>\s', '', s)
    s = re.sub(r'\+\>|\+\<|\<|\>', '', s)

    # 3. "&-" and "&+" lose only the two-character prefix (the word stays);
    #    "&=" is removed together with the token attached to it.
    s = re.sub(r'&[-\+]', '', s)
    s = re.sub(r'&=\S*\b\s*', '', s)

    # 4. Reduce "+//", "+/" and "/" in front of end punctuation to the
    #    punctuation mark alone.
    s = s.replace('+//?','?').replace('+//.', '.').replace('+/.', '.').replace('/.', '.').replace('+/?', '?')

    # 5. Remove "‡" (only when it is followed by a whitespace character).
    s = re.sub(r'\‡\s', '',s)

    # 6. Replace "+..." with "." and "+..?" with "?"
    s = s.replace('+...', '.').replace('+..?', '?')

    # 7. Remove '+"' and "+," when followed by a space.
    s = s.replace('+\" ', '').replace('+, ','')

    # 8. Remove all words containing "@". The second pattern lists a few "@u"
    #    forms explicitly -- presumably because they contain a character the
    #    generic \w-based pattern does not match; verify before removing it.
    s = re.sub(r'\b\w*@\w*\b\s*', '', s)
    s = re.sub(r'\b(kɜ˞@u|mɪndə˞@u|mɪdə˞@u|mɪdnə˞@u|kɪtʃə˞@u)\b\s*', '', s)

    # 9. Remove all ':'
    s = s.replace(':', '')

    # Remove parenthesised letters such as (g) / (o)
    s = re.sub(r'\([a-zA-Z]+\)', '', s)

    # Drop the whitespace character in front of sentence-final punctuation.
    s = re.sub(r'\s([.?!])', r'\1', s)
    # Collapse every run of spaces to a single space. Two fixed
    # replace("  ", " ") passes only handle runs of up to four spaces.
    s = re.sub(r' {2,}', ' ', s)

    return s


In [62]:
# Build one record per transcript file: the cleaned "PAR" utterances joined
# into a single string, together with the file's index and a binary label.
sentences_list = []

for group_name, group in data.items():
    # Only the "dementia" group is labelled 1; every other key is labelled 0.
    label = 1 if group_name == "dementia" else 0
    for transcript in group.values():
        utterances = transcript['lines']
        index = transcript['index']
        joined = ''
        for entry in utterances:
            if entry['speaker'] != "PAR":
                continue
            cleaned = clean_string(entry['utterance'])
            # While nothing has been collected yet the text is simply replaced;
            # afterwards each utterance is appended behind the "\n " separator.
            joined = cleaned if joined == '' else joined + "\n " + cleaned
        sentences_list.append({"index": index, "label": label, "line": joined})

In [68]:
# Persist the cleaned records as indented JSON, keeping non-ASCII characters
# unescaped in the UTF-8 output file.
with open('../data/clean_v1.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(sentences_list, ensure_ascii=False, indent=2))

## Train Test Split

In [16]:
from Split_data import split_dataset

In [28]:
# Split the cleaned records, then index and order both parts by 'index'.
# NOTE(review): split_dataset is a project helper; the calls below assume it
# returns two pandas DataFrames -- confirm against Split_data.
train, test = split_dataset(file_path = "../data/clean_v1.json", label_column="label", random_state=42, specific=True)
train = train.set_index('index').sort_index()
# The sorted frame must be bound back to `test`; binding it to any other name
# leaves `test` unsorted when it is printed and saved.
test = test.set_index('index').sort_index()
print(train.label.value_counts())
print(test.label.value_counts())

label
1    200
0    200
Name: count, dtype: int64
label
1    106
0     43
Name: count, dtype: int64


In [29]:
# Write both splits to CSV, train first and then test.
for frame, csv_path in (
    (train, "../data/train_v1_400.csv"),
    (test, "../data/test_v1_149.csv"),
):
    frame.to_csv(csv_path)

## Data Cleaning Testing

In [67]:
# Show every record whose cleaned text still contains the "(.)" marker.
# Each record is printed once, however many of its lines contain the marker,
# so the same record is not dumped repeatedly.
for record in sentences_list:
    if '(.)' in record["line"]:
        print(record, "\n")

{'index': 2, 'label': 1, 'line': "here's a cookie jar. \n and the lid is off the cookie jar. \n the boy is about to come down on the floor. \n and the girl. \n I don't know that much about girls. \n but anyway uh the housewife is in the kitchen. \n and the (.) the sink is overflowing. \n and (..) the girl may be saying say “I told you so” or something like that. \n and uh I guess this must be the wife although it might not be because maybe that was an apron and um maybe this was the um. \n there is(.) this this should be a window. \n there. \n the upper one is there. \n uh it um has a pathway. \n uh we don't know where the pathway is going to because we can't see it down below. \n and looks like there's some flowers in the bushes or something like that. \n and here are some draperies. \n well this one in on this side is. \n I don't see it, the other side, but I su suspect it it would be the same thing there. \n and uh the I guess I said that the the water was coming down on the floor. 

In [19]:
# Sanity check: one record exists for every file in the two groups.
expected_total = sum(len(data[group_name]) for group_name in ("dementia", "control"))
len(sentences_list) == expected_total

True

In [20]:
def find_and_count_special_tokens(sentences_list):
    """Tally the special tokens found in the 'line' text of each record.

    Args:
        sentences_list: Iterable of dicts, each holding its text under 'line'.

    Returns:
        A dict mapping each matched token to its number of occurrences,
        ordered from most to least frequent (ties keep first-seen order).
    """
    # Alternation covering bracketed and parenthesised spans, "&-" words, "+",
    # ":", "=" runs, slashes, angle brackets, newlines, "‡", "&" and a handful
    # of "@u" phonetic forms.
    special_token_pattern = re.compile(
        r'\[.*?\]|\&-\w+|\+|\:|\=.*|[:;=]|\(\.\)|\(\.\.\)|\/\/|\/|<.*?>|\(.*?\)|\(g\)|\n|‡|<|&|'
        r'\bdʌ@u\b|\bkɪtʃə˞@u\b|\bmɪndə˞@u\b|\bmɪdə˞@u\b|\bmɪdnə˞@u\b'
    )

    # Feed each record's matches into the counter in encounter order.
    token_counts = Counter()
    for sentence in sentences_list:
        token_counts.update(special_token_pattern.findall(sentence['line']))

    # most_common() sorts by count, descending, and is stable for equal counts.
    return dict(token_counts.most_common())



In [21]:
# Count the special tokens that remain in the cleaned records and list them
# from most to least frequent.
sorted_special_token_counts = find_and_count_special_tokens(sentences_list)

print("Frequency of each special token:")
for token, count in sorted_special_token_counts.items():
    # repr() keeps whitespace tokens (e.g. the newline separator) readable
    # instead of letting them break the output across lines.
    print(f"{token!r}: {count}")

Frequency of each special token:

: 6530
(.): 495
(..): 197
(...): 48
