In [92]:
import pandas as pd
pd.set_option("display.max_colwidth", None)


# Load the three dataset splits; the names are bound in the same order as the paths.
test, train, valid = (
    pd.read_csv(csv_path)
    for csv_path in ('data/test.csv', 'data/train.csv', 'data/validation.csv')
)

In [93]:
# Stack the three splits, keep the first occurrence of every distinct dialog
# string, and retain only the 'dialog' text by dropping 'act' and 'emotion'.
df = (
    pd.concat([train, valid, test], ignore_index=True)
    .drop_duplicates(subset=['dialog'])
    .drop(columns=['act', 'emotion'])
)

In [94]:
# Sanity check: (rows, columns) remaining after de-duplication and dropping 'act'/'emotion'.
df.shape

(12376, 1)

In [95]:
import ast

def clean(text):
    """Return ``text`` converted to lower case.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased copy of ``text``.
    """
    return text.lower()

# Lower-case every entry of the 'dialog' column, overwriting the column in place.
df['dialog'] = df['dialog'].apply(clean)

# Exploratory probe of the first row.
# NOTE(review): the raw strings shown in the df.head() output have no commas
# between the quoted pieces, so literal_eval presumably merges the adjacent
# string literals into a single element rather than one per turn — confirm.
# Neither `raw` nor `turns` is read again in the visible cells.
raw = df["dialog"].iloc[0]
turns = ast.literal_eval(raw)



In [96]:
# Preview the first rows of the frame (the parsing of 'dialog' happens in a later cell).
df.head()

Unnamed: 0,dialog
0,"['say , jim , how about going for a few beers after dinner ? '\n ' you know that is tempting but is really not good for our fitness . '\n ' what do you mean ? it will help us to relax . '\n "" do you really think so ? i don't . it will just make us fat and act silly . remember last time ? ""\n "" i guess you are right.but what shall we do ? i don't feel like sitting at home . ""\n ' i suggest a walk over to the gym where we can play singsong and meet some of our friends . '\n "" that's a good idea . i hear mary and sally often go there to play pingpong.perhaps we can make a foursome with them . ""\n ' sounds great to me ! if they are willing , we could ask them to go dancing with us.that is excellent exercise and fun , too . '\n "" good.let ' s go now . "" ' all right . ']"
1,"['can you do push-ups ? '\n "" of course i can . it's a piece of cake ! believe it or not , i can do 30 push-ups a minute . ""\n "" really ? i think that's impossible ! "" ' you mean 30 push-ups ? '\n ' yeah ! '\n "" it's easy . if you do exercise everyday , you can make it , too . ""]"
2,"['can you study with the radio on ? '\n ' no , i listen to background music . ' ' what is the difference ? '\n ' the radio has too many comerials . '\n "" that's true , but then you have to buy a record player . ""]"
3,"['are you all right ? '\n ' i will be all right soon . i was terrified when i watched them fall from the wire . '\n "" don't worry.he is an acrobat 。 "" ' i see . ']"
4,"['hey john , nice skates . are they new ? '\n ' yeah , i just got them . i started playing ice hockey in a community league . so , i finally got myself new skates . '\n ' what position do you play ? '\n ' i ’ m a defender . it ’ s a lot of fun . you don ’ t have to be able to skate as fast on defense . '\n ' yeah , you ’ re a pretty big guy . i play goalie , myself . '\n ' oh , yeah ? which team ? ' ' the rockets . '\n ' really ? i think we play you guys next week . well , i have to go to practice . see you later . '\n ' all right , see you later . ']"


In [97]:
import re
def split_broken_turns(text):
    """Split a line that still contains several quoted turns into single turns.

    Some lines of the raw dialog string hold more than one turn, separated only
    by a closing quote, whitespace and an opening quote (for example
    ``... impossible ! " ' you mean ...``). Every such boundary is treated as a
    turn separator, whichever mix of single and double quotes surrounds it.
    This includes the double-quote/double-quote case, which arises when two
    neighbouring turns both contain an apostrophe.

    Args:
        text: One line of the raw dialog string with its outermost quotes
            already removed.

    Returns:
        A list of the non-empty turns found in ``text``, each stripped of
        leading and trailing whitespace.
    """
    # Boundary = closing quote, whitespace, opening quote (any quote style on either side).
    parts = re.split(r"[\"']\s+[\"']", text)
    stripped = (p.strip() for p in parts)
    return [s for s in stripped if s]

def parse_dialog(raw):
    """Turn one raw dialog string into a list of tidied-up turns.

    The raw value looks like a bracketed list whose quoted turns are separated
    by newlines or, within a line, only by whitespace. Each turn is extracted
    and its spacing around punctuation and apostrophes is normalised.

    Args:
        raw: The raw dialog string. A list or tuple of turns is also accepted
            and returned as a list unchanged, so the cell that applies this
            function can be re-run on a column that was already parsed
            without raising ``AttributeError``.

    Returns:
        A list of non-empty turn strings in their original order.
    """
    # Already parsed: pass the turns through untouched.
    if isinstance(raw, (list, tuple)):
        return list(raw)

    raw = raw.strip()
    # Drop the enclosing list brackets, if present.
    if raw.startswith("[") and raw.endswith("]"):
        raw = raw[1:-1]

    turns = []
    for p in raw.split("\n"):
        # Remove the quotes that wrap the line as a whole.
        p = p.strip().strip("'").strip('"')
        # A single line may still hold several turns.
        for t in split_broken_turns(p):
            # "dinner ?" -> "dinner?": no whitespace before punctuation.
            t = re.sub(r"\s+([.,!?;:])", r"\1", t)
            # "right.but" -> "right. but": space after sentence punctuation glued to a letter.
            t = re.sub(r"([.!?])([A-Za-z])", r"\1 \2", t)
            # "i ’ m" -> "i'm": typographic apostrophe (and its padding) becomes a plain one.
            t = re.sub(r"\s*’\s*", "'", t)
            # "let ' s" -> "let's": rejoin words split around a spaced-out apostrophe.
            t = re.sub(r"\b(\w+)\s+'\s+(\w+)\b", r"\1'\2", t)
            t = t.strip()
            if t:
                turns.append(t)
    return turns

# Overwrites 'dialog' in place: after this cell each entry is the list of turns
# returned by parse_dialog rather than the raw string.
df["dialog"] = df["dialog"].apply(parse_dialog)

In [98]:
# Render each parsed conversation as tagged lines: turns at even positions get
# the "[user]" tag, turns at odd positions the "[bot]" tag, and every
# conversation is closed with an "[end]" line.
dialog = []
for conversation in df["dialog"]:
    pieces = []
    for position, spoken in enumerate(conversation):
        utterance = clean(spoken)
        is_user_turn = position % 2 == 0
        pieces.append(f"[user] {utterance}\n" if is_user_turn else f"[bot] {utterance}\n")
    pieces.append("[end]\n")
    dialog.append("".join(pieces))

In [99]:
# Spot-check the first formatted conversation.
dialog[0]

"[user] say, jim, how about going for a few beers after dinner?\n[bot] you know that is tempting but is really not good for our fitness.\n[user] what do you mean? it will help us to relax.\n[bot] do you really think so? i don't. it will just make us fat and act silly. remember last time?\n[user] i guess you are right. but what shall we do? i don't feel like sitting at home.\n[bot] i suggest a walk over to the gym where we can play singsong and meet some of our friends.\n[user] that's a good idea. i hear mary and sally often go there to play pingpong. perhaps we can make a foursome with them.\n[bot] sounds great to me! if they are willing, we could ask them to go dancing with us. that is excellent exercise and fun, too.\n[user] good. let's go now.\n[bot] all right.\n[end]\n"

In [100]:
# Persist every formatted conversation back-to-back in one UTF-8 text file.
with open("data/processed_dialogs.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(dialog)