In [1]:
# Mount Google Drive into the Colab VM so the notebook can read files
# under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
!pip install xlsxwriter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlsxwriter
  Downloading XlsxWriter-3.1.0-py3-none-any.whl (152 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.7/152.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.1.0


In [3]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
def load_dialogue_file(root_path, folder_id, dialogue_id, label_mapping, final_df):
    """Read one dialogue workbook and append its utterances to ``final_df``.

    The workbook ``<root_path>/<folder_id>/<dialogue_id>.xlsx`` is read with
    ``pd.read_excel`` and must contain the columns "語者一" (speaker one),
    "語者二" (speaker two) and "情感" (sentiment label). Each row may hold an
    utterance for at most one of the two speakers.

    Args:
        root_path: Directory that contains the dialogue folders.
        folder_id: Name of the folder holding this dialogue's workbook.
        dialogue_id: Workbook name without the ``.xlsx`` extension; also used
            as the prefix of every generated ``Utterance_ID``.
        label_mapping: Dict mapping raw label strings to sentiment names.
        final_df: DataFrame the new rows are appended to. It is not modified.

    Returns:
        A DataFrame holding the rows of ``final_df`` followed by one row per
        utterance with the keys "Dialogue_ID", "Utterance_ID", "Speaker",
        "Utterance" and "Sentiment" (index reset to 0..n-1). If the workbook
        has no utterances, ``final_df`` itself is returned. "Sentiment" is NaN
        for every row when the workbook has no labels at all.

    Raises:
        ValueError: If a row has utterances for both speakers, or a label is
            not a key of ``label_mapping``.
        AssertionError: If the number of labels differs from the number of
            utterances (partially labelled workbook).
    """
    file_path = os.path.join(root_path, folder_id, f"{dialogue_id}.xlsx")
    # read_excel already returns a DataFrame; no re-wrapping needed.
    excel_df = pd.read_excel(file_path)

    speakers, utterances, labels = [], [], []
    for u1, u2, label in zip(excel_df["語者一"], excel_df["語者二"], excel_df["情感"]):
        # Blank cells arrive as NaN; treat NaN and whitespace-only text alike.
        text1 = "" if pd.isna(u1) else str(u1).strip()
        text2 = "" if pd.isna(u2) else str(u2).strip()
        if text1 and text2:
            raise ValueError("Multiple utterances appeared in the same row!")
        if text1:
            speakers.append("語者一")
            utterances.append(text1)
        if text2:
            speakers.append("語者二")
            utterances.append(text2)

        if not pd.isna(label):
            # A numeric label column that also contains blank cells is read
            # as float, so 1 becomes 1.0 and would never match the key "1".
            if isinstance(label, float) and label.is_integer():
                label = int(label)
            label = str(label).strip()
            if label:
                if label not in label_mapping:
                    raise ValueError(f"Label key Error! Label: {label}")
                labels.append(label)

    # Not yet labeled: keep the utterances and leave every sentiment as NaN.
    if utterances and not labels:
        labels = [np.nan] * len(utterances)
    assert len(utterances) == len(labels), (
        f"{dialogue_id}: {len(utterances)} utterances but {len(labels)} labels"
    )

    records = [
        {
            "Dialogue_ID": dialogue_id,
            "Utterance_ID": f"{dialogue_id}-{str(position).zfill(3)}",
            "Speaker": speaker,
            "Utterance": utterance,
            "Sentiment": np.nan if pd.isna(label) else label_mapping[label],
        }
        for position, (speaker, utterance, label) in enumerate(zip(speakers, utterances, labels))
    ]
    if not records:
        return final_df

    # Build all rows at once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every call.
    new_rows = pd.DataFrame(records, dtype=object)
    if len(final_df) == 0:
        # Nothing to concatenate with; just honour final_df's column order.
        extra_columns = [c for c in new_rows.columns if c not in final_df.columns]
        return new_rows.reindex(columns=[*final_df.columns, *extra_columns])
    return pd.concat([final_df, new_rows], ignore_index=True)

In [5]:
def load_all_dialogue_files(root_path, total_dialogue_count):
    """Load dialogues 0..total_dialogue_count-1 into a single DataFrame.

    Dialogue ``i`` is named ``dialogue-XXXX`` (zero-padded to four digits) and
    is looked up in the folder ``dialogue AAAA-BBBB`` covering its block of
    100 consecutive ids (for example ``dialogue 0100-0199``). Each workbook is
    read through ``load_dialogue_file``.

    Args:
        root_path: Directory that contains the dialogue folders.
        total_dialogue_count: Number of dialogues to load, starting at id 0.
            It does not have to be a multiple of 100; the last folder is then
            only read up to the last requested dialogue.

    Returns:
        A DataFrame with the columns "Dialogue_ID", "Utterance_ID", "Speaker",
        "Utterance" and "Sentiment", in dialogue order with a fresh 0..n-1
        index. It is empty when no dialogue produced any rows.
    """
    columns = ["Dialogue_ID", "Utterance_ID", "Speaker", "Utterance", "Sentiment"]
    label_mapping = {"1": "negative", "2": "neutral", "3": "positive"}
    dialogues_per_folder = 100

    dialogue_frames = []
    for folder_start in tqdm(range(0, total_dialogue_count, dialogues_per_folder)):
        folder_last = folder_start + dialogues_per_folder - 1
        folder_id = f"dialogue {str(folder_start).zfill(4)}-{str(folder_last).zfill(4)}"
        # Clamp the last folder so a count that is not a multiple of 100 does
        # not run past the requested dialogues.
        folder_stop = min(folder_start + dialogues_per_folder, total_dialogue_count)
        for dialogue_index in range(folder_start, folder_stop):
            dialogue_id = f"dialogue-{str(dialogue_index).zfill(4)}"
            # Hand each file a fresh empty frame and join everything once at
            # the end instead of re-copying the accumulated frame per file.
            dialogue_df = load_dialogue_file(
                root_path, folder_id, dialogue_id, label_mapping, pd.DataFrame(columns=columns)
            )
            if len(dialogue_df) > 0:
                dialogue_frames.append(dialogue_df)

    if not dialogue_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(dialogue_frames, ignore_index=True)

In [6]:
# Main: load the first `total_dialogue_count` dialogues found under
# `root_path` on the mounted Drive into `final_df`.
total_dialogue_count = 2000
root_path = "/content/drive/MyDrive/論文資料完整版/dialogues"
final_df = load_all_dialogue_files(
    root_path=root_path,
    total_dialogue_count=total_dialogue_count,
)

100%|██████████| 20/20 [17:51<00:00, 53.55s/it]


In [7]:
# Sanity check: print the (rows, columns) shape, then show the first 20 rows
# as the cell's displayed result.
print(f"{final_df.shape}")
final_df.head(n=20)

(0, 5)


Unnamed: 0,Dialogue_ID,Utterance_ID,Speaker,Utterance,Sentiment
