In [121]:
import os
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [134]:
# Read the label names, one per line, stripping surrounding whitespace.
# convert_class_label indexes this list by class id.
with open('./data/ekman/labels.txt', encoding='utf-8') as file:
    labels = list(map(str.strip, file))
print(labels)

def convert_class_label(label, label_names=None):
    """Map a numeric class id to its label name.

    Args:
        label: Class id as an int or a numeric string (e.g. ``'4'``).
        label_names: Optional sequence of label names indexed by class id.
            When omitted, the notebook-level ``labels`` list is used.

    Returns:
        The matching label name, or ``label`` unchanged when it is not a
        valid non-negative index (for example a comma-separated value such
        as ``'3,4'``, an out-of-range id, a negative id, or ``None``).
    """
    names = labels if label_names is None else label_names
    try:
        index = int(label)
    except (TypeError, ValueError):
        return label  # Not a single integer id: keep it unchanged.
    # Check the range explicitly so a negative id such as '-1' does not
    # silently wrap around to the last label.
    if 0 <= index < len(names):
        return names[index]
    return label
    
# Build the label -> index vocabulary and save it as a .pkl file
def generate_label_vocab(file_path, save_path='./label_vocab.pkl'):
    """Create a ``{label: index}`` mapping from a label file and save it.

    Args:
        file_path: Path to a UTF-8 text file with one label per line. Each
            line is stripped; its zero-based line position becomes its index.
        save_path: Destination passed to ``torch.save``. Defaults to
            ``'./label_vocab.pkl'``.

    Returns:
        None. The mapping is written to ``save_path`` and a confirmation
        message is printed.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        label_names = [line.strip() for line in f]
    label_vocab = {label: idx for idx, label in enumerate(label_names)}
    torch.save(label_vocab, save_path)
    print(f"Label vocabulary saved to '{save_path}'")

['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']


In [136]:
# Build the label -> index vocabulary from the Ekman label file and save it to disk.
generate_label_vocab('./data/ekman/labels.txt')

Label vocabulary saved to './label_vocab.pkl'


In [123]:
# Read each split from its own headerless TSV file; the three columns are
# named Text, Class and ID.
train_data = pd.read_csv("./data/ekman/train.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])
dev_data = pd.read_csv("./data/ekman/dev.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])
test_data = pd.read_csv("./data/ekman/test.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])

In [124]:
def _with_class_list(frame):
    """Add a 'Class List' column to ``frame`` and return it without 'ID'.

    'Class List' holds the 'Class' string split on commas. The column is
    added to ``frame`` itself; the returned frame is the result of dropping
    the 'ID' column.
    """
    frame['Class List'] = frame['Class'].apply(lambda raw: raw.split(','))
    return frame.drop(columns=['ID'])


train_data = _with_class_list(train_data)

dev_data = _with_class_list(dev_data)

test_data = _with_class_list(test_data)

In [125]:
# Text preprocessing
def preprocess_text(text):
    """Normalize raw text to a lowercase, letters-only, single-spaced form.

    Steps, in order: lowercase; remove URLs; remove @mentions and '#'
    signs (the hashtag word itself is kept); remove every character that
    is not an ASCII letter or whitespace; collapse whitespace runs to one
    space and trim both ends.

    Args:
        text: Input string. Non-string values (e.g. ``None`` or a NaN from
            an empty cell) are treated as empty text.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert to lowercase
    # Remove URLs. Anything starting with 'https' is already matched by the
    # 'http' alternative, so no separate 'https' branch is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove whole @mentions and bare '#' signs. The pattern must be '@\w+';
    # '\@w+' would only match '@' followed by literal 'w' characters.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove everything that is not a letter or whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra whitespace
    return text

In [126]:
# Clean the 'Text' column of every split with the same preprocessing function.
for _split in (train_data, dev_data, test_data):
    _split['Text'] = _split['Text'].apply(preprocess_text)

In [127]:
# Apply label conversion to each dataset. The 'Class' column is addressed by
# name rather than by position so the step does not depend on column order.
# Values that convert_class_label cannot read as a single integer id are
# left unchanged.
for _split in (train_data, dev_data, test_data):
    _split['Class'] = _split['Class'].apply(convert_class_label)

In [128]:
# Display the processed training frame (columns: Text, Class, Class List).
train_data

Unnamed: 0,Text,Class,Class List
0,my favourite food is anything i didnt have to ...,neutral,[4]
1,now if he does off himself everyone will think...,neutral,[4]
2,why the fuck is bayless isoing,anger,[0]
3,to make her feel threatened,fear,[2]
4,dirty southern wankers,anger,[0]
...,...,...,...
43405,added you mate well ive just got the bow and i...,joy,[3]
43406,always thought that was funny but is it a refe...,surprise,[6]
43407,what are you talking about anything bad that h...,anger,[0]
43408,more like a baptism with sexy results,joy,[3]


In [129]:
# Save processed data
save_path = 'data/ekman/processed'
# exist_ok=True creates the directory when missing and is a no-op when present,
# so no separate existence check is needed.
os.makedirs(save_path, exist_ok=True)
# Derive every output file from save_path so the directory is named only once.
train_data.to_csv(f'{save_path}/train.tsv', sep='\t', index=False)
dev_data.to_csv(f'{save_path}/dev.tsv', sep='\t', index=False)
test_data.to_csv(f'{save_path}/test.tsv', sep='\t', index=False)

In [130]:
# Hold out 10% of the processed training frame as a validation set; the fixed
# random_state makes the split reproducible.
train_set, val_set = train_test_split(train_data, test_size=0.1, random_state=42)

In [131]:
# MLP model definition
class MLP(nn.Module):
    """Fully connected classifier with batch normalization and dropout.

    Four hidden blocks (Linear -> BatchNorm1d -> ReLU -> Dropout(0.2)) are
    followed by a final Linear layer that produces raw scores (no softmax).
    The hidden widths are ``hidden_size``, ``hidden_size // 2``,
    ``hidden_size // 4`` and ``hidden_size // 8``.

    Args:
        input_size: Number of features per example after flattening.
            Defaults to ``28*28``.
        hidden_size: Width of the first hidden layer; must be at least 8 so
            that the narrowest hidden layer has at least one unit.
            Defaults to 512, giving widths 512/256/128/64.
        output_size: Number of output scores per example. Defaults to 10.

    Raises:
        ValueError: If ``hidden_size`` is smaller than 8.
    """

    def __init__(self, input_size=28*28, hidden_size=512, output_size=10):
        super().__init__()
        if hidden_size < 8:
            raise ValueError(f"hidden_size must be at least 8, got {hidden_size}")
        # Remembered so forward() can flatten inputs to the right width.
        self.input_size = input_size
        width1 = hidden_size
        width2 = hidden_size // 2
        width3 = hidden_size // 4
        width4 = hidden_size // 8
        # Layers are created in the same order as before so that a fixed
        # random seed yields the same initial weights for the default sizes.
        self.fc1 = nn.Linear(input_size, width1)
        self.bn1 = nn.BatchNorm1d(width1)
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(width1, width2)
        self.bn2 = nn.BatchNorm1d(width2)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(width2, width3)
        self.bn3 = nn.BatchNorm1d(width3)
        self.dropout3 = nn.Dropout(0.2)
        self.fc4 = nn.Linear(width3, width4)
        self.bn4 = nn.BatchNorm1d(width4)
        self.dropout4 = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.fc5 = nn.Linear(width4, output_size)

    def forward(self, x):
        """Flatten ``x`` to ``(-1, input_size)`` and return the output scores."""
        # reshape (rather than view) also accepts non-contiguous inputs.
        out = x.reshape(-1, self.input_size)
        out = self.dropout1(self.relu(self.bn1(self.fc1(out))))
        out = self.dropout2(self.relu(self.bn2(self.fc2(out))))
        out = self.dropout3(self.relu(self.bn3(self.fc3(out))))
        out = self.dropout4(self.relu(self.bn4(self.fc4(out))))
        return self.fc5(out)

In [None]:
# Model training function
def train_model(train_data, train_labels, input_size, hidden_size, output_size, epochs=20, batch_size=32, learning_rate=0.001):
    """Train a new ``MLP`` on the given tensors and return it.

    Args:
        train_data: Tensor of input examples (wrapped in a TensorDataset
            together with ``train_labels``).
        train_labels: Tensor of targets passed to ``nn.CrossEntropyLoss``.
        input_size, hidden_size, output_size: Forwarded positionally to the
            ``MLP`` constructor.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size for the shuffled DataLoader.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The trained model. The mean batch loss is printed after each epoch.
    """
    # Wrap the tensors in a dataset and iterate over shuffled mini-batches.
    loader = DataLoader(
        TensorDataset(train_data, train_labels),
        batch_size=batch_size,
        shuffle=True,
    )

    # Model, loss function and optimizer.
    model = MLP(input_size, hidden_size, output_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    num_batches = len(loader)
    for epoch_number in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        for features, targets in loader:
            # Clear old gradients, then forward pass, backward pass, update.
            optimizer.zero_grad()
            loss = criterion(model(features), targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Report the mean batch loss for this epoch.
        print(f"Epoch [{epoch_number}/{epochs}], Loss: {running_loss/num_batches:.4f}")

    print("訓練完成!")
    return model