In [1]:
# downloading the dataset and the Glove word embeddings
! brew install wget
! wget https://github.com/matthen/dstc/releases/download/v1/dstc2_traindev.tar.gz
! wget https://github.com/matthen/dstc/releases/download/v1/dstc2_test.tar.gz
! wget https://nlp.stanford.edu/data/glove.6B.zip

[34m==>[0m [1mDownloading https://formulae.brew.sh/api/formula.jws.json[0m
######################################################################### 100.0%
[34m==>[0m [1mDownloading https://formulae.brew.sh/api/cask.jws.json[0m
######################################################################### 100.0%
To reinstall 1.24.5, run:
  brew reinstall wget
--2024-06-05 23:46:28--  https://github.com/matthen/dstc/releases/download/v1/dstc2_traindev.tar.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/295903958/9cf74180-f80b-11ea-997a-a3b76087e723?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240605%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240605T214628Z&X-Amz-Expires=300&X-Amz-Signature=fab4b2ca4ddb366daefc16373f06c9f215d2c2f86b294933629d3

In [2]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


## Step 1: Download and Load Data

In [3]:
# loading the train and test data from the tar.gz files
import tarfile

train_tar_path = 'dstc2_traindev.tar.gz'
test_tar_path = 'dstc2_test.tar.gz'

# Extracting the tar.gz files to the dstc2_traindev and dstc2_test directories
with tarfile.open(train_tar_path, 'r:gz') as tar:
    tar.extractall(path='dstc2_traindev')

with tarfile.open(test_tar_path, 'r:gz') as tar:
    tar.extractall(path='dstc2_test')


In [4]:
# Load JSON files
def load_json(filepath):
    with open(filepath, 'r') as f:
        return json.load(f)

def get_highest_score_asr(asr_hyps):
    return max(asr_hyps, key=lambda hyp: hyp['score'])

def extract_data(label, log):
    extracted_data = []
    for turn in log['turns']:
        highest_score_asr = get_highest_score_asr(turn['input']['live']['asr-hyps'])
        dialog_act_text = highest_score_asr['asr-hyp']
        turn_index = turn['turn-index']
        corresponding_label_turn = next(t for t in label['turns'] if t['turn-index'] == turn_index)
        dialog_act_label = corresponding_label_turn['semantics']['cam']
        
        extracted_data.append({
            "dialog_act_text": dialog_act_text,
            "dialog_act_label": dialog_act_label
        })
    return extracted_data

def process_all_files(base_dir):
    data = []
    for root, dirs, files in os.walk(base_dir):
        if 'label.json' in files and 'log.json' in files:
            label_path = os.path.join(root, 'label.json')
            log_path = os.path.join(root, 'log.json')
            label = load_json(label_path)
            log = load_json(log_path)
            session_data = extract_data(label, log)
            data.extend(session_data)
    return data

base_train_dir = 'dstc2_traindev/data'
base_test_dir = 'dstc2_test/data'

train_data = process_all_files(base_train_dir)
test_data = process_all_files(base_test_dir)

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# save the train and test dfs to a csv file
train_df.to_csv('train_data.csv', index=True)
test_df.to_csv('test_data.csv', index=True)


In [5]:
# unzip the glove embeddings file
import zipfile
import numpy as np

with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
    zip_ref.extractall('glove.6B')

In [6]:
# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_file_path = 'glove.6B/glove.6B.100d.txt'
word_embeddings = load_glove_embeddings(glove_file_path)


## Step 2: Implement Dataset Class

In [7]:
# class TextDataset(Dataset):
#     def __init__(self, text, labels, embeddings):
#         self.text = text
#         self.labels = labels
#         self.embeddings = embeddings
#         self.dim = len(next(iter(embeddings.values())))  # Dimension of embeddings
#         self.label_to_int = {label: i for i, label in enumerate(set(labels))}  # Create mapping from labels to integers
                
#     def __len__(self):
#         return len(self.labels)

#     def __getitem__(self, idx):
#         sentence = self.text[idx]
#         embedding_vectors = [self.embeddings[word] for word in sentence.split() if word in self.embeddings]
#         if embedding_vectors:
#             sentence_embedding = np.mean(embedding_vectors, axis=0)
#         else:
#             # If none of the words in the sentence are in the embeddings, return zeros
#             sentence_embedding = np.zeros(self.dim)
#         label = self.label_to_int[self.labels[idx]] / len(self.label_to_int)  # Normalize label to be between 0 and 1
#         print(f"Label: {label}")
#         return torch.tensor(sentence_embedding, dtype=torch.float), torch.tensor(label, dtype=torch.float)

class TextDataset(Dataset):
    def __init__(self, text, labels, embeddings, dim):
        self.text = text
        self.labels = labels
        self.embeddings = embeddings
        self.dim = dim
                
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sentence = self.text[idx]
        embedding_vectors = [self.embeddings[word.lower()] for word in sentence.split() if word.lower() in self.embeddings]
        if embedding_vectors:
            sentence_embedding = np.mean(embedding_vectors, axis=0)
        else:
            sentence_embedding = np.zeros(self.dim)
        label = 1 if self.labels[idx] == "inform" else 0  # Convert labels to binary
        return torch.tensor(sentence_embedding, dtype=torch.float), torch.tensor(label, dtype=torch.float)


dim = len(next(iter(word_embeddings.values())))
train_dataset = TextDataset(train_df['dialog_act_text'], train_df['dialog_act_label'], word_embeddings, dim)
test_dataset = TextDataset(test_df['dialog_act_text'], test_df['dialog_act_label'], word_embeddings, dim)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [8]:
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of test examples: {len(test_dataset)}")
print(f"Embedding dimension: {dim}")
print("Number of train dataloader batches:", len(train_dataloader))
print("Number of test dataloader batches:", len(test_dataloader))
print(train_dataset.__getitem__(0))

Number of training examples: 15611
Number of test examples: 9890
Embedding dimension: 100
Number of train dataloader batches: 488
Number of test dataloader batches: 310
(tensor([-0.1284,  0.2365,  0.0348, -0.2114, -0.0207,  0.4156,  0.0400,  0.0508,
         0.2099,  0.1153, -0.2247, -0.0582,  0.2855,  0.1336,  0.1801,  0.1997,
         0.1618, -0.1618, -0.6136,  0.1638,  0.2289,  0.2042,  0.0741, -0.2789,
         0.1049,  0.3242, -0.2329, -0.4341, -0.3022,  0.0707, -0.2982,  0.4931,
         0.0655,  0.0609,  0.1835,  0.8005, -0.0021,  0.3045, -0.3090, -0.5727,
         0.0290, -0.0228, -0.0842, -0.6062,  0.4922,  0.1210, -0.7145, -0.1770,
         0.1424, -0.2483, -0.1107, -0.1112, -0.3060,  0.1717, -0.4653, -1.3783,
        -0.0728, -0.0912,  1.5972,  0.0706, -0.1195,  0.1731,  0.0717, -0.2333,
         0.5830, -0.1154,  0.2160, -0.4106,  0.1289, -0.4419, -0.3996, -0.1192,
         0.0616,  0.1092, -0.3175,  0.0700, -0.3186, -0.2148, -0.6008, -0.0552,
         0.2669, -0.0161, -0.0

## Step 3

In [9]:
# Defining the model
class NeuralNet(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        pred = model(X.float())
        loss = loss_fn(pred, y.float().unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X.float())
            test_loss += loss_fn(pred, y.float().unsqueeze(1)).item()
            correct += ((pred > 0.5) == y.unsqueeze(1)).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [10]:
input_dim = 100
model = NeuralNet(input_dim)
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")


Epoch 1
-------------------------------
loss: 0.688425  [    0/15611]
loss: 0.009006  [ 3200/15611]
loss: 0.000002  [ 6400/15611]
loss: 0.004066  [ 9600/15611]
loss: 0.000000  [12800/15611]
Test Error: 
 Accuracy: 100.0%, Avg loss: 0.000128 

Epoch 2
-------------------------------
loss: 0.000000  [    0/15611]
loss: 0.000002  [ 3200/15611]
loss: 0.000141  [ 6400/15611]
loss: 0.000105  [ 9600/15611]
loss: 0.000000  [12800/15611]
Test Error: 
 Accuracy: 100.0%, Avg loss: 0.000020 

Epoch 3
-------------------------------
loss: 0.000117  [    0/15611]
loss: 0.000000  [ 3200/15611]
loss: 0.000000  [ 6400/15611]
loss: 0.000030  [ 9600/15611]
loss: 0.000000  [12800/15611]
Test Error: 
 Accuracy: 100.0%, Avg loss: 0.000007 

Epoch 4
-------------------------------
loss: 0.000064  [    0/15611]
loss: 0.000000  [ 3200/15611]
loss: 0.000000  [ 6400/15611]
loss: 0.000000  [ 9600/15611]
loss: 0.000012  [12800/15611]
Test Error: 
 Accuracy: 100.0%, Avg loss: 0.000004 

Epoch 5
--------------------

In [11]:
# predict on the test data
model.eval()
predictions = []
with torch.no_grad():
    for X, y in test_dataloader:
        pred = model(X)
        predictions.extend(pred)

predictions = [1 if p > 0.5 else 0 for p in predictions]
test_df['predictions'] = predictions
test_df.to_csv('test_data_with_predictions.csv', index=True)
print("Predictions saved to test_data_with_predictions.csv")

Predictions saved to test_data_with_predictions.csv


In [12]:
# removing the downloaded files so that we can upload to the github repo
! rm -rf dstc2_traindev
! rm -rf dstc2_test
! rm dstc2_traindev.tar.gz
! rm dstc2_test.tar.gz
! rm glove.6B.zip
! rm -rf glove.6B
! rm train_data.csv
! rm test_data.csv
! rm test_data_with_predictions.csv