# Assignment 2

**Author:** Tingjun Yuan

## Preparation

Import necessary libraries:

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import random

## Task 1

Prepare the model for the given dataset and evaluate it.

### Read the dataset from the files

**NOTE:** The following code assumes that `dataset.zip` from the assignment description page has been extracted into a directory named `dataset` in the notebook's working directory.

In [None]:
def build_data_list(path_prefix):
    data_mapping = {}
    for filename in os.listdir(path_prefix):
        scene_id = filename[:filename.find('.')]
        suffix = filename[filename.find('.')+1:]
        pair = data_mapping.setdefault(scene_id, {})
        match suffix:
            case 'edges':
                edges = pd.read_csv(
                    os.path.join(path_prefix, filename),
                    header=None,
                    names=['target', 'source']
                )
                edges = edges[(edges['target'] != -1) & (edges['source'] != -1)]
                edges['target'] = edges['target'].astype(int)
                edges['source'] = edges['source'].astype(int)
                pair['edges'] = edges
            case 'nodes':
                nodes = pd.read_csv(
                    os.path.join(path_prefix, filename),
                    header=None,
                    names=['node_id', 'current_x', 'current_y', 'previous_x',
                        'previous_y', 'future_x', 'future_y'],
                    na_values='_'
                )
                nodes['node_id'] = nodes['node_id'].astype(int)
                pair['nodes'] = nodes
            case _:
                raise AssertionError(f'Unexpected suffix {suffix}')
    
    # For simplicity, refine the scene id into a 0-based index. We can then store
    # the edges and nodes into a list:
    data_list = [pair for pair in data_mapping.values()]
    return data_list

# Load all scenes from the `dataset` directory (path is relative to the
# current working directory).
data_list = build_data_list(path_prefix="dataset")

In [13]:
# Preview the edge table of the first loaded scene.
data_list[0]['edges']

Unnamed: 0,target,source
0,19585800,19590700
1,19591900,19592201
2,19591900,19595300
3,19591900,19595800
4,19592201,19595300
5,19592201,19595800
6,19592201,20000100
7,19595300,19595800
8,19595800,20000100
9,20000200,20000300


In [14]:
# Preview the node table of the first loaded scene.
data_list[0]['nodes']

Unnamed: 0,node_id,current_x,current_y,previous_x,previous_y,future_x,future_y
0,19502500,40055.0,-16746.0,40071.0,-16621.0,40533.0,-17117.0
1,19585800,25369.0,-15783.0,24661.0,-15352.0,26397.0,-16549.0
2,19590700,25595.0,-15126.0,24826.0,-14669.0,26641.0,-15512.0
3,19591900,19436.0,-12565.0,18317.0,-12234.0,20411.0,-13030.0
4,19592201,21689.0,-13790.0,20970.0,-13362.0,22452.0,-14398.0
5,19592800,9014.0,1389.0,9283.0,1325.0,8961.0,1629.0
6,19595200,32100.0,-17434.0,30479.0,-16935.0,33987.0,-17862.0
7,19595300,19707.0,-12088.0,18260.0,-11553.0,20630.0,-12513.0
8,19595800,20630.0,-13866.0,19906.0,-13056.0,21536.0,-14176.0
9,20000100,23159.0,-13333.0,22106.0,-13541.0,,


### Split the dataset

The scenes are shuffled and split into a 70% training set and a 30% test set.

In [None]:
def split_dataset(data_list, random_seed=1231, train_set_size=0.7):
    """Shuffle the scenes reproducibly and split them into train and test sets.

    Args:
        data_list: Sequence of scenes to split. It is not modified.
        random_seed: Seed for the shuffle; the same seed and input length
            always produce the same split.
        train_set_size: Fraction of the scenes assigned to the training set.
            The training set receives ``int(train_set_size * len(data_list))``
            scenes (rounded down); the rest go to the test set.

    Returns:
        A ``(train_set, test_set)`` tuple of lists containing the elements of
        ``data_list`` in shuffled order.
    """
    # Use a private generator instead of random.seed(): it yields the same
    # permutation for a given seed but does not reseed the process-wide
    # `random` state as a hidden side effect.
    rng = random.Random(random_seed)

    # Step 1: Shuffle scene indices
    shuffled_index = list(range(len(data_list)))
    rng.shuffle(shuffled_index)

    # Step 2: Split into training (default 70%) and test sets
    split_idx = int(train_set_size * len(data_list))
    train_ids = shuffled_index[:split_idx]
    test_ids = shuffled_index[split_idx:]

    # Step 3: Create two new lists for training and test sets
    train_set = [data_list[i] for i in train_ids]
    test_set = [data_list[i] for i in test_ids]

    return (train_set, test_set)

# Split with the default seed and ratio, then report the size of each subset.
train_set, test_set = split_dataset(data_list)

print(f"The train dataset has {len(train_set)} scenes")
print(f"The test dataset has {len(test_set)} scenes")

The train dataset has 135 scenes
The test dataset has 58 scenes


### Prepare the graph data

In [None]:
def prepare_graph(dataset):
    """Placeholder for the graph-preparation step.

    Not implemented yet: the body is empty, so calling this function does
    nothing and returns ``None``.
    """
    
    pass  # TODO