## Setup

### Import modules and initialize parameters for this notebook

In [None]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role, image_uris
import time
import logging
import boto3
import os
import json
# import matplotlib.pyplot as plt
# import matplotlib.image as mpimg
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
#%matplotlib inline

In [None]:
# IAM role the notebook runs under; handed to the training job further down.
role = get_execution_role()
# A single SageMaker session is shared by the rest of the notebook.
sess = sagemaker.Session()
sm = boto3.client("sagemaker")
s3_resource = boto3.resource('s3')

# Read the region from the session created above instead of constructing a second Session.
region_name = sess.boto_region_name

### Initialize bucket names and load models
*default_bucket* is the name of the bucket where the data is stored. Requires read permissions.

*output_bucket* is the name of the bucket where the model will be stored after training. Requires read/write permissions.

In [None]:
# To use a shared team bucket instead of the session default, set both names explicitly:
# default_bucket = "summer-team-bucket"
# output_bucket = "summer-team-bucket"

# Prefix used for job names and S3 keys created by this notebook.
base_job_prefix = "cloudd-rf"

# Input data is read from the session's default bucket; trained models are written to the same one.
default_bucket = sess.default_bucket()
output_bucket = default_bucket

In [None]:
%%writefile ../code/team1_model.py

import torch
import torch.nn as nn
import torch.nn.functional as F

class Team1Model(nn.Module):
    """CNN classifier: three conv + max-pool stages followed by four linear layers.

    ``forward`` returns the raw output of the last linear layer (width
    ``num_classes``); no activation is applied to it.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes

        # Convolutional stages; the single pooling layer is reused after each of them.
        self.conv1 = nn.Conv2d(1, 16, kernel_size=(2, 16))
        self.pool = nn.MaxPool2d(kernel_size=(1, 2), stride=(1, 2))
        self.conv2 = nn.Conv2d(16, 64, kernel_size=(1, 8))
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(1, 8))

        # fc1 infers its input width on the first forward pass
        # (an earlier revision hard-coded in_features=15360).
        self.fc1 = nn.LazyLinear(1200)
        self.fc2 = nn.Linear(1200, 100)
        self.fc3 = nn.Linear(100, 65)
        self.fc4 = nn.Linear(65, self.num_classes)

    def forward(self, x):
        # conv -> ReLU -> pool, three times.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        # Flatten everything except the batch dimension.
        x = x.reshape(x.shape[0], -1)

        # Hidden linear layers use ReLU; the output layer does not.
        for dense in (self.fc1, self.fc2, self.fc3):
            x = F.relu(dense(x))
        return self.fc4(x)

In [None]:
%%writefile ../code/team2_model.py

import torch
import torch.nn as nn
import torch.nn.functional as F

class Team2Model(nn.Module):
    """1-D CNN classifier: four Conv1d/ReLU/MaxPool1d/BatchNorm1d stages, then two linear layers.

    ``forward`` squeezes dimension 1 of the input before the convolutions and
    returns the raw output of the last linear layer (no softmax is applied).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Channel widths through the conv stack; each consecutive pair is the
        # (in_channels, out_channels) of one stage.
        channel_plan = (2, 128, 256, 512, 1024)
        stage_layers = []
        for in_ch, out_ch in zip(channel_plan[:-1], channel_plan[1:]):
            stage_layers += [
                nn.Conv1d(in_ch, out_ch, 8, dtype=torch.float32),
                nn.ReLU(),
                nn.MaxPool1d(4),
                nn.BatchNorm1d(out_ch),
            ]
        self.conv_layers = nn.Sequential(*stage_layers)

        # fc1 infers its input width on the first forward pass.
        self.fc1 = nn.LazyLinear(512, dtype=torch.float32)
        self.fc2 = nn.Linear(512, num_classes, dtype=torch.float32)

    def forward(self, x):
        features = x.squeeze(1)
        for stage in self.conv_layers:
            features = stage(features)
        hidden = F.relu(self.fc1(features.flatten(1)))
        # The output layer is returned without ReLU or softmax.
        return self.fc2(hidden)

In [None]:
%%writefile ../code/team3_model.py

import torch
import torch.nn as nn
import torch.nn.functional as F

# From headley_modrec.py
class Team3Model(nn.Module):
    """CNN classifier: three tanh-activated conv layers followed by four linear layers.

    ``forward`` returns the raw output of the last linear layer (width
    ``num_classes``); no activation is applied to it.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes

        self.conv1 = nn.Conv2d(1, 16, (2, 16))
        self.conv2 = nn.Conv2d(16, 8, (1, 8))
        self.conv3 = nn.Conv2d(8, 4, (1, 4))
        # fc1 infers its input width on the first forward pass
        # (an earlier revision hard-coded nn.Linear(3996, 512)).
        self.fc1 = nn.LazyLinear(out_features=512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, self.num_classes)

        # Not read or written anywhere in this class; kept so external code
        # that expects the attribute keeps working.
        # NOTE(review): presumably a store for hooked activations - confirm.
        self.activation = {}

    def forward(self, x):
        # torch.tanh replaces F.tanh, which is a deprecated alias that warns
        # on every call in PyTorch 1.x releases.
        x = torch.tanh(self.conv1(x))
        x = torch.tanh(self.conv2(x))
        x = torch.tanh(self.conv3(x))
        # Flatten everything except the batch dimension.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [None]:
%%writefile ../code/team4_model.py

import torch
import torch.nn as nn
import torch.nn.functional as F

class Team4Model(nn.Module):
    """CNN classifier: four conv/ReLU/max-pool stages, then a dropout-regularised MLP head.

    ``forward`` returns the raw output of the last linear layer (width
    ``num_classes``); no activation is applied to it.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Stage 1
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.re1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d((1, 2), stride=(1, 2))

        # Stage 2
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.re2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d((1, 2), stride=(1, 2))

        # Stage 3
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.re3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d((1, 2), stride=(1, 2))

        # Stage 4
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.re4 = nn.ReLU()
        self.pool4 = nn.MaxPool2d((1, 2), stride=(1, 2))

        # Classifier head. fc1 infers its input width on the first forward
        # pass (an earlier revision hard-coded nn.Linear(128 * 256, 512)).
        self.flat = nn.Flatten()
        self.drop1 = nn.Dropout(p=0.5)
        self.fc1 = nn.LazyLinear(512)

        self.re5 = nn.ReLU()
        self.drop2 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(512, 256)
        self.re6 = nn.ReLU()
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        conv_stages = (
            (self.conv1, self.re1, self.pool1),
            (self.conv2, self.re2, self.pool2),
            (self.conv3, self.re3, self.pool3),
            (self.conv4, self.re4, self.pool4),
        )
        for conv, activation, pool in conv_stages:
            x = pool(activation(conv(x)))

        # Dropout is applied before fc1 and again before fc2.
        x = self.drop1(self.flat(x))
        x = self.drop2(self.re5(self.fc1(x)))
        x = self.re6(self.fc2(x))
        return self.fc3(x)

## Build a SageMaker Training Job

### Add training script to source directory

In [3]:
%%writefile ../code/train.py

import argparse
import os
import glob
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

from tqdm import tqdm

import warnings

from team1_model import Team1Model
from team2_model import Team2Model
from team3_model import Team3Model
from team4_model import Team4Model

# NOTE(review): this `with` block has no lasting effect. The "ignore" filter is
# installed inside the context manager and the previous filters are restored as
# soon as the block exits, so warnings raised later in the script are not
# suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
# Number of IQ samples per observation that the training arrays are sized for;
# shorter observations are zero-padded up to this length in load_data.
MODELS_OBS_INT = 2048  # 2048 for both spring and summer datasets

# Number of IQ samples per observation in each team's data files.
# For spring dataset: T1 - 2048, T2 - 1024, T3 - 1024, T4 - 512
# For summer dataset: T1 - 2048, T2 - 1024, T3 - 512, T4 - 256
TEAM1_DATA_OBS_INT = 2048
TEAM2_DATA_OBS_INT = 1024
TEAM3_DATA_OBS_INT = 1024
TEAM4_DATA_OBS_INT = 512


@dataclass
class TrainingConfig:
    """Settings for one model's training run, consumed by setup_training."""

    # Number of passes over the training data.
    num_epochs: int
    # Loss callable invoked as criterion(outputs, labels).
    criterion: Any
    # Optimizer instance already bound to the model's parameters.
    optimizer: Any
    # Directory and file name the trained state_dict is saved under.
    model_save_dir: str
    model_save_filename: str
    # Directory containing the "iqdata" and "labeldata" subfolders.
    data_dir: str
    # Number of IQ samples per observation in this team's data files.
    data_obs_int: int


# Train on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# device = torch.device("mps")

# One entry per signal class: [name, [modulation type, num symbols (when given)], class label]
sig_types = [
    ['2-ASK', ['ask', 2], 0],
    ['4-ASK', ['ask', 4], 1],
    ['8-ASK', ['ask', 8], 2],
    ['BPSK', ['psk', 2], 3],
    ['QPSK', ['psk', 4], 4],
    ['16-QAM', ['qam', 16], 5],
    ['Tone', ['constant'], 6],
    ['P-FMCW', ['p_fmcw'], 7],
]
# Width of the one-hot label vectors and of every model's output layer.
num_classes = len(sig_types)


def load_data(channel_path, batch_size, num_batches, num_train_examples, data_obs_int):
    """Load IQ observations and their labels into a shuffling DataLoader.

    Reads ``iqdata/example_<k>.dat`` (complex64 samples) and
    ``labeldata/example_<k>.csv`` for ``k = 1..num_batches`` from
    ``channel_path``. Each file is cut into observations of ``data_obs_int``
    samples; every observation is scaled by its peak magnitude, split into a
    real row and an imaginary row, and zero-padded to ``MODELS_OBS_INT``.

    :param channel_path: Directory holding the ``iqdata`` and ``labeldata`` folders
    :param batch_size: Batch size of the returned DataLoader
    :param num_batches: Number of example files to read
    :param num_train_examples: Total number of observations to allocate room for
    :param data_obs_int: Number of IQ samples per observation in the files
    :return: DataLoader yielding ``(data, labels)`` batches; data has shape
        ``(batch, 1, 2, MODELS_OBS_INT)`` and labels are one-hot float32 rows
    :raises ValueError: If ``data_obs_int`` is larger than ``MODELS_OBS_INT``
    """
    if data_obs_int > MODELS_OBS_INT:
        raise ValueError(
            f"data_obs_int ({data_obs_int}) exceeds the model input length ({MODELS_OBS_INT})"
        )

    # Both arrays start zero-filled, so samples past data_obs_int are already
    # the zero padding the fixed-size CNN inputs need.
    training_data = np.zeros((num_train_examples, 1, 2, MODELS_OBS_INT), dtype=np.float32)
    training_labels = np.zeros((num_train_examples, num_classes), dtype=np.float32)

    last_index = 0
    for k in range(num_batches):
        # This is used if we have a labeldata folder that stores class labels
        label_df = pd.read_csv(f"{channel_path}/labeldata/example_{k + 1}.csv")

        iq_data = np.fromfile(f"{channel_path}/iqdata/example_{k + 1}.dat", np.csingle)
        iq_data = np.reshape(iq_data, (-1, data_obs_int))  # One row per observation
        for j in range(iq_data.shape[0]):
            observation = iq_data[j]
            peak = np.max(np.abs(observation))
            # An all-zero observation would otherwise divide 0 by 0 and put
            # NaNs into the training data; leave it as zeros instead.
            if peak > 0:
                observation = observation / peak

            # Row 0 holds the real (in-phase) part, row 1 the imaginary (quadrature) part.
            training_data[last_index, 0, 0, :data_obs_int] = observation.real
            training_data[last_index, 0, 1, :data_obs_int] = observation.imag
            training_labels[last_index, label_df.iloc[j]] = 1.0
            last_index += 1

    return torch.utils.data.DataLoader([[training_data[i], training_labels[i]] for i in range(num_train_examples)], batch_size=batch_size, shuffle=True)


def train(model, num_epochs, criterion, optim, scheduler, dataloader):
    """Train ``model`` for ``num_epochs`` passes over ``dataloader``.

    The model is moved to the module-level ``device`` and put in training
    mode. After each epoch the mean batch loss is printed; previously the
    loss was accumulated but never reported.

    :param model: The PyTorch model to train
    :param num_epochs: Number of passes over the data
    :param criterion: Loss callable invoked as ``criterion(outputs, labels)``
    :param optim: Optimizer already bound to the model's parameters
    :param scheduler: Optional learning-rate scheduler; when given it is
        stepped after every optimizer step (once per batch, not per epoch)
    :param dataloader: Iterable yielding ``(data, labels)`` batches
    :return: The trained model
    """
    # Put the model in training mode
    model.to(device)
    model.train()

    for epoch in tqdm(range(num_epochs)):
        running_loss = 0.0
        num_steps = 0

        # Training step
        for data, labels in dataloader:
            data = data.to(device)
            labels = labels.to(device)

            outputs = model(data)
            loss = criterion(outputs, labels)

            optim.zero_grad()
            loss.backward()
            optim.step()
            if scheduler is not None:
                scheduler.step()

            running_loss += loss.item()
            num_steps += 1

        # Skip the report for an empty dataloader to avoid dividing by zero.
        if num_steps:
            mean_loss = running_loss / num_steps
            # The "'loss': <number>" form lets a log regex pick the value out.
            print(f"epoch {epoch + 1}/{num_epochs} 'loss': {mean_loss:.6f}")

    return model

def setup_training(model, config):
    """Load one team's dataset, train ``model`` on it, and save the result.

    The number of example files is taken from ``iqdata/example_*.dat`` under
    ``config.data_dir``. The chunk size (observations per file) and batch
    size are read from the module-level ``args`` parsed in the ``__main__``
    block, so this function only works after the arguments have been parsed.

    :param model: The PyTorch model to train
    :param config: TrainingConfig describing the run
    :return: None
    :raises FileNotFoundError: If no IQ example files are found
    """
    print(f'Loading dataset at {config.data_dir}')
    train_dir = config.data_dir #os.path.join(config.data_dir, 'train')
    train_iq_files = os.path.join(train_dir, "iqdata", "example_*.dat")
    file_list = glob.glob(train_iq_files)
    num_batches = len(file_list)
    if num_batches == 0:
        # Fail here with a clear message instead of building an empty dataset
        # and carrying on with nothing to train on.
        raise FileNotFoundError(f'No IQ example files found matching {train_iq_files}')
    num_train_examples = num_batches * args.chunk_size
    train_data = load_data(train_dir, args.batch_size, num_batches, num_train_examples, config.data_obs_int)

    print('Training model')
    model = train(model, config.num_epochs, config.criterion, config.optimizer, None, train_data)

    save_model_artifacts(model, config.model_save_dir, config.model_save_filename)


def save_model_artifacts(model, model_dir: str, model_name: str):
    """
    Writes a PyTorch model's state_dict to disk
    :param model: The PyTorch model whose state_dict is saved
    :param model_dir: Destination directory; it and any missing parents
    are created if they do not exist yet
    :param model_name: File name to save under inside model_dir
    :return: None
    """
    target_dir = Path(model_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    filepath = os.path.join(model_dir, model_name)
    print(f'Saving model to {filepath}...')
    # Only the weights are persisted, not the module object itself.
    weights = model.state_dict()
    torch.save(weights, filepath)
    for message in (" ", 'Model has been saved'):
        print(message)


def parse_args():
    """
    Parses and loads the command-line arguments sent to the script. These
    will be sent by SageMaker when it launches the training container.
    Options are accepted in both dashed and underscored spellings
    (e.g. ``--chunk-size`` and ``--chunk_size``).
    :return: Tuple ``(args, unknown)``: the parsed namespace and the list of
    arguments that were not recognised
    """
    print('Parsing command-line arguments...')
    parser = argparse.ArgumentParser()

    # Observation length of the spectrum for each example.
    parser.add_argument('--obs-int', '--obs_int', type=int, default=2048)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch_size', '--batch-size', type=int, default=16)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--s3_checkpoint_path', '--s3-checkpoint-path', type=str, default='')
    # Number of observations stored in each example file.
    parser.add_argument('--chunk-size', '--chunk_size', type=int, default=50)

    # Data directories
    # A single channel is expected; per-team data lives in numbered subfolders of it.
    parser.add_argument('--team_data_dir', '--team-data-dir', type=str,
                        default=os.environ.get('SM_CHANNEL_TEAM_DATA_DIR'))

    parser.add_argument('--model_dir', '--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))

    # Checkpoint info
    parser.add_argument('--checkpoint_enabled', '--checkpoint-enabled', type=str, default='False')
    parser.add_argument('--checkpoint_path', '--checkpoint-path', type=str, default='/opt/ml/checkpoints')

    # Unknown arguments are tolerated so extra arguments do not abort the script.
    parsed = parser.parse_known_args()

    # Report completion only after parsing has actually happened.
    print('Completed parsing command-line arguments.')

    return parsed


if __name__ == '__main__':
    print('Executing the main() function...')
    # Parse command-line arguments; setup_training reads the module-level `args`.
    args, _ = parse_args()

    if args.team_data_dir is None:
        raise ValueError("A data directory argument wasn't passed in correctly")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Every model is trained with the same hyperparameters. Model performance
    # still needs to be compared against the settings the individual teams
    # chose, which earlier revisions of this script recorded as:
    #   team 1: lr 0.001, 10 epochs, CrossEntropyLoss, SGD
    #   team 2: lr 0.01, 200 epochs, CrossEntropyLoss, SGD
    #   team 3: lr 0.001, 10 epochs, CrossEntropyLoss, SGD
    #   team 4: lr 0.001, 10 epochs, CrossEntropyLoss, Adam
    learning_rate = args.lr
    num_epochs = args.epochs
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam

    # (team number, model class, samples per observation in that team's data)
    team_specs = (
        (1, Team1Model, TEAM1_DATA_OBS_INT),
        (2, Team2Model, TEAM2_DATA_OBS_INT),
        (3, Team3Model, TEAM3_DATA_OBS_INT),
        (4, Team4Model, TEAM4_DATA_OBS_INT),
    )

    # Teams are trained one after another; each gets a fresh model and optimizer.
    for team_id, model_cls, team_obs_int in team_specs:
        print(f'Starting team {team_id} model training')
        team_model = model_cls(num_classes)
        team_config = TrainingConfig(
            num_epochs=num_epochs,
            criterion=criterion,
            optimizer=optimizer(team_model.parameters(), lr=learning_rate),
            model_save_dir=args.model_dir,
            model_save_filename=f'team{team_id}_model.pt',
            data_dir=f'{args.team_data_dir}/{team_id}',
            data_obs_int=team_obs_int,
        )
        setup_training(team_model, team_config)


Overwriting ../code/train.py


## Configure Training Estimator

Note: Use a trailing forward slash when an S3 URI names a folder that serves as a channel, for example s3://my-bucket/train-01/ for the train-01 folder. Without the trailing slash the prefix is ambiguous: s3://my-bucket/train-01 would also match another folder such as s3://my-bucket/train-011/ or a file such as s3://my-bucket/train-01.txt.

In [None]:
base_job_prefix = 'cloudd-rf'
output_prefix = f'{base_job_prefix}/training'

# Whole-second Unix timestamp, used to give each run its own code location.
timestamp = str(int(time.time()))
code_location = f"s3://{output_bucket}/{output_prefix}/{timestamp}/code"

instance_count = 1

# Be sure to update the chunk-size and obs-int hyperparameters so the IQ data gets parsed correctly
hyperparameters = {
                    "lr": 0.001,
                    "batch_size": 16,
                    "epochs": 10,
                    "obs-int": 2048,
                    "chunk-size": 5000
                }

# Regexes for pulling metrics out of the training logs. They only take effect
# when passed to the Estimator as metric_definitions.
metric_definitions = [{'Name': 'loss',      'Regex': "'loss': ([0-9\\.]+)"},
                      {'Name': 'recall',       'Regex': "'recall': ([0-9\\.]+)"},
                      {'Name': 'map50',  'Regex': "'map50': ([0-9\\.]+)"},
                      {'Name': 'map',   'Regex': "'map': ([0-9\\.]+)"}]


distributions = {'parameter_server': {'enabled': False}}
DISTRIBUTION_MODE = 'FullyReplicated'
train_script = 'train.py'
instance_type  = 'ml.g5.xlarge'

# Set the training script related parameters
train_script_dir = '../code'
container_log_level = logging.INFO

# Location where the trained model will be stored locally in the container before being uploaded to S3
model_local_dir = '/opt/ml/model'
model_path = f"s3://{output_bucket}/{output_prefix}"

# Root of the preprocessed training data. It holds one numbered subfolder per
# team (1/ .. 4/).
train_data_root = f"s3://{default_bucket}/cloudd-rf/preprocess/outputs/1728916913/train/"

# train.py reads a single channel (SM_CHANNEL_TEAM_DATA_DIR) and appends
# "/1" .. "/4" itself, so the whole training root is exposed as one channel
# named "team_data_dir".
team_data_in = TrainingInput(s3_data=train_data_root, distribution=DISTRIBUTION_MODE, s3_data_type='S3Prefix', input_mode='FastFile')

# Per-team inputs, kept for cells that want a single team's data. They are not
# passed to the training job because train.py does not read per-team channels.
team1_data_in = TrainingInput(s3_data=f"{train_data_root}1/", distribution=DISTRIBUTION_MODE, s3_data_type='S3Prefix', input_mode='FastFile')
team2_data_in = TrainingInput(s3_data=f"{train_data_root}2/", distribution=DISTRIBUTION_MODE, s3_data_type='S3Prefix', input_mode='FastFile')
team3_data_in = TrainingInput(s3_data=f"{train_data_root}3/", distribution=DISTRIBUTION_MODE, s3_data_type='S3Prefix', input_mode='FastFile')
team4_data_in = TrainingInput(s3_data=f"{train_data_root}4/", distribution=DISTRIBUTION_MODE, s3_data_type='S3Prefix', input_mode='FastFile')

inputs = {'team_data_dir': team_data_in}

In [None]:
# Look up the PyTorch training image in the same region as the SageMaker
# session rather than a hard-coded one, so the notebook also works outside us-east-1.
image_uri = image_uris.retrieve(framework='pytorch', region=region_name, version='1.13.1', py_version='py39', image_scope='training', instance_type=instance_type)

In [None]:
# Training job definition: which image and script to run, on what hardware,
# and where code and outputs are stored.
estimator = Estimator(
    image_uri=image_uri,
    role=role,
    # Script to run and the local folder that is uploaded with it
    entry_point=train_script,
    source_dir=train_script_dir,
    code_location=code_location,
    script_mode=True,
    hyperparameters=hyperparameters,
    # Compute
    instance_type=instance_type,
    instance_count=instance_count,
    distribution=distributions,
    # Data in / model out
    input_mode="FastFile",
    output_path=model_path,
    disable_output_compression=True,
    # Naming, logging and metrics
    base_job_name=f'{base_job_prefix}-training',
    container_log_level=container_log_level,
    enable_sagemaker_metrics=False,
    # metric_definitions=metric_definitions,
)

In [None]:
# Start the training job with the configured input channels. wait=False returns
# as soon as the job is submitted instead of blocking until it finishes.
estimator.fit(inputs=inputs, wait=False)

In [None]:
# Show the logs of the training job started above.
estimator.logs()

## Get Model Artifacts

In [None]:
# Rebinds model_path (previously the S3 output prefix) to the location of the
# artifacts produced by the training job.
model_path = estimator.model_data

print(f"Model artifact files are uploaded here: {model_path} ========")

In [None]:
# Copy the model artifact from S3 into the working directory as models.tar.gz.
# NOTE(review): this assumes model_path is the S3 URI of a single .tar.gz object.
# The estimator in this notebook is created with disable_output_compression=True,
# in which case the artifacts are presumably uploaded uncompressed under a
# prefix - confirm, and switch to a recursive copy if so.
!aws s3 cp {model_path} models.tar.gz

### Untar the models

In [None]:
!tar -xzvf models.tar.gz ../data/summer_models/