#### Learning of Structured Data

In [None]:
# @title Imported and extracted the Data

# Install Kaggle library
!pip install kaggle

# Install colab and mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Kaggle API key to the appropriate directory
!mkdir -p ~/.kaggle
!cp /content/kaggle.json  ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle competitions download -c learning-of-structured-data-fhws-ws2324

# Unzip the dataset
!unzip -q learning-of-structured-data-fhws-ws2324.zip

Mounted at /content/drive
Downloading learning-of-structured-data-fhws-ws2324.zip to /content
 91% 89.0M/98.3M [00:00<00:00, 161MB/s]
100% 98.3M/98.3M [00:00<00:00, 140MB/s]


In [None]:
# @title Added  Necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# @title Loading the dataset
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# @title Data preprocessing

to_use = [] # columns to be used. skip the confidence score. and angles
for i in range(0,75,3):
    to_use.append(i)
    to_use.append(i+1)

data_list = []

mapping = {"boxing" :0,
"drums" : 1,
"guitar" : 2,
"rowing" : 3,
"violin" :4}

training_files =  os.listdir('/content/train/train/')

for file in training_files:
    d = pd.read_csv(f'/content/train/train/{file}',usecols=to_use)
    d = d.fillna(0)
    data_list.append(d.to_numpy()[:int(len(d)/2)]) # remove half of datasets, 'cause they are repeating, but additionally they have angles and confidience score

# labels
labels = []
for name in training_files:
    lab = mapping[name[9:-4]]
    labels.append(lab)

len(data_list),len(labels)

(1167, 1167)

In [None]:
# @title Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_list, labels, test_size=0.33, random_state=42)

In [None]:
# @title Creating Windows
window_size = 50
stride = 1

#training
tr_windows = []
tr_window_labels = []


for f,l in zip(X_train,y_train):
    #print(f.shape)
    for j in range(f.shape[0]-window_size):
        tr_windows.append(f[j:j+50,:])
        tr_window_labels.append(l)


#validation
te_windows = []
te_window_labels = []


for f,l in zip(X_test,y_test):
    #print(f.shape)
    for j in range(f.shape[0]-window_size):
        te_windows.append(f[j:j+50,:])
        te_window_labels.append(l)


len(tr_windows),len(tr_window_labels),len(te_windows),len(te_window_labels)

(179037, 179037, 88157, 88157)

In [None]:
# @title Converted windows  into the  Tensors
tr_windows = torch.tensor(np.array(tr_windows)).permute(0,2,1)
tr_window_labels = torch.tensor(np.array(tr_window_labels))

te_windows = torch.tensor(np.array(te_windows)).permute(0,2,1)
te_window_labels = torch.tensor(np.array(te_window_labels))

tr_windows.shape,tr_window_labels.shape,te_windows.shape,te_window_labels.shape

(torch.Size([179037, 50, 50]),
 torch.Size([179037]),
 torch.Size([88157, 50, 50]),
 torch.Size([88157]))

In [None]:
# @title Selected avalible device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# @title CNN One  Dimensional Architecture
import torch
import torch.nn as nn

class ImprovedCNN1DModel(nn.Module):
    def __init__(self, num_channels, window_size):
        super(ImprovedCNN1DModel, self).__init__()

        self.cnn_layers = nn.Sequential(
            nn.Conv1d(num_channels, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )

        # Calculate the final size after three CNN layers and pooling
        final_size = window_size // 2 // 2 // 2

        self.fc_layers = nn.Sequential(
            nn.Linear(64 * final_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 5)  # Assuming a regression task, change as needed
        )

    def forward(self, x):
        x = self.cnn_layers(x)

        # Reshape for fully connected layer
        x = x.view(x.size(0), -1)

        x = self.fc_layers(x)
        return x

# Example instantiation with 50 channels and window size of 50
num_channels = 50
window_size = 50
model = ImprovedCNN1DModel(num_channels=num_channels, window_size=window_size).to(device)

# Print the improved model architecture
print(model)

ImprovedCNN1DModel(
  (cnn_layers): Sequential(
    (0): Conv1d(50, 16, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
    (4): ReLU()
    (5): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (7): ReLU()
    (8): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=5, bias=True)
  )
)


In [None]:
# @title Training the  Model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

l = []
num_epochs = 2
batch_size = 32
for epoch in range(num_epochs):
    model.train()
    indexes = np.random.permutation(tr_windows.shape[0])
    for i in range(0,indexes.shape[0],batch_size):
        #print(i,indexes[i:i+batch_size])
        inputs, lab = tr_windows[[indexes[i:i+batch_size]]].to(device), tr_window_labels[[indexes[i:i+batch_size]]].to(device)
        #print(inputs.shape,labels)
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, lab.long())
        loss.backward()
        optimizer.step()
        l.append(loss.item())
    print(sum(l)/len(l))

0.2047954325811393
0.1369316673727676


In [None]:
# @title Validation of a Model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, lab in zip(te_windows,te_window_labels):
        inputs, lab = inputs.to(device), lab.to(device)
        #print(lab)
        outputs = model(inputs.reshape((1,50,50)).float())
        _, predicted = torch.max(outputs.data, 1)
        total +=   1
        correct += (predicted == lab).sum().item()

    accuracy = correct / total
    print(f" Validation Accuracy: {accuracy:.4f}")


 Validation Accuracy: 0.9537


In [None]:
# @title Kaggle predictions
n = 305
stride=window_size
predictions = []
for i in range(n):
    clab = []
    file = pd.read_csv(f'/content/test/test/{i}.csv',usecols=to_use).fillna(0).to_numpy()
    for j in range(0,file.shape[0]-window_size,stride):
        inp = torch.tensor(file[j:j+50]).unsqueeze(0).permute(0,2,1).to(device)
        #print(inp.shape)
        out = torch.argmax(model(inp.float()))
        clab.append(out.item())
    unique, counts = np.unique(clab, return_counts=True)
    fpred = unique[counts.argmax()]
    predictions.append(fpred)

In [None]:
# @title Writing the Solution in **Sumbission file**.
sheet = {'id':range(305),'action':predictions}
df = pd.DataFrame(sheet)
df.to_csv('Submission_file.csv',index=False)