In [2]:
# This is the template for the submission. You can develop your algorithm in a regular Python script and copy the code here for submission.

# TEAM NAME ON KAGGLE
# "EXAMPLE_GROUP"

# GROUP NUMBER
# "group_XX"

# TEAM MEMBERS (E-MAIL, LEGI, KAGGLE USERNAME):
# "examplestudent1@ethz.ch", "12-345-678", "eXampl3stdNtone" 
# "examplestudent2@ethz.ch", "12-345-679", "xXexamplestudent2Xx"
# "examplestudent3@ethz.ch", "12-345-670", "mhealth_student_98"

In [26]:
from os import listdir
from os.path import isfile, join
import re

import pandas as pd
import numpy as np

# You may change the mhealth_activity module but your algorithm must support the original version
from mhealth_activity import Recording
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

# For interactive graphs
# %matplotlib widget

In [4]:

# Get the path for all test traces
# NOTE: the loader below is wrapped in a bare triple-quoted string, i.e. it is disabled.
# The string is evaluated and discarded, so this cell does NOT define
# `filenames_test` or `recordings_test`.
'''
dir_traces_test = 'data/test'
filenames_test = [join(dir_traces_test, f) for f in listdir(dir_traces_test) if isfile(join(dir_traces_test, f))]
filenames_test.sort()
recordings_test = []
for fn in filenames_test:
    rec = Recording(fn)
    match = re.search(r'(\d{3})\.pkl$', fn)
    if match:
        id = int(match.group(1))
        rec.id = id
    else:
        raise ValueError(f'Filename {fn} does not match expected format')
    recordings_test.append(rec)
'''
    
    
# Every regular file inside the training directory, in sorted (deterministic) order.
dir_traces_train = 'data/train'
filenames_train = sorted(
    join(dir_traces_train, f)
    for f in listdir(dir_traces_train)
    if isfile(join(dir_traces_train, f))
)

# Parallel per-recording lists: entry i of each list belongs to filenames_train[i].
alts, activities = [], []
axs, ays, azs = [], [], []
for fn in filenames_train:
    rec = Recording(fn)
    signals = rec.data
    alts.append(signals['altitude'].values)
    activities.append(rec.labels['activities'])
    axs.append(signals['ax'].values)
    ays.append(signals['ay'].values)
    azs.append(signals['az'].values)

In [24]:
from scipy.signal import argrelextrema, find_peaks
def centered_moving_average(data, window_size):
    """Return the means of all full-length sliding windows over `data`.

    Element k of the result is the mean of data[k : k + window_size], so the
    output has len(data) - window_size + 1 entries (empty when the window is
    longer than the data). The means are computed from a float cumulative sum.
    """
    totals = np.cumsum(data, dtype=float)
    # The first `window_size` prefix sums stay as they are; every later one has
    # the prefix sum from `window_size` positions earlier subtracted, which
    # leaves the sum over exactly one window.
    leading = totals[:window_size]
    windowed = totals[window_size:] - totals[:-window_size]
    sums = np.concatenate((leading, windowed))
    return sums[window_size - 1:] / window_size
def plot_array(arr):
    """Show `arr` as a line plot on a wide (25 x 3) figure."""
    _, axis = plt.subplots(figsize=(25, 3))
    axis.plot(arr)
    plt.show()

def windowed_peak_detection(data, window_size):
    """Find strict local maxima of `data`, searching one window at a time.

    `data` is cut into consecutive, non-overlapping chunks of `window_size`
    samples (the last chunk may be shorter) and `argrelextrema(..., np.greater)`
    is run on each chunk separately.

    Args:
        data: 1-D sequence of samples; lists are accepted and converted to an
            ndarray.
        window_size: Number of samples per chunk (step of the scan).

    Returns:
        Integer ndarray with the indices (into `data`) of all detected peaks,
        in increasing order. Empty input yields an empty integer array.

    Note:
        Because each chunk is searched in isolation, a maximum that sits on
        the first or last sample of a chunk is not reported (argrelextrema's
        default edge mode compares an edge sample against itself).
    """
    # argrelextrema needs an ndarray; converting here lets callers pass lists.
    data = np.asarray(data)

    peaks = []
    for start in range(0, len(data), window_size):
        window = data[start:start + window_size]

        # Indices are relative to the chunk, so shift them back to `data`.
        local_peaks = argrelextrema(window, np.greater)[0]
        peaks.extend(local_peaks + start)

    # Force an index dtype: np.array([]) would otherwise be float64, which
    # cannot be used to index `data`.
    return np.array(peaks, dtype=np.intp)

def get_steps_from_peaks(data, peaks, threshold=1.25):
    """Count the entries of `peaks` whose sample in `data` is strictly above `threshold`.

    `peaks` holds indices into `data`; the return value is a plain int.
    """
    return sum(1 for idx in peaks if data[idx] > threshold)

def get_steps(data, window_size=80, threshold=1.25):
    """Estimate the number of steps in a 1-D signal by counting its peaks.

    Args:
        data: 1-D signal to search for peaks.
        window_size: Minimum number of samples between two counted peaks
            (passed to `find_peaks` as `distance`).
        threshold: Height a peak must exceed to count as a step.

    Returns:
        Number of peaks found, as an int.
    """
    # Previously height/distance were hard-coded to 1.25/80 here, so the
    # `window_size` and `threshold` arguments had no effect on peak finding.
    peaks, _ = find_peaks(data, height=threshold, distance=window_size)
    # find_peaks treats `height` as an inclusive minimum; the helper keeps the
    # comparison strict (> threshold).
    return get_steps_from_peaks(data, peaks, threshold)

# Read indices.txt into `indices`: one string per line, trailing whitespace removed.
with open('indices.txt', 'r') as f:
    indices = list(map(str.rstrip, f))
    

def pad_arrays(arr_list):
    """Zero-pad every array at its end so all share the length of the longest one.

    Args:
        arr_list: Iterable of 1-D arrays (may be empty).

    Returns:
        A new list of arrays, in input order, each as long as the longest
        input. An empty input yields an empty list.
    """
    # Materialise once: the input is traversed twice, which would silently
    # drop everything if a generator were passed.
    arr_list = list(arr_list)
    # default=0 keeps max() from raising on an empty list.
    max_len = max((len(arr) for arr in arr_list), default=0)
    return [np.pad(arr, (0, max_len - len(arr)), mode='constant') for arr in arr_list]

def pad_arrays_len(arr_list, pad_length):
    """Zero-pad (at the end) every array shorter than `pad_length` up to that length.

    Arrays that already have `pad_length` or more elements are returned
    unchanged (same object, not truncated).
    """
    result = []
    for arr in arr_list:
        missing = pad_length - len(arr)
        if missing > 0:
            arr = np.pad(arr, (0, missing), 'constant')
        result.append(arr)
    return result

def create_vector(lst):
    """Return a 4-element 0/1 list; position i is 1 exactly when i is contained in `lst`."""
    return [int(position in lst) for position in range(4)]

import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

def create_dataloaders(X, y, batch_size=32, test_size=0.2, random_state=None):
    """Split (X, y) into train/test parts and wrap each part in a DataLoader.

    Args:
        X: Array-like of samples; converted to a float32 tensor.
        y: Array-like of targets aligned with X; converted to a float32 tensor.
        batch_size: Batch size used by both loaders.
        test_size: Passed to `train_test_split` as the held-out share.
        random_state: Optional seed passed to `train_test_split`. The default
            (None) keeps the split unseeded, as before.

    Returns:
        (train_loader, test_loader) tuple of DataLoaders over TensorDatasets.
    """
    # Convert X and y into PyTorch tensors
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_tensor, y_tensor, test_size=test_size, random_state=random_state
    )

    # Create TensorDatasets for the training and testing sets
    train_data = TensorDataset(X_train, y_train)
    test_data = TensorDataset(X_test, y_test)

    # Only the training batches are reshuffled each epoch; the held-out set is
    # iterated in a fixed order so evaluation runs are comparable.
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader

In [25]:
# Length (in samples) every signal is zero-padded to.
# NOTE(review): presumably the length of the longest training trace — confirm.
# pad_arrays_len leaves longer traces untouched, so a longer trace would make
# the stacked array below ragged.
PAD_LENGTH = 162096

padded_alts = pad_arrays_len(alts, PAD_LENGTH)
padded_axs = pad_arrays_len(axs, PAD_LENGTH)
padded_ays = pad_arrays_len(ays, PAD_LENGTH)
padded_azs = pad_arrays_len(azs, PAD_LENGTH)

# One entry per recording, channels ordered (altitude, ax, ay, az).
X = np.array([
    [alt, acc_x, acc_y, acc_z]
    for alt, acc_x, acc_y, acc_z in zip(padded_alts, padded_axs, padded_ays, padded_azs)
])

# One 4-element 0/1 vector per recording, built from its activity labels.
y = np.array([create_vector(labels) for labels in activities])

train_Dataloader, test_Dataloader = create_dataloaders(X, y, batch_size=32, test_size=0.2)


In [None]:
class ConvNet(nn.Module):
    """Two-stage 1-D CNN followed by two fully connected layers.

    Expects input of shape (batch, 4, input_length) and returns a
    (batch, 4) tensor of raw (un-activated) output scores.

    Args:
        input_length: Number of samples per channel in the input signal.
        hidden_size: Width of the hidden fully connected layer.

    Raises:
        ValueError: If `input_length` is too short to survive both
            convolution + pooling stages.
    """

    def __init__(self, input_length=162096, hidden_size=128):
        super().__init__()
        self.conv1 = nn.Conv1d(4, 16, 200)
        self.pool = nn.MaxPool1d(50)
        self.conv2 = nn.Conv1d(16, 32, 50)

        # Derive the flattened feature count from the layers themselves rather
        # than hard-coding it: an un-padded, stride-1 convolution shortens the
        # signal by (kernel - 1) samples, and the pooling (stride == kernel,
        # floor mode) then divides the length by its kernel size.
        length = input_length
        for conv in (self.conv1, self.conv2):
            length = (length - conv.kernel_size[0] + 1) // self.pool.kernel_size
        if length < 1:
            raise ValueError(
                f'input_length={input_length} is too short for the conv/pool stack'
            )
        self.flat_features = self.conv2.out_channels * length

        # fc1's output width must equal fc2's input width (they previously
        # disagreed: 1024 vs 128).
        self.fc1 = nn.Linear(self.flat_features, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 4)

    def forward(self, x):
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        # Keep the batch dimension and flatten (channels, length) into features.
        x = x.flatten(start_dim=1)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [7]:
# Loop through all filenames to process recordings
# NOTE(review): `filenames_test` is reset to an empty list right here, so the loop
# body below never runs and `submission` stays empty until real test file paths
# are assigned.
filenames_test = []
submission = []
for filename in filenames_test:
    recording = Recording(filename)

    # Assumes filename format ends with a three-digit ID before ".pkl"
    match = re.search(r'(\d{3})\.pkl$', filename)
    if match is None:
        raise ValueError(f'Filename {filename} does not match expected format')
    # Named `recording_id` (not `id`) so the builtin id() is not shadowed.
    recording_id = int(match.group(1))
    recording.id = recording_id

    # Placeholder for the algorithm to process the recording
    # Implement the logic to infer watch location, path index, step count,
    # and activities (standing, walking, running, cycling) here.
    # Ensure your algorithm is tolerant to missing data and does not crash
    # when optional smartphone data traces are missing.

    path_idx = 0  # Integer, path in {0, 1, 2, 3, 4}
    watch_loc = 0  # Integer, 0: left wrist, 1: belt, 2: right ankle
    standing = False  # Boolean, True if participant was standing still throughout the recording
    walking = False  # Boolean, True if participant was walking throughout the recording
    running = False  # Boolean, True if participant was running throughout the recording
    cycling = False  # Boolean, True if participant was cycling throughout the recording
    step_count = 0  # Integer, number of steps, must be provided for each recording

    # One row of the submission table; keys are the CSV column names.
    predictions = {
        'Id': recording_id,
        'watch_loc': watch_loc,
        'path_idx': path_idx,
        'standing': standing,
        'walking': walking,
        'running': running,
        'cycling': cycling,
        'step_count': step_count
        }

    submission.append(predictions)

In [8]:
# Save the collected predictions as submission.csv, the file that is uploaded to Kaggle.
# To cross-check the file locally, open it in a text editor and NOT in Excel so the values are displayed correctly.
# IMPORTANT: Do NOT change the name of the columns of the .csv file ("Id", "watch_loc", "path_idx", "standing", "walking", "running", "cycling", "step_count")
submission_columns = ['Id', 'watch_loc', 'path_idx', 'standing', 'walking', 'running', 'cycling', 'step_count']
submission_df = pd.DataFrame(submission, columns=submission_columns)
submission_df.to_csv('submission.csv', index=False)

In [9]:
# Feature matrix: the altitude traces, zero-padded to the length of the longest one.
# NOTE(review): this rebinds `X` and `y`, which an earlier cell already used for
# the multi-channel arrays passed to create_dataloaders.
X = np.array(pad_arrays(alts))
# NOTE(review): `path_idxs` is not assigned anywhere in the visible notebook and the
# recorded output of this cell is a NameError — it must be collected (one label per
# training recording, aligned with `alts`) before this cell can run on a fresh kernel.
y = np.array(path_idxs)

NameError: name 'path_idxs' is not defined

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Hold out 20% of the samples for evaluation; the fixed seed (42) makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Multi-class XGBoost classifier configured for 5 classes, predicting class labels directly
model = XGBClassifier(objective='multi:softmax', num_class=5)

# Fit the model with the training data
model.fit(X_train, y_train)

# Predict a class label for every held-out sample
y_pred = model.predict(X_test)

# Score with balanced accuracy (not plain accuracy) and report it as a percentage
accuracy = balanced_accuracy_score(y_test, y_pred)
print(y_pred)
print("B_Accuracy: %.2f%%" % (accuracy * 100.0))

[3 0 1 4 4 1 2 3 4 4 3 2 3 2 0 1 4 3 1 3 2 4 0 2 3 4 0 2 3 2 3 1 4 2 3 0 2
 2 2 4 3 4 3 2 1 3 2 4 3 1 1 4 1 0 1 0 4 3 1 2 0 4 4 3 1 4 0 4 4 0 0 2 2 1
 1 4 4 0 4 2]
B_Accuracy: 66.13%
