In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split


## Downloading the dataset
- This dataset is hosted on [GDrive](http://bit.ly/2VGEeWN)
- Open the link, navigate to the data folder, and download it as a zip file
- Extract the downloaded zip file and move the 40 folders it contains into a `Swim` folder in the notebook's working directory
- Run the notebook; the processed splits (`train.csv`, `validation.csv`, `test.csv`) are written to the `data` folder

In [2]:


# Function to process each file
def process_file(file_path, n):
    """Turn one recording CSV into labelled accelerometer windows.

    The file is read with ``pd.read_csv``, its numeric ``label`` column is
    mapped to stroke names, the rows are split into runs of consecutive
    identical labels, and each run is windowed by ``process_group``.

    Args:
        file_path: Path to a CSV with ``ACC_0``, ``ACC_1``, ``ACC_2`` and
            ``label`` columns.
        n: Number of readings per window, forwarded to ``process_group``.

    Returns:
        A list of ``[input_str, label]`` pairs in file order. Label codes
        missing from the map become NaN; each such row forms its own
        one-row run whose label is NaN.
    """
    data = pd.read_csv(file_path)

    # Extract relevant columns
    acc_columns = ['ACC_0', 'ACC_1', 'ACC_2']

    # Map label numbers to actual style names. Codes 0 and 5 both map to
    # 'transition', so adjacent 0/5 rows end up in the same run.
    label_map = {0: 'transition', 1: 'freestyle', 2: 'breaststroke', 3: 'backstroke', 4: 'butterfly', 5: 'transition'}
    data['label'] = data['label'].map(label_map)
    labels = data['label']

    # A new run starts wherever the label differs from the previous row's.
    # NaN never compares equal to NaN, so every unmapped row starts a run of
    # its own, exactly as a row-by-row `!=` comparison would.
    run_ids = labels.ne(labels.shift()).cumsum().to_numpy()

    # Window each run; run ids are increasing, so groups come out in file order.
    rows = []
    for _, run in data.groupby(run_ids, sort=False):
        rows.extend(process_group(run, n, run['label'].iloc[0], acc_columns))

    return rows

# Function to process a group of continuous label data
def process_group(group, n, label, acc_columns):
    """Cut one run of same-label readings into windows of ``n`` readings.

    Args:
        group: The readings of one run, as a DataFrame or a list of row
            Series (anything ``pd.DataFrame`` accepts).
        n: Window length in readings; must be at least 1.
        label: Label attached to every window produced from this run.
        acc_columns: Names of the three accelerometer columns, rendered in
            order as the ``X``, ``Y`` and ``Z`` lines of the window text.

    Returns:
        A list of ``[input_str, label]`` pairs. ``input_str`` has the form
        ``"X: [...]\\nY: [...]\\nZ: [...]"`` with readings rounded to
        integers. If the final window would be short, it is replaced by the
        last ``n`` readings of the run (overlapping the previous window); a
        run shorter than ``n`` yields a single window holding all of it.

    Raises:
        ValueError: If ``n`` is smaller than 1 or ``acc_columns`` does not
            name exactly three columns.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if len(acc_columns) != 3:
        raise ValueError(
            f"acc_columns must name exactly three columns, got {len(acc_columns)}"
        )

    axis_names = ("X", "Y", "Z")

    # Convert the group (possibly a list of rows) into a DataFrame
    group_df = pd.DataFrame(group)
    num_entries = len(group_df)

    rows = []
    for start in range(0, num_entries, n):
        chunk = group_df.iloc[start:start + n]

        if len(chunk) < n:
            # Short tail: take the last n readings of the run instead
            # (or the whole run when it has fewer than n readings).
            chunk = group_df.iloc[-n:]

        acc_values = chunk[acc_columns].round(0).astype(int)
        # Use the caller-supplied column names rather than hard-coded ones.
        input_str = "\n".join(
            f"{axis}: {acc_values[column].tolist()}"
            for axis, column in zip(axis_names, acc_columns)
        )
        rows.append([input_str, label])

    return rows

# Directory paths and parameters
data_dir = "Swim"  # root folder that is scanned for per-swimmer sub-folders
n = 100  # number of readings per window passed to process_file

# List to hold all data
all_data = []

# Process each swimmer folder. os.listdir() returns entries in arbitrary
# order, so both levels are sorted to keep the row order of all_data
# reproducible across machines and runs.
for swimmer in sorted(os.listdir(data_dir)):
    swimmer_path = os.path.join(data_dir, swimmer)
    if os.path.isdir(swimmer_path):
        for file in sorted(os.listdir(swimmer_path)):
            file_path = os.path.join(swimmer_path, file)
            # Only CSV files are processed; anything else is skipped.
            if file.endswith(".csv"):
                all_data.extend(process_file(file_path, n))

# Convert to DataFrame
df = pd.DataFrame(all_data, columns=['Input', 'Label'])

In [None]:

# Rebuild the frame with the column names written to the CSV files:
# 'input' holds the window text, 'output' holds the stroke label.
df = pd.DataFrame(all_data, columns=['input', 'output'])
df = df.dropna()  # Drop rows with NaN values

# Fixed seed so the same rows land in the same split on every run.
RANDOM_SEED = 42

# Split into Train, Test, Val: 70% train, then the remaining 30% is divided
# 1/3 validation (10% overall) and 2/3 test (20% overall). Stratify on the
# label column, which is named 'output' in this frame.
train, test = train_test_split(
    df, test_size=0.3, stratify=df['output'], random_state=RANDOM_SEED
)
val, test = train_test_split(
    test, test_size=2/3, stratify=test['output'], random_state=RANDOM_SEED
)

# Save to CSV files (create the output folder first; to_csv does not)
os.makedirs('data', exist_ok=True)
train.to_csv('data/train.csv', index=False)
val.to_csv('data/validation.csv', index=False)
test.to_csv('data/test.csv', index=False)

print("Data preprocessing completed!")


Data preprocessing completed!
