In [1]:
import os 
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import re
warnings.filterwarnings('ignore')

In [2]:

# Directory holding the .dat recordings; column_names.txt is also read from here.
DATA_DIR = Path(r"./OpportunityUCIDataset/dataset")  # change this to your .dat directory
# Directory the per-subject CSV files are written to.
OUT_DIR  = Path(r".")
def get_index(string):
    """
    Locate the first alphabetic character at or after position 9.

    The first nine characters (positions 0-8) are never inspected.

    Args:
    string (str): Text to scan.

    Returns:
    int: Position of the first alphabetic character found from position 9
         onwards, or -1 when there is none (including strings shorter
         than ten characters).
    """
    candidates = (pos for pos in range(9, len(string)) if string[pos].isalpha())
    return next(candidates, -1)
# The column-name listing is read from the same directory as the recordings.
# (Previously `names_file` pointed at DATA_DIR.parent and was never used, while
# the file was actually opened from DATA_DIR; the variable now matches that path.)
names_file = DATA_DIR / "column_names.txt"   # adjust if different name
with open(names_file, 'r') as f:
    lines = f.read().splitlines()

# Only lines containing 'Column' describe a column. For those, the name starts
# at the first letter found from position 9 onwards (see get_index) and stops
# before the first ';', if the line has one.
columns = [
    line[get_index(line):].split(";")[0]
    for line in lines
    if 'Column' in line
]

In [3]:
# Keyword arguments shared by every pd.read_csv call in this notebook.
READ_KW = {
    "sep": r"\s+",                 # fields are split on runs of whitespace
    "header": None,                # do not treat the first row as a header
    "engine": "python",
    "na_values": ["NaN", "nan"],   # spellings parsed as missing values
}

# Every file in DATA_DIR matching "S*-*.dat", in sorted order.
all_files = sorted(DATA_DIR.glob("S*-*.dat"))
if not all_files:
    raise FileNotFoundError(f"No files matching 'S*-*.dat' found in {DATA_DIR}")

# Group the files by the subject prefix at the start of the file name.
test_files1  = [f for f in all_files if f.name.startswith("S1")]
test_files2  = [f for f in all_files if f.name.startswith("S2")]
test_files3  = [f for f in all_files if f.name.startswith("S3")]
test_files4  = [f for f in all_files if f.name.startswith("S4")]


def _load_subject(files, subject):
    """Read every file of one subject and stack the rows into a single frame.

    Args:
        files: Paths of the subject's .dat files, in the order to concatenate.
        subject: Subject prefix (e.g. "S1"); used only in the error message.

    Returns:
        pandas.DataFrame: Rows of all files with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``files`` is empty, instead of the opaque
            "No objects to concatenate" error from ``pd.concat``.
    """
    if not files:
        raise FileNotFoundError(f"No .dat files found for subject {subject} in {DATA_DIR}")
    return pd.concat([pd.read_csv(f, **READ_KW) for f in files], ignore_index=True)


test_s1  = _load_subject(test_files1, "S1")
test_s2  = _load_subject(test_files2, "S2")
test_s3  = _load_subject(test_files3, "S3")
test_s4  = _load_subject(test_files4, "S4")

# Determine number of columns from the first row of the first S1 file
# (test_files1 is known to be non-empty here because the S1 load succeeded).
n_cols = pd.read_csv(test_files1[0], nrows=1, **READ_KW).shape[1]

# Sanity check: every subject frame must have the column count measured in n_cols.
for subject_name, subject_frame in (
    ("S1", test_s1),
    ("S2", test_s2),
    ("S3", test_s3),
    ("S4", test_s4),
):
    assert subject_frame.shape[1] == n_cols, (
        f"{subject_name} has {subject_frame.shape[1]} columns, expected {n_cols}"
    )

# The parsed names must line up one-to-one with the data columns before labelling.
assert len(columns) == n_cols, (
    f"column_names.txt yielded {len(columns)} names for {n_cols} data columns"
)

# Output locations kept from the original cell; nothing is written to them here.
train_path = OUT_DIR / "train.csv"
test_path  = OUT_DIR / "test.csv"

# Label the columns of each subject frame with the parsed names.
test_s1.columns = columns
test_s2.columns = columns
test_s3.columns = columns
test_s4.columns = columns

In [12]:
def uniquify_columns(columns):
    """Return the labels of ``columns`` with repeats renamed so all are unique.

    The first occurrence of a label is kept unchanged. Each later occurrence
    becomes ``"<label>__dup<k>"`` with k counting up from 1. If that candidate
    is already taken (for example because the input itself contains a label
    such as ``"a__dup1"``), k keeps increasing until an unused name is found,
    so the result never contains duplicates.

    Args:
        columns: Iterable of hashable column labels (list, pandas Index, ...).

    Returns:
        list: New labels, same length and order as ``columns``.
    """
    last_suffix = {}   # label -> highest __dup suffix handed out for it so far
    taken = set()      # every name already emitted
    out = []
    for c in columns:
        name = c
        if name in taken:
            k = last_suffix.get(c, 0)
            while True:
                k += 1
                name = f"{c}__dup{k}"
                if name not in taken:
                    break
            last_suffix[c] = k
        taken.add(name)
        out.append(name)
    return out

# Rename repeated column labels on each subject frame, in place.
# Only the labels are read, so no copy of the (large) frame is needed.
for frame in (test_s1, test_s2, test_s3, test_s4):
    if frame.columns.duplicated().any():
        frame.columns = uniquify_columns(frame.columns)


In [13]:
# Column positions to retain: the first 134 columns followed by the last 7.
leading_positions = list(range(134))
trailing_positions = list(range(-7, 0))
keep = leading_positions + trailing_positions

# Apply the same positional selection to every subject frame.
test_s1, test_s2, test_s3, test_s4 = [
    frame.iloc[:, keep] for frame in (test_s1, test_s2, test_s3, test_s4)
]

In [14]:
# Stack the four subject frames row-wise; each frame keeps its own row labels.
train_df = pd.concat(
    [test_s1, test_s2, test_s3, test_s4],
    axis=0,
)

In [15]:
# Share of missing values per column over all subjects combined.
# Despite the name this is a fraction in [0, 1], so 0.1 below means 10 %.
missing_percentage = train_df.isnull().mean()
columns_to_keep = missing_percentage[missing_percentage < 0.1].index

# Take explicit copies: these frames are assigned into later on, and writing
# into the result of a column selection triggers pandas' SettingWithCopyWarning.
test_s1 = test_s1[columns_to_keep].copy()
test_s2 = test_s2[columns_to_keep].copy()
test_s3 = test_s3[columns_to_keep].copy()
test_s4 = test_s4[columns_to_keep].copy()

In [17]:
# Display the S1 frame after the missing-value column filter.
test_s1

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,InertialMeasurementUnit R-SHOE AngVelNavFrameY,InertialMeasurementUnit R-SHOE AngVelNavFrameZ,InertialMeasurementUnit R-SHOE Compass,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,87.0,975.0,-287.0,11.0,1001.0,163.0,95.0,975.0,152.0,...,20.0,42.0,175.0,0,0,0,0,0,0,0
1,33,124.0,978.0,-389.0,-7.0,1014.0,199.0,124.0,968.0,123.0,...,17.0,31.0,175.0,0,0,0,0,0,0,0
2,67,102.0,996.0,-440.0,-49.0,1024.0,193.0,127.0,1001.0,113.0,...,-27.0,15.0,175.0,0,0,0,0,0,0,0
3,100,59.0,861.0,-384.0,-9.0,1023.0,202.0,110.0,1007.0,106.0,...,-26.0,-2.0,175.0,0,0,0,0,0,0,0
4,133,119.0,946.0,-426.0,-22.0,1026.0,188.0,98.0,1001.0,92.0,...,-22.0,-7.0,175.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234656,1832015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,0,0,0,0,0,0,0
234657,1832048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,0,0,0,0,0,0,0
234658,1832082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,0,0,0,0,0,0,0
234659,1832115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,0,0,0,0,0,0,0


In [18]:
def fill_missing(frame):
    """Fill the gaps in every column of ``frame`` and return a new frame.

    Each column is linearly interpolated along the rows; whatever is still
    missing afterwards is back-filled and then forward-filled. ``.bfill()`` /
    ``.ffill()`` replace the deprecated ``fillna(method=...)`` spelling.

    NOTE(review): every column goes through the interpolation, including the
    trailing label columns - confirm those never contain NaN, otherwise
    fractional label values would be produced.

    Args:
        frame (pandas.DataFrame): Frame with numeric columns.

    Returns:
        pandas.DataFrame: Same shape and labels as ``frame``, gaps filled.
    """
    return frame.interpolate(method="linear").bfill().ffill()


test_s1 = fill_missing(test_s1)
test_s2 = fill_missing(test_s2)
test_s3 = fill_missing(test_s3)
test_s4 = fill_missing(test_s4)


In [19]:
# Standardise every column except the last seven, fitting a fresh scaler on
# each subject frame separately. `scaler` ends up bound to the one fitted last.
for frame in (test_s1, test_s2, test_s3, test_s4):
    scaler = StandardScaler()
    feature_cols = frame.columns[:-7]
    frame[feature_cols] = scaler.fit_transform(frame[feature_cols])

In [20]:
# Write one CSV per subject into OUT_DIR, without the row index.
for frame, filename in (
    (test_s1, "S1.csv"),
    (test_s2, "S2.csv"),
    (test_s3, "S3.csv"),
    (test_s4, "S4.csv"),
):
    frame.to_csv(OUT_DIR / filename, index=False)