In [1]:
import os 
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_index(string):
    """
    Locate the first alphabetic character at or beyond position 9.

    Args:
    string (str): Text to scan.

    Returns:
    int: Position of the first character from index 9 onward for which
        ``isalpha()`` is true, or -1 when no such character exists
        (including when the text is shorter than 10 characters).
    """

    start = 9
    # Scan only the tail and translate the offset back to an absolute position.
    return next(
        (start + offset for offset, char in enumerate(string[start:]) if char.isalpha()),
        -1,
    )

def extract_data(data_dir='OpportunityUCIDataset/dataset'):
    """
    Load every non-Drill ``.dat`` file found in ``data_dir`` into one DataFrame.

    Column names come from ``column_names.txt`` in the same folder: every line
    containing ``'Column'`` contributes the text that starts at the position
    returned by ``get_index`` and ends before the first ``';'``.

    Args:
    data_dir (str): Folder holding the ``.dat`` files and ``column_names.txt``.
        Defaults to the path that was previously hard-coded.

    Returns:
    pandas.DataFrame: Rows of all selected files stacked in sorted file-name
        order with a fresh 0..n-1 index. When no file qualifies, an empty
        frame that still carries the parsed column names.
    """

    # Keep only .dat files that are not Drill runs. Sorting makes the row order
    # reproducible, because os.listdir returns entries in arbitrary order.
    list_of_files = sorted(
        f for f in os.listdir(data_dir)
        if f.endswith('.dat') and 'Drill' not in f
    )

    # Read column names from column_names.txt
    columns = []
    with open(os.path.join(data_dir, "column_names.txt"), 'r') as f:
        for line in f.read().splitlines():
            if 'Column' in line:
                columns.append(line[get_index(line):].split(";")[0])

    # Collect the per-file frames and concatenate once at the end: growing a
    # frame with concat inside the loop re-copies all previous rows each time,
    # and seeding it with an empty object-dtype frame can influence the dtypes
    # of the result.
    frames = []
    for file in list_of_files:
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        proc_data = pd.read_table(os.path.join(data_dir, file), header=None, sep=r'\s+')
        proc_data.columns = columns
        frames.append(proc_data)

    if not frames:
        # Nothing to stack; preserve the original "empty frame with columns" result.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)
# Load all non-Drill recordings into a single frame (this performs the file I/O
# done inside extract_data).
data_collection = extract_data()

# Bare last expression so the notebook renders the frame as the cell output.
data_collection

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,87.0,975.0,-287.0,11.0,1001.0,163.0,95.0,975.0,152.0,...,5789.0,2907.0,1447.0,0,0,0,0,0,0,0
1,33,124.0,978.0,-389.0,-7.0,1014.0,199.0,124.0,968.0,123.0,...,5789.0,2908.0,1443.0,0,0,0,0,0,0,0
2,67,102.0,996.0,-440.0,-49.0,1024.0,193.0,127.0,1001.0,113.0,...,5789.0,2910.0,1440.0,0,0,0,0,0,0,0
3,100,59.0,861.0,-384.0,-9.0,1023.0,202.0,110.0,1007.0,106.0,...,5789.0,2912.0,1440.0,0,0,0,0,0,0,0
4,133,119.0,946.0,-426.0,-22.0,1026.0,188.0,98.0,1001.0,92.0,...,5791.0,2915.0,1442.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644630,1017390,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644631,1017423,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644632,1017456,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644633,1017490,,,,,,,,,,...,,,,0,0,0,0,0,0,0


In [3]:
# Build the missing-value mask once and reuse it for both summaries.
_na_mask = data_collection.isna()

# Percentage of missing entries per column, highest first.
na_counts = _na_mask.mean().sort_values(ascending=False) * 100
print(na_counts)

# Percentage of missing entries per row, highest first.
row_na_counts = _na_mask.mean(axis=1).sort_values(ascending=False) * 100
print(row_na_counts)

# The mask is as large as the data itself; do not keep it alive in the kernel.
del _na_mask

Accelerometer RH accZ    44.244262
Accelerometer RH accY    44.244262
Accelerometer RH accX    44.244262
Accelerometer LH accZ    21.553592
Accelerometer LH accY    21.553592
                           ...    
LL_Left_Arm               0.000000
LL_Left_Arm_Object        0.000000
LL_Right_Arm              0.000000
LL_Right_Arm_Object       0.000000
ML_Both_Arms              0.000000
Length: 250, dtype: float64
416526    95.6
416368    95.6
416369    95.6
416370    95.6
416371    95.6
          ... 
247227     0.0
247228     0.0
247229     0.0
247230     0.0
247231     0.0
Length: 644635, dtype: float64


In [11]:
def uniquify_columns(columns):
    """
    Return the column labels with repeated names made unique.

    The first occurrence of a name is kept unchanged. Each later occurrence
    becomes ``"<name>__dup<k>"`` with ``k`` counting up from 1 for that name.
    A suffix is skipped when the resulting label is already in use, either as
    one of the input labels or as a previously generated one, so the returned
    labels are always unique.

    Args:
    columns (iterable): Hashable column labels, in order.

    Returns:
    list: New labels, same length and order as the input.
    """
    columns = list(columns)  # materialise: the labels are traversed twice

    # Every original label is reserved up front so that a generated name can
    # never collide with a column that literally carries that name
    # (e.g. ["a", "a", "a__dup1"]).
    taken = set(columns)
    seen = {}
    out = []
    for c in columns:
        if c not in seen:
            seen[c] = 0
            out.append(c)
            continue

        # Advance this name's counter until the candidate label is free.
        while True:
            seen[c] += 1
            candidate = f"{c}__dup{seen[c]}"
            if candidate not in taken:
                break
        taken.add(candidate)
        out.append(candidate)
    return out

# Always bind `df`, even when every column name is already unique. If it were
# only created inside the duplicate check, later cells that read `df` would
# raise NameError on a fresh kernel (or silently reuse a stale `df`).
df = data_collection.copy()
if df.columns.duplicated().any():
    df.columns = uniquify_columns(df.columns)


In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# 1) Prune sparse data: keep the columns, and then the rows, whose fraction of
#    missing values is at most `threshold`.
threshold = 0.10
df_pruned = df.loc[:, df.isna().mean(axis=0).le(threshold)]
df_pruned = df_pruned.loc[df_pruned.isna().mean(axis=1).le(threshold)]

# No target is separated here; every remaining column is treated as a feature.
# To split one off, pop it from df_pruned first, e.g. y = df_pruned.pop("target").
X = df_pruned.copy()

# 2) Partition the columns by dtype so each group gets its own imputation rule.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_pipe = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
categorical_pipe = Pipeline(steps=[("imputer", SimpleImputer(strategy="most_frequent"))])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)

# 3) Split first, then learn the imputation statistics from the training part only.
# NOTE(review): rows are shuffled at random; if they are consecutive sensor
# samples, neighbouring samples end up in both splits - confirm this is intended.
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

preprocess.fit(X_train)

# 4) Wrap the transformed arrays back into frames: numeric columns first, then
#    categorical, keeping each split's original row index.
X_train_imp, X_test_imp = (
    pd.DataFrame(preprocess.transform(part), columns=num_cols + cat_cols, index=part.index)
    for part in (X_train, X_test)
)

# X_train_imp / X_test_imp are clean and imputed


In [13]:
# Persist the imputed splits as CSV files under relative paths (resolved against
# the current working directory); index=False leaves the row index out.
X_train_imp.to_csv('train.csv', index=False)
X_test_imp.to_csv('test.csv', index=False)