In [1]:
import pandas as pd
import os

def load_and_label_data(folder_path, filenames):
    data_frames = []
    for filename in filenames:
        file_path = os.path.join(folder_path, filename)
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}. Skipping.")
            continue
        
        df = pd.read_csv(file_path)
        activity_name = os.path.splitext(os.path.basename(file_path))[0]
        df['activity'] = activity_name
        data_frames.append(df)
        
    return data_frames

def combine_and_clean_data(data_frames):
    combined_df = pd.concat(data_frames, ignore_index=True)
    combined_df.interpolate(method='linear', inplace=True, limit_direction='both')
    return combined_df

def segment_data_by_activity(df):
    return {activity: group for activity, group in df.groupby('activity')}

def main():
    folder_path = r"C:\Users\lahit\Downloads\wetransfer_implementation_2024-02-07_1029 (1)\dataset\dataset"
    
    filenames = [
        'walking.csv', 'sitting.csv', 'running.csv', 'lying.csv', 
        'jumping.csv', 'climbing_up.csv', 'climbing_down.csv'
    ]
    
    all_data_frames = load_and_label_data(folder_path, filenames)
    
    if not all_data_frames:
        print("No data was loaded. Exiting.")
        return

    combined_data = combine_and_clean_data(all_data_frames)
    
    activity_segments = segment_data_by_activity(combined_data)
    
    print("--- Sampled Segment: walking ---")
    if 'walking' in activity_segments:
        print(activity_segments['walking'].head())

    output_filename = 'combined_cleaned_data_modular.csv'
    combined_data.to_csv(output_filename, index=False)
    
    print(f"\n✅ Modular script execution complete.")
    print(f"Combined and cleaned data saved to '{output_filename}'")

if __name__ == "__main__":
    main()

  combined_df.interpolate(method='linear', inplace=True, limit_direction='both')


--- Sampled Segment: walking ---
   id     attr_time    attr_x    attr_y    attr_z activity  Unnamed: 4
0   1  1.435993e+12 -2.160767  9.400234  0.565032  walking    0.754174
1   2  1.435993e+12 -2.176928  9.395446  0.621295  walking    0.754174
2   3  1.435993e+12 -2.151190  9.382876  0.588974  walking    0.754174
3   4  1.435993e+12 -2.135029  9.319430  0.545878  walking    0.754174
4   5  1.435993e+12 -2.168548  9.306262  0.586579  walking    0.754174

✅ Modular script execution complete.
Combined and cleaned data saved to 'combined_cleaned_data_modular.csv'


In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Ensure the combined CSV file from your first script exists and is loaded
try:
    df = pd.read_csv('combined_cleaned_data_modular.csv')
except FileNotFoundError:
    print("Error: 'combined_cleaned_data_modular.csv' not found.")
    print("Please run the first data processing script to generate it.")
    # In a real script, you might exit here. In a notebook, just ensure the file exists.

# Define features and target
features = ['attr_x', 'attr_y', 'attr_z']
target = 'activity'

X = df[features].values
y = df[target]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"✅ Data prepared successfully. Shape of feature matrix: {X_scaled.shape}")

✅ Data prepared successfully. Shape of feature matrix: (389129, 3)


In [1]:
!pip install kmapper scikit-learn numpy pandas

import pandas as pd
import numpy as np
import kmapper as km
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split

sample_size = 30000
X_sample, _, y_sample, _ = train_test_split(
    X_scaled, y,
    train_size=sample_size,
    random_state=42,
    stratify=y
)

print(f"Original data shape: {X_scaled.shape}")
print(f"Using a subsample of shape: {X_sample.shape}")

mapper = km.KeplerMapper(verbose=1)

lens = mapper.project(
    X_sample,
    projection=PCA(n_components=2),
    scaler=None
)

mapper_graph = mapper.map(
    lens=lens,
    X=X_sample,
    cover=km.Cover(n_cubes=15, perc_overlap=0.4),
    clusterer=DBSCAN(eps=0.5, min_samples=5)
)

print("\n✅ Mapper graph constructed successfully on the subsample.")

activity_names = y_sample.unique()
activity_to_int = {name: i for i, name in enumerate(activity_names)}
color_values = y_sample.map(activity_to_int).values

mapper.visualize(
    mapper_graph,
    title="Topological Analysis of Human Activities (on Sampled Data)",
    path_html="human_activity_mapper_sampled.html",
    custom_tooltips=y_sample.values,
    color_values=color_values,
    color_function_name="Activity"
)

print("\n✅ Visualization complete!")
print("Please open 'human_activity_mapper_sampled.html' to explore the graph.")



NameError: name 'X_scaled' is not defined