# Daten Import

In [1]:
import pandas as pd

pfadDaten = "/Users/tuhin/Desktop/Bachelorarbeit/sapiagent/sapimouse_ownhumandata/user4/session_2024_12_30_3min.csv"

df = pd.read_csv(pfadDaten)

print(df.head())

   client timestamp    button state     x    y
0                 0  NoButton  Move  1213  656
1                 5  NoButton  Move  1213  655
2                11  NoButton  Move  1214  654
3                17  NoButton  Move  1215  652
4                25  NoButton  Move  1215  651


# Chunking

In [4]:
import pandas as pd



# 1. Sort by timestamp if not sorted
df = df.sort_values('client timestamp')

# 2. Compute the time difference to the previous row
df['time_diff'] = df['client timestamp'].diff().fillna(0)

# 3. Define a "new_chunk" marker where conditions are met
#    Condition A: time gap > 4000
#    Condition B: state == 'Released'
df['new_chunk'] = (
    (df['time_diff'] > 4000)    # large gap
    | (df['state'] == 'Released')  # or row is 'Released'
)

# 4. Convert that boolean into a cumulative sum
#    Each True increments the chunk ID
df['chunk_id'] = df['new_chunk'].cumsum()

# 5. (Optional) If you prefer the row with Released to be 
#    included in the preceding chunk rather than marking 
#    the start of the new chunk, you can adjust the logic as needed. 
#    For example, you might shift the condition or handle it differently.
#    But in this version, chunk_id changes on the same row that has "Released".

# Now each chunk is all rows that have the same chunk_id:
# For example, group by chunk_id:
groups = df.groupby('chunk_id')

for chunk_id, group_data in groups:
    print(f"Chunk ID: {chunk_id}")
    print(group_data)
    print("-----")


Chunk ID: 0
     client timestamp    button state     x    y  time_diff  new_chunk  \
0                   0  NoButton  Move  1213  656        0.0      False   
1                   5  NoButton  Move  1213  655        5.0      False   
2                  11  NoButton  Move  1214  654        6.0      False   
3                  17  NoButton  Move  1215  652        6.0      False   
4                  25  NoButton  Move  1215  651        8.0      False   
..                ...       ...   ...   ...  ...        ...        ...   
136              2908  NoButton  Move   683  141       19.0      False   
137              2927  NoButton  Move   683  140       19.0      False   
138              2934  NoButton  Move   682  139        7.0      False   
139              3650  NoButton  Move   682  140      716.0      False   
140              3799  NoButton  Move   682  141      149.0      False   

     chunk_id  
0           0  
1           0  
2           0  
3           0  
4           0  
.. 

# Chunking Visualization

In [13]:
import plotly.express as px


for chunk_id, subset in groups:
    fig = px.line(
        subset,
        x='x',
        y='y',
        markers=True,
        title=f"Chunk ID: {chunk_id}"
    )
    fig.show()


# Feature Berechnung

In [12]:
import pandas as pd
import numpy as np


rows = []

for chunk_id, group_data in groups:
    # Calculate distance between consecutive points
    group_data['distance'] = (
        (group_data['x'].diff()**2 + group_data['y'].diff()**2) ** 0.5
    ).fillna(0)
    
    # Calculate time_diff between consecutive points; assumed to be in group_data already
    # group_data['time_diff'] = ...
    
    # Calculate per-row velocity to get min and max
    # Avoid division by zero by replacing 0 with NaN or a small number if needed
    # Here, we’ll just replace time_diff == 0 with NaN:
    group_data.loc[group_data['time_diff'] == 0, 'time_diff'] = float('nan')
    group_data['row_velocity'] = group_data['distance'] / group_data['time_diff']

    # Replace NaN back to 0 for velocity, if desired
    group_data['row_velocity'] = group_data['row_velocity'].fillna(0)

    # Chunk-level total time
    total_time = group_data['time_diff'].sum(skipna=True)
    # Chunk-level total distance
    total_distance = group_data['distance'].sum()
    # Chunk-level average velocity (distance / time)
    velocity_chunk = total_distance / total_time if total_time > 0 else 0

    # Filter out rows where row_velocity is 0
    nonzero_velocities = group_data.loc[group_data['row_velocity'] != 0, 'row_velocity']

    # Compute the min from the non-zero velocities
    velocity_min = nonzero_velocities.min() if not nonzero_velocities.empty else 0

    velocity_max = group_data['row_velocity'].max()


    # Direkte Distanz zwischen erstem und letztem Punkt
    first_x, first_y = group_data.iloc[0]['x'], group_data.iloc[0]['y']
    last_x, last_y   = group_data.iloc[-1]['x'], group_data.iloc[-1]['y']
    direct_distance = np.sqrt((last_x - first_x)**2 + (last_y - first_y)**2)

    # Beispiel-Dauer (Summe aller time_diff)
    duration = total_time  # kann nach Bedarf in andere Einheiten (Min / Std) umgerechnet werden

    # Effizienz als direkte Distanz / Gesamtzeit
    # (falls du eine andere Definition für „Effizienz“ brauchst, entsprechend anpassen)
    efficiency = direct_distance / duration if duration > 0 else 0

    # Werte in das rows-Dictionary übernehmen
    rows.append({
        'chunk_id': chunk_id,
        'geschwindigkeit': velocity_chunk * 1000,          # Beispiel: Umrechnung in andere Einheit
        'geschwindigkeit_min': velocity_min * 1000,
        'geschwindigkeit_max': velocity_max * 1000,
        'dauer': duration,
        'direkte_distanz': direct_distance,
        'effizienz': efficiency,
        'totale_distanz': total_distance
    })

# Finally, build your DataFrame from the rows
df = pd.DataFrame(rows)

df


Unnamed: 0,chunk_id,geschwindigkeit,geschwindigkeit_min,geschwindigkeit_max,dauer,direkte_distanz,effizienz,totale_distanz
0,0,210.504109,0.689655,3655.285367,3799.0,739.720217,0.194714,799.70511
1,1,15.12593,76.923077,714.285714,10628.0,157.835357,0.014851,160.75838
2,2,124.894964,0.433463,3100.179206,10852.0,179.902752,0.016578,1355.360151
3,3,0.0,0.0,0.0,139.0,0.0,0.0,0.0
4,4,44.149095,0.473037,4336.537277,25376.0,486.173837,0.019159,1120.327429
5,5,34.577562,28.571429,4252.058325,27963.0,759.168624,0.027149,966.89237
6,6,418.36865,2.666667,3833.333333,1567.0,497.001006,0.317167,655.583674
7,7,0.0,0.0,0.0,101.0,0.0,0.0,0.0
8,8,40.765048,4.587156,1000.0,560.0,14.142136,0.025254,22.828427
9,9,375.290866,29.411765,1343.709625,441.0,162.012345,0.367375,165.503272
