# Daten Import

In [2]:
import pandas as pd

# Lift pandas' display limits so printed frames are never truncated:
# all columns, all rows, no fixed output width, full cell contents.
for display_option in ('display.max_columns', 'display.max_rows',
                       'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Recorded mouse session that the following cells analyse.
pfadDaten = "/Users/tuhin/Desktop/Bachelorarbeit/sapiagent/sapimouse_ownhumandata/user4/session_2024_12_30_3min.csv"
df = pd.read_csv(pfadDaten)

# Preview the first rows of the loaded events.
print(df.head())

# Chunking

In [3]:
import pandas as pd



# Order the events chronologically and measure the gap to the previous event
# (the very first event has no predecessor, so its gap is set to 0).
df = df.sort_values('client timestamp')
df['time_diff'] = df['client timestamp'].diff().fillna(0)

# A row opens a new chunk when either
#   A) more than 4000 timestamp units have passed since the previous row, or
#   B) the row itself is a 'Released' event.
# Note that the 'Released' row therefore becomes the FIRST row of the new
# chunk, not the last row of the preceding one.
gap_too_large = df['time_diff'].gt(4000)
is_release = df['state'].eq('Released')
df['new_chunk'] = gap_too_large | is_release

# Running count of chunk starts: every True bumps the id by one, so all rows
# sharing a chunk_id form one chunk.
df['chunk_id'] = df['new_chunk'].cumsum()

groups = df.groupby('chunk_id')

# Dump every chunk for inspection.
for chunk_id, group_data in groups:
    print(f"Chunk ID: {chunk_id}", group_data, "-----", sep="\n")


# Chunking Visualization

In [4]:
import plotly.express as px


# Draw one figure per chunk: the pointer path as a line through the (x, y)
# positions, with a marker on every recorded event.
for chunk_id, subset in groups:
    plot_title = f"Chunk ID: {chunk_id}"
    fig = px.line(subset, x='x', y='y', title=plot_title, markers=True)
    fig.show()


# Feature Berechnung

In [5]:
def smoothness(values):
    """Return the mean absolute difference between consecutive elements.

    Parameters
    ----------
    values : indexable sequence of numbers (anything supporting ``len`` and
        integer indexing).

    Returns
    -------
    float
        ``sum(|values[i] - values[i-1]|) / (len(values) - 1)``, or ``0.0``
        when fewer than two values are given.
    """
    n = len(values)
    if n < 2:
        # No consecutive pair exists, so there is nothing to average.
        return 0.0

    # Start the sum at 0.0 so the accumulation is done in floating point.
    step_sizes = (abs(values[i] - values[i - 1]) for i in range(1, n))
    return sum(step_sizes, 0.0) / (n - 1)



In [6]:
import pandas as pd
import numpy as np




# Conversion factor for the per-second features below. This assumes that
# 'client timestamp' is measured in milliseconds (the per-second scaling used
# throughout this cell implies it) - TODO confirm against the recorder.
MS_PER_S = 1000

rows = []

for chunk_id, group_data in groups:
    # Work on a private copy: the frames handed out by the groupby are slices
    # of the event table, and adding columns to them directly can trigger
    # pandas' SettingWithCopyWarning.
    group_data = group_data.copy()

    # Euclidean distance (px) travelled since the previous event of this chunk.
    # The first event has no predecessor inside the chunk and contributes 0.
    group_data['distance'] = (
        (group_data['x'].diff()**2 + group_data['y'].diff()**2) ** 0.5
    ).fillna(0)

    # Time step to the previous event *inside this chunk*.
    # The 'time_diff' column inherited from the chunking cell was computed over
    # the whole session, so the first row of every chunk carried the pause that
    # separates it from the previous chunk (e.g. the > 4000 gap that opened the
    # chunk). That pause was added to the chunk duration although the matching
    # distance is excluded, which inflated 'dauer' and deflated
    # 'geschwindigkeit' and 'effizienz'. Recomputing the diff per chunk leaves
    # the first row as NaN instead.
    # Zero steps are masked to NaN as well so they cannot cause a division by
    # zero in the per-row velocity.
    step = group_data['client timestamp'].diff()
    group_data['time_diff'] = step.mask(step == 0)

    # Per-row velocity in px per timestamp unit; rows without a usable time
    # step (NaN) get velocity 0.
    group_data['row_velocity'] = (
        group_data['distance'] / group_data['time_diff']
    ).fillna(0)

    # Per-row acceleration: change of velocity divided by the time step.
    # velocity is px/ms and time_diff is ms, so the quotient is px/ms^2;
    # multiplying by MS_PER_S**2 converts it to px/s^2.
    velocity_diff = group_data['row_velocity'].diff()  # v[i] - v[i-1]
    group_data['row_acceleration'] = (
        (velocity_diff / group_data['time_diff']) * MS_PER_S ** 2
    ).fillna(0)

    # Chunk-level totals (NaN time steps are skipped by the sum).
    total_time = group_data['time_diff'].sum(skipna=True)
    total_distance = group_data['distance'].sum()
    # Chunk-level average velocity = travelled distance / elapsed time.
    velocity_chunk = total_distance / total_time if total_time > 0 else 0

    # The minimum velocity is taken over moving rows only; otherwise the
    # zero-velocity first row would always be the minimum.
    nonzero_velocities = group_data.loc[group_data['row_velocity'] != 0, 'row_velocity']
    velocity_min = nonzero_velocities.min() if not nonzero_velocities.empty else 0
    velocity_max = group_data['row_velocity'].max()
    velocity_mean = group_data['row_velocity'].mean()
    # Sample variance: NaN for a chunk that consists of a single row.
    velocity_var = group_data['row_velocity'].var()

    # Chunk-level acceleration statistics (already in px/s^2).
    acc_min = group_data['row_acceleration'].min()
    acc_max = group_data['row_acceleration'].max()
    acc_mean = group_data['row_acceleration'].mean()
    acc_var = group_data['row_acceleration'].var()

    # Straight-line distance between the first and the last point of the chunk.
    first_x, first_y = group_data.iloc[0]['x'], group_data.iloc[0]['y']
    last_x, last_y   = group_data.iloc[-1]['x'], group_data.iloc[-1]['y']
    direct_distance = np.sqrt((last_x - first_x)**2 + (last_y - first_y)**2)

    # Chunk duration = sum of all time steps (kept in timestamp units).
    duration = total_time

    # Efficiency defined as straight-line distance per elapsed time.
    efficiency = direct_distance / duration if duration > 0 else 0

    # Smoothness, variant 1: mean absolute horizontal step (NaN for a
    # single-row chunk because the only diff is NaN).
    group_data['abs_delta_x'] = group_data['x'].diff().abs()
    smoothness_v1 = group_data['abs_delta_x'].mean()

    # Smoothness, variant 2: same quantity via the helper, which returns 0.0
    # instead of NaN for chunks with fewer than two rows.
    smoothness_v2 = smoothness(group_data['x'].values)

    # Collect the chunk's feature record. Velocities are converted from
    # px/ms to px/s; a variance scales with the SQUARE of the unit factor,
    # hence MS_PER_S**2 for 'geschwindigkeit_var' (it was previously scaled by
    # 1000 only, leaving it in mixed units).
    rows.append({
        'chunk_id': chunk_id,
        'geschwindigkeit': velocity_chunk * MS_PER_S,
        'geschwindigkeit_min': velocity_min * MS_PER_S,
        'geschwindigkeit_max': velocity_max * MS_PER_S,
        'geschwindigkeit_mean': velocity_mean * MS_PER_S,
        'geschwindigkeit_var': velocity_var * MS_PER_S ** 2,
        'dauer': duration,
        'direkte_distanz': direct_distance,
        'effizienz': efficiency,
        'totale_distanz': total_distance,
        'smoothness_v1': smoothness_v1,
        'smoothness_v2': smoothness_v2,
        'beschleunigung_min':  acc_min,
        'beschleunigung_max':  acc_max,
        'beschleunigung_mean': acc_mean,
        'beschleunigung_var':  acc_var,
    })

# Build the per-chunk feature table in one go from the collected records.
# NOTE: this rebinds `df` from the raw event table to the feature table, so
# the chunking cell cannot be re-run afterwards without reloading the data.
df = pd.DataFrame(rows)

# ------------------------------------------------------
# 4) Z-score and min-max scaling for every feature
# ------------------------------------------------------
# Feature columns that each receive a '_zscore' and a '_minmax' companion.
columns_to_scale = [
    'geschwindigkeit', 'geschwindigkeit_min', 'geschwindigkeit_max',
    'geschwindigkeit_mean', 'geschwindigkeit_var',
    'beschleunigung_min', 'beschleunigung_max',
    'beschleunigung_mean', 'beschleunigung_var',
    'dauer', 'direkte_distanz', 'effizienz', 'totale_distanz',
    'smoothness_v1', 'smoothness_v2'
]

# 4a) z-score: (X - mean) / std.
#     A column with zero spread would divide by zero, so it is filled with 0.
for col in columns_to_scale:
    mean_val, std_val = df[col].mean(), df[col].std()
    df[f"{col}_zscore"] = 0 if std_val == 0 else (df[col] - mean_val) / std_val

# 4b) min-max scaling: (X - min) / (max - min).
#     A constant column (max == min) is filled with 0 for the same reason.
for col in columns_to_scale:
    min_val, max_val = df[col].min(), df[col].max()
    df[f"{col}_minmax"] = (
        0 if max_val == min_val else (df[col] - min_val) / (max_val - min_val)
    )


df

In [11]:

# Label every chunk as normal (0) and discard all rows that still contain a
# missing value in any column.
df = df.assign(is_anomaly=0).dropna()

In [12]:
# Display the current contents of `df` (inspection only, nothing is modified).
df

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer

# -------------------------------
# 1. Prepare the Data
# -------------------------------

# `df` is the per-chunk feature table. 'chunk_id' (an identifier) and
# 'is_anomaly' (the label) are not features and are removed.
# NOTE(review): the remaining columns still contain the '_zscore' and
# '_minmax' variants of every raw feature. They are affine transforms of the
# raw columns, so after the StandardScaler below each feature is effectively
# present up to three times - confirm that this weighting is intended.
df_features = df.drop(columns=['chunk_id', 'is_anomaly'], errors='ignore')

# Replace missing values by the column mean.
# NOTE(review): the imputer is fitted on all rows, i.e. before the train/test
# split. If `df` had its NaN rows dropped beforehand this step changes
# nothing; otherwise test rows influence the imputed means.
imputer = SimpleImputer(strategy='mean')
df_features_imputed = pd.DataFrame(imputer.fit_transform(df_features),
                                   columns=df_features.columns)

# Feature matrix as a NumPy array.
X = df_features_imputed.values

# Ground-truth labels (0 = normal, 1 = anomaly).
y = df['is_anomaly'].values

# -------------------------------
# 2. Train/Test Split (Preserve Row Indices)
# -------------------------------
# df.index is split alongside X and y so that every test sample can be traced
# back to its row in `df`.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, df.index, test_size=0.2, random_state=42
)

# The One-Class SVM is trained on "normal" samples only (label 0).
X_train_normal = X_train[y_train == 0]

# -------------------------------
# 3. Scaling
# -------------------------------
# The scaler is fitted on the normal training samples only and then applied
# unchanged to the test set.
scaler = StandardScaler()
X_train_normal_scaled = scaler.fit_transform(X_train_normal)
X_test_scaled = scaler.transform(X_test)

# -------------------------------
# 4. Train One-Class SVM
# -------------------------------
clf = OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
clf.fit(X_train_normal_scaled)

# -------------------------------
# 5. Predict and Evaluate
# -------------------------------
y_test_pred = clf.predict(X_test_scaled)
# One-Class SVM returns +1 for normal and -1 for anomalies.
# Map +1 -> 0 (normal) and -1 -> 1 (anomaly).
y_test_pred_mapped = np.where(y_test_pred == 1, 0, 1)

# Confusion matrix with BOTH classes listed explicitly. Without `labels` the
# matrix only covers classes that actually occur in y_test / the predictions,
# so it can collapse to a 1x1 matrix when only one class is present (e.g. when
# every sample is labelled 0 and nothing is flagged as an anomaly).
cm = confusion_matrix(y_test, y_test_pred_mapped, labels=[0, 1])
print("Confusion Matrix:")
print(cm)

# Boolean masks over the test set, one per outcome category.
tn_idx = (y_test == 0) & (y_test_pred_mapped == 0)  # True Normal
fp_idx = (y_test == 0) & (y_test_pred_mapped == 1)  # False Anomaly (Normal flagged as anomaly)
fn_idx = (y_test == 1) & (y_test_pred_mapped == 0)  # False Normal (Anomaly flagged as normal)
tp_idx = (y_test == 1) & (y_test_pred_mapped == 1)  # True Anomaly

print("\nCounts:")
print("True Normal (TN):", np.sum(tn_idx))
print("False Anomaly (FP):", np.sum(fp_idx))
print("False Normal (FN):", np.sum(fn_idx))
print("True Anomaly (TP):", np.sum(tp_idx))

# -------------------------------
# 6. Retrieve Original Row Numbers for Each Category
# -------------------------------
# idx_test holds the `df` index labels of the test samples, in the same order
# as y_test, so the masks above can be applied to it directly.
tn_rows = idx_test[tn_idx]
fp_rows = idx_test[fp_idx]
fn_rows = idx_test[fn_idx]
tp_rows = idx_test[tp_idx]

print("\nRow Numbers:")
print("True Normal (TN) row indices:", tn_rows.tolist())
print("False Anomaly (FP) row indices:", fp_rows.tolist())
print("False Normal (FN) row indices:", fn_rows.tolist())
print("True Anomaly (TP) row indices:", tp_rows.tolist())


# Bot Daten in richtiges Format bringen

In [18]:
import os
import json
import csv
from datetime import datetime

# Folder containing the recorded JSON sessions.
json_folder_path = '/Users/tuhin/Desktop/Bachelorarbeit/sapiagent/feature_berechnung_ownhuman/seleniumTestdaten'
# Folder that receives the converted CSV files (one sub-folder per session).
csv_output_folder_path = '/Users/tuhin/Desktop/Bachelorarbeit/sapiagent/feature_berechnung_ownhuman/bot_nutzbar'

# Create the output folder if it does not exist yet.
os.makedirs(csv_output_folder_path, exist_ok=True)

# Running number used for the 'user<N>' output folders.
user_counter = 1

# Counters for the summary printed at the end.
anzahkSkips = 0   # sessions skipped because the user agent is Android/iPhone
anzahlPC = 0      # sessions whose user agent mentions Windows or Macintosh

# Translation tables from the JSON vocabulary to the CSV vocabulary.
# Button codes other than 1 and 2 become 'NoButton'; event types that are not
# listed are written unchanged.
button_names = {1: 'LeftButton', 2: 'RightButton'}
event_type_names = {'mousemove': 'Move', 'mousedown': 'Pressed', 'mouseup': 'Released'}

# os.listdir returns the entries in arbitrary order; sorting makes the
# file -> 'user<N>' assignment reproducible between runs and machines.
for filename in sorted(os.listdir(json_folder_path)):
    if not filename.endswith('.json'):
        continue

    json_file_path = os.path.join(json_folder_path, filename)

    # Read the JSON session (JSON text is UTF-8, independent of the
    # platform's default encoding).
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Skip sessions whose user agent is Android or iPhone.
    if "Android" in data["userAgent"] or "iPhone" in data["userAgent"]:
        anzahkSkips += 1
        continue

    if "Windows" in data["userAgent"] or "Macintosh" in data["userAgent"]:
        anzahlPC += 1

    # A session without any mouse event has no start timestamp and nothing to
    # convert; skip it instead of failing with an IndexError.
    mouse_events = data['mouseEvents']
    if not mouse_events:
        print(f"Keine Mouse events in {filename} - Datei wird übersprungen.")
        continue

    # The first timestamp is the reference point for the relative timestamps.
    start_timestamp = mouse_events[0]['timestamp']

    # The date is expected as the second '_'-separated part of the file name,
    # formatted as YYYYMMDD. It is parsed before any folder is created so a
    # malformed name does not leave an empty user folder behind.
    date_str = filename.split('_')[1]
    date_obj = datetime.strptime(date_str, '%Y%m%d')
    formatted_date = date_obj.strftime('%Y_%m_%d')

    # Create the per-session user folder.
    user_folder = os.path.join(csv_output_folder_path, f'user{user_counter}')
    os.makedirs(user_folder, exist_ok=True)

    # Path of the output CSV file.
    csv_file_name = f'session_{formatted_date}_3min.csv'
    csv_file_path = os.path.join(user_folder, csv_file_name)

    # Write the CSV file.
    with open(csv_file_path, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Header row.
        csv_writer.writerow(['client timestamp', 'button', 'state', 'x', 'y'])

        # Convert every mouse event into one CSV row.
        for event in mouse_events:
            relative_timestamp = event['timestamp'] - start_timestamp
            button_state = button_names.get(event['button'], 'NoButton')
            event_type = event_type_names.get(event['type'], event['type'])

            csv_writer.writerow([relative_timestamp, button_state, event_type, event['x'], event['y']])

    # Next session gets the next user number.
    user_counter += 1

print(f"Anzahl der Skips: {anzahkSkips}")
print(f"Anzahl der PC: {anzahlPC}")
print("Alle Mouse events wurden erfolgreich konvertiert und gespeichert.")

In [19]:
# List the column labels of the current `df` (inspection only).
df.columns

Index(['chunk_id', 'geschwindigkeit', 'geschwindigkeit_min',
       'geschwindigkeit_max', 'geschwindigkeit_mean', 'geschwindigkeit_var',
       'dauer', 'direkte_distanz', 'effizienz', 'totale_distanz',
       'smoothness_v1', 'smoothness_v2', 'beschleunigung_min',
       'beschleunigung_max', 'beschleunigung_mean', 'beschleunigung_var',
       'geschwindigkeit_zscore', 'geschwindigkeit_min_zscore',
       'geschwindigkeit_max_zscore', 'geschwindigkeit_mean_zscore',
       'geschwindigkeit_var_zscore', 'beschleunigung_min_zscore',
       'beschleunigung_max_zscore', 'beschleunigung_mean_zscore',
       'beschleunigung_var_zscore', 'dauer_zscore', 'direkte_distanz_zscore',
       'effizienz_zscore', 'totale_distanz_zscore', 'smoothness_v1_zscore',
       'smoothness_v2_zscore', 'geschwindigkeit_minmax',
       'geschwindigkeit_min_minmax', 'geschwindigkeit_max_minmax',
       'geschwindigkeit_mean_minmax', 'geschwindigkeit_var_minmax',
       'beschleunigung_min_minmax', 'beschleunigun

In [None]:
# Column layout of the feature table, kept here for reference.
# NOTE(review): this cell looks like a pasted copy of a `df.columns` output.
# The bare name `Index` is not imported anywhere in the visible notebook, so
# the cell raised a NameError when executed; it now uses `pd.Index` and binds
# the result to a name so it can be compared against `df.columns`.
expected_columns = pd.Index(['chunk_id', 'geschwindigkeit', 'geschwindigkeit_min',
       'geschwindigkeit_max', 'geschwindigkeit_mean', 'geschwindigkeit_var',
       'dauer', 'direkte_distanz', 'effizienz', 'totale_distanz',
       'smoothness_v1', 'smoothness_v2', 'beschleunigung_min',
       'beschleunigung_max', 'beschleunigung_mean', 'beschleunigung_var',
       'geschwindigkeit_zscore', 'geschwindigkeit_min_zscore',
       'geschwindigkeit_max_zscore', 'geschwindigkeit_mean_zscore',
       'geschwindigkeit_var_zscore', 'beschleunigung_min_zscore',
       'beschleunigung_max_zscore', 'beschleunigung_mean_zscore',
       'beschleunigung_var_zscore', 'dauer_zscore', 'direkte_distanz_zscore',
       'effizienz_zscore', 'totale_distanz_zscore', 'smoothness_v1_zscore',
       'smoothness_v2_zscore', 'geschwindigkeit_minmax',
       'geschwindigkeit_min_minmax', 'geschwindigkeit_max_minmax',
       'geschwindigkeit_mean_minmax', 'geschwindigkeit_var_minmax',
       'beschleunigung_min_minmax', 'beschleunigung_max_minmax',
       'beschleunigung_mean_minmax', 'beschleunigung_var_minmax',
       'dauer_minmax', 'direkte_distanz_minmax', 'effizienz_minmax',
       'totale_distanz_minmax', 'smoothness_v1_minmax', 'smoothness_v2_minmax',
       'is_anomaly'],
      dtype='object')
expected_columns