In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [77]:
# Load the data
pivot_data4 = pd.read_csv('/Users/emilkoch/Desktop/2Tango/messenger/Research/Weeks/Week_14/pivot_data4.csv')

# Extract relevant sensor data
sensor_data = pivot_data4[['accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z']]

# Standardize the data
scaler = StandardScaler()
sensor_data_scaled = scaler.fit_transform(sensor_data)

# Calculate WCSS for different number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(sensor_data_scaled)
    wcss.append(kmeans.inertia_)
    print(f"Number of clusters: {i}, WCSS: {kmeans.inertia_}")

# Calculate the differences in WCSS
differences = np.diff(wcss)

# Find the elbow point (optimal number of clusters)
elbow_point = np.argmax(differences) + 1
print("Optimal number of clusters (Elbow point):", elbow_point)

from sklearn.cluster import KMeans

# Initialize KMeans with 5 clusters
n_clusters = elbow_point
kmeans = KMeans(n_clusters=n_clusters, random_state=0)

# Fit KMeans to your scaled sensor data
cluster_labels = kmeans.fit_predict(sensor_data_scaled)

# Convert standardized sensor data and cluster labels to a DataFrame
sensor_data_with_labels = pd.DataFrame(sensor_data_scaled, columns=sensor_data.columns)
sensor_data_with_labels['cluster_label'] = cluster_labels
sensor_data = sensor_data_with_labels  # Use the DataFrame with cluster labels

sensor_data['timestamp'] = pivot_data4['timestamp']

# Initialize a new column for duration
sensor_data['duration'] = 0

# Initialize variables to store previous cluster label and its timestamp
prev_cluster_label = None
earliest_timestamp = None

# Iterate through each row in the DataFrame
for index, row in sensor_data.iterrows():
    # Check if the cluster label changes compared to the previous row
    if row['cluster_label'] != prev_cluster_label:
        # If yes, calculate the duration
        if earliest_timestamp is not None:
            duration = abs(row['timestamp'] - earliest_timestamp)  # Calculate the duration (ensure positive)
            # Update the duration for the previous row
            sensor_data.at[index - 1, 'duration'] = duration
        # Update the previous cluster label and its timestamp
        prev_cluster_label = row['cluster_label']
        earliest_timestamp = row['timestamp']

# For the last segment, calculate the duration
if earliest_timestamp is not None:
    last_index = sensor_data.index[-1]
    duration = abs(sensor_data.at[last_index, 'timestamp'] - earliest_timestamp)
    sensor_data.at[last_index, 'duration'] = duration

Number of clusters: 1, WCSS: 1041972.0000007446
Number of clusters: 2, WCSS: 857386.5847300735
Number of clusters: 3, WCSS: 751123.6473178607
Number of clusters: 4, WCSS: 659282.7080355815
Number of clusters: 5, WCSS: 575146.6050043418
Number of clusters: 6, WCSS: 493105.5296016998
Number of clusters: 7, WCSS: 437383.76971968287
Number of clusters: 8, WCSS: 404343.82742491656
Number of clusters: 9, WCSS: 378110.5549230031
Number of clusters: 10, WCSS: 347198.8808746446
Optimal number of clusters (Elbow point): 8


  super()._check_params_vs_input(X, default_n_init=10)


In [86]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model, save_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.layers import Dense, Dropout, BatchNormalization
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM


# Split the data into features (X) and target variable (y)
X = sensor_data[['accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z', 'timestamp', 'duration']]
y = sensor_data['cluster_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define the model
model = Sequential([
    Dense(64, activation='relu', kernel_regularizer=regularizers.l1(0.001), input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    BatchNormalization(),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l1(0.001)),  # Removed extra parenthesis here
    Dropout(0.5),
    BatchNormalization(),
    Dense(len(np.unique(y)), activation='softmax') 
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Encode class labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Compute class weights using the encoded labels
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)

# Create a class weight dictionary
class_weight_dict = dict(zip(le.transform(le.classes_), class_weights))

# Train the model with class weights and early stopping
history = model.fit(X_train, y_train_encoded, epochs=15, validation_data=(X_val, y_val), class_weight=class_weight_dict)

# Save the model
file_path = "/Users/emilkoch/Desktop/2Tango/messenger/research/Model_Save/cluster.keras"
save_model(model, file_path)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)

# Make predictions
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Calculate precision, recall, and F1 score
report = classification_report(y_test, y_pred)
print(report)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Accuracy: 0.8848357200622559
              precision    recall  f1-score   support

           0       0.95      0.94      0.94     32806
           1       0.71      0.01      0.02       588
           2       0.00      0.00      0.00        91
           3       0.00      0.00      0.00       515
           4       0.89      0.08      0.15        99
           5       0.00      0.00      0.00       538
           6       0.01      0.27      0.02        77
           7       0.16      0.16      0.16        19

    accuracy                           0.88     34733
   macro avg       0.34      0.18      0.16     34733
weighted avg       0.91      0.88      0.89     34733



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# Initialize a new column for duration
sensor_data['duration'] = 0

# Initialize variables to store previous cluster label and its timestamp
prev_cluster_label = None
earliest_timestamp = None

# Iterate through each row in the DataFrame
for index, row in sensor_data.iterrows():
    # Check if the cluster label changes compared to the previous row
    if row['cluster_label'] != prev_cluster_label:
        # If yes, calculate the duration
        if earliest_timestamp is not None:
            duration = abs(row['timestamp'] - earliest_timestamp)  # Calculate the duration (ensure positive)
            # Update the duration for the previous row
            sensor_data.at[index - 1, 'duration'] = duration
        # Update the previous cluster label and its timestamp
        prev_cluster_label = row['cluster_label']
        earliest_timestamp = row['timestamp']

# For the last segment, calculate the duration
if earliest_timestamp is not None:
    last_index = sensor_data.index[-1]
    duration = abs(sensor_data.at[last_index, 'timestamp'] - earliest_timestamp)
    sensor_data.at[last_index, 'duration'] = duration

In [40]:
# Calculate the mean duration
mean_duration = sensor_data['duration'].mean()

# Define function to map duration values to descriptions
def map_duration(duration):
    if duration > mean_duration:
        return 'long'
    elif duration < mean_duration:
        return 'short'
    else:
        return 'moderate'

# Apply the function row-wise to create the new column 'duration_name'
sensor_data['duration_name'] = sensor_data['duration'].apply(map_duration)

In [41]:
mean_accel_x = sensor_data['accel_x'].mean()
mean_accel_y = sensor_data['accel_y'].mean()
mean_accel_z = sensor_data['accel_z'].mean()

mean_gyro_x = sensor_data['gyro_x'].mean()
mean_gyro_y = sensor_data['gyro_y'].mean()
mean_gyro_z = sensor_data['gyro_z'].mean()

# Define function to map duration values to descriptions
def map_accel(accel_x):
    if accel_x > mean_accel_x:
        return 'high'
    elif accel_x < mean_accel_x:
        return 'low'
    else:
        return 'medium'

# Apply the function row-wise to create the new column 'duration_name'
sensor_data['accel_x_name'] = sensor_data['accel_x'].apply(map_accel)

In [42]:
# Define function to map duration values to descriptions
def map_accely(accel_y):
    if accel_y > mean_accel_y:
        return 'high'
    elif accel_y < mean_accel_y:
        return 'low'
    else:
        return 'medium'

# Apply the function row-wise to create the new column 'duration_name'
sensor_data['accel_y_name'] = sensor_data['accel_y'].apply(map_accely)

In [43]:
# Define function to map duration values to descriptions
def map_accelz(accel_z):
    if accel_z > mean_accel_z:
        return 'high'
    elif accel_z < mean_accel_z:
        return 'low'
    else:
        return 'medium'

# Apply the function row-wise to create the new column 'duration_name'
sensor_data['accel_z_name'] = sensor_data['accel_z'].apply(map_accelz)

In [44]:
# Define function to map duration values to descriptions
def map_gyrox(gyro_x):
    if gyro_x > mean_gyro_x:
        return 'high'
    elif gyro_x < mean_gyro_x:
        return 'low'
    else:
        return 'medium'

# Apply the function row-wise to create the new column 'duration_name'
sensor_data['gyro_x_name'] = sensor_data['gyro_x'].apply(map_gyrox)

In [45]:
# Define function to map duration values to descriptions
def map_gyroy(gyro_y):
    if gyro_y > mean_gyro_y:
        return 'high'
    elif gyro_y < mean_gyro_y:
        return 'low'
    else:
        return 'medium'

# Apply the function row-wise to create the new column 'duration_name'
sensor_data['gyro_y_name'] = sensor_data['gyro_y'].apply(map_gyroy)

In [46]:
# Define function to map duration values to descriptions
def map_gyroz(gyro_z):
    if gyro_z > mean_gyro_z:
        return 'high'
    elif gyro_z < mean_gyro_z:
        return 'low'
    else:
        return 'medium'

# Apply the function row-wise to create the new column 'duration_name'
sensor_data['gyro_z_name'] = sensor_data['gyro_z'].apply(map_gyroz)

In [47]:
sensor_data

Unnamed: 0,accel_x,accel_y,accel_z,gyro_x,gyro_y,gyro_z,cluster_label,timestamp,duration,duration_name,accel_x_name,accel_y_name,accel_z_name,gyro_x_name,gyro_y_name,gyro_z_name
0,0.112622,0.164773,-0.047704,0.008060,-0.000313,-0.003003,0,465,1,long,high,high,low,high,low,low
1,-0.123533,0.675206,8.227909,0.008060,-0.000313,-0.003003,4,466,0,short,low,high,high,high,low,low
2,0.112622,0.164773,-0.047704,0.008060,-0.000313,-0.003003,0,466,0,short,high,high,low,high,low,low
3,0.112622,0.164773,-0.047704,0.008060,-0.000313,-0.003003,0,466,0,short,high,high,low,high,low,low
4,0.112622,0.164773,-0.047704,0.030278,-0.081585,0.026196,0,466,0,short,high,high,low,high,low,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173657,0.112622,0.164773,-0.047704,0.008060,-0.000313,-0.003003,0,1221,0,short,high,high,low,high,low,low
173658,0.112622,0.164773,-0.047704,0.008060,-0.000313,-0.003003,0,1221,0,short,high,high,low,high,low,low
173659,0.112622,0.164773,-0.047704,0.008060,-0.000313,-0.003003,0,1221,0,short,high,high,low,high,low,low
173660,0.112622,0.164773,-0.047704,0.008060,-0.000313,-0.003003,0,1221,0,short,high,high,low,high,low,low


In [48]:
# Initialize an empty list to store the text data
text_data = []

# Iterate over the rows of sensor_data
for index, row in sensor_data.iterrows():
    # Extract the cluster label
    cluster_label = row['cluster_label']
    
    # Extract the gyro values from columns ending with "_name" and customize the column names
    gyro_values = [f"gyroscope {col.split('_')[1]}: {row[col]}" for col in ['gyro_x_name', 'gyro_y_name', 'gyro_z_name']]
    
    # Combine the gyro values into a string
    gyro_values_str = ", ".join(gyro_values)
    
    # Extract the accel values from columns ending with "_name" and customize the column names
    accel_values = [f"accelerometer {col.split('_')[1]}: {row[col]}" for col in ['accel_x_name', 'accel_y_name', 'accel_z_name']]
    
    # Combine the accel values into a string
    accel_values_str = ", ".join(accel_values)
    
    # Extract the duration value for the current row
    duration_value = row['duration_name']
    
    # Create the text for the current row
    text = f"[CLS] {gyro_values_str}; {accel_values_str}; duration: {duration_value}; [SEP]"
    
    # Append the text and cluster label to the list
    text_data.append({"labels": f"{cluster_label}", "text": text})

# Convert the list of dictionaries to a DataFrame
text_df = pd.DataFrame(text_data)


In [49]:
text_df_pathway = '/Users/emilkoch/Desktop/2Tango/messenger/Research/Weeks/Week_14/text_df.csv'

text_df.to_csv(text_df_pathway, index = False)

In [50]:
subset = text_df.head(100)

In [51]:
subset

Unnamed: 0,labels,text
0,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."
1,4,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."
2,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."
3,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."
4,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."
...,...,...
95,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."
96,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."
97,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."
98,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr..."


In [52]:
subset_pathway = '/Users/emilkoch/Desktop/2Tango/messenger/Research/Weeks/Week_14/subset.csv'

subset.to_csv(subset_pathway, index = False)

In [55]:
subset_test = text_df.iloc[100:150]
subset_test_pathway = '/Users/emilkoch/Desktop/2Tango/messenger/Research/Weeks/Week_14/subset_test.csv'

subset_test.to_csv(subset_test_pathway, index = False)

In [53]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Tokenize and encode the training examples
tokenized_datasets = TextDataset(
    tokenizer=tokenizer,
    file_path="/Users/emilkoch/Desktop/2Tango/messenger/Research/Weeks/Week_14/subset.csv",  # Path to the saved text file
    block_size=128,  # Specify the maximum sequence length
)

# Define data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="/Users/emilkoch/Desktop/2Tango/Data Files/Dataset_2_glasses/finetuned_bert",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir="/Users/emilkoch/Desktop/2Tango/Data Files/Dataset_2_glasses/logs",
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("/Users/emilkoch/Desktop/2Tango/Data Files/Dataset_2_glasses/finetuned_bert")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/27 [00:00<?, ?it/s]

{'train_runtime': 449.0531, 'train_samples_per_second': 0.461, 'train_steps_per_second': 0.06, 'train_loss': 0.5775577757093642, 'epoch': 3.0}


In [72]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load the subset_test DataFrame
subset_test = pd.read_csv("/Users/emilkoch/Desktop/2Tango/messenger/Research/Weeks/Week_14/subset_test.csv")

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained("/Users/emilkoch/Desktop/2Tango/Data Files/Dataset_2_glasses/finetuned_bert")

# Initialize an empty list to store predicted motion type descriptions
predicted_descriptions = []

# Iterate over each text input in subset_test
for text_input in subset_test['text']:
    # Define the prompt for the current text input
    prompt = f"If the sensor readings: {text_input} are observed, predict motion type [MASK]."
    
    # Tokenize text input and prompt
    tokenized_input = tokenizer(prompt, return_tensors="pt")
    
    # Mask token representing the target motion type description
    mask_index = torch.where(tokenized_input["input_ids"] == tokenizer.mask_token_id)[1]
    
    # Generate predictions
    with torch.no_grad():
        outputs = model(**tokenized_input)
    
    # Retrieve predicted token IDs
    predictions = outputs.logits
    predicted_token_ids = torch.argmax(predictions[0, mask_index], dim=1)
    
    # Decode predicted token IDs to obtain predicted motion type description
    predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids.tolist())
    
    # Append predicted motion type description to the list
    predicted_descriptions.append(predicted_tokens[0])

# Add predicted motion type descriptions to the subset_test DataFrame
subset_test['predicted_description'] = predicted_descriptions

# Print the DataFrame with predicted descriptions
print(subset_test)

    labels                                               text  \
0        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
1        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
2        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
3        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
4        4  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
5        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
6        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
7        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
8        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
9        0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
10       0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
11       0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
12       0  [CLS] gyroscope x: high, gyroscope y: high, gy...   
13       0  [CLS] gyroscope x: high, gyroscope y: low, gyr...   
14       0  [CLS] gyrosco

In [73]:
subset_test

Unnamed: 0,labels,text,predicted_description
0,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
1,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
2,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
3,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
4,4,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
5,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
6,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
7,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
8,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0
9,0,"[CLS] gyroscope x: high, gyroscope y: low, gyr...",0


In [70]:
subset_test['predicted_description'].unique()

array([';'], dtype=object)

In [141]:
subset_test['predicted_motion_type'].unique()

array(['0'], dtype=object)