In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load data
def load_np_array(file_name):
    X_array = np.load('data/X_' + file_name + '_array.npy')
    y_array = np.load('data/y_' + file_name + '_array.npy')
    return X_array, y_array

# Convert numerical data to strings
def convert_to_string(data):
    return [" ".join(map(str, seq.flatten())) for seq in data]

# Define the compute_metrics function to calculate accuracy
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(labels, pred)
    return {"eval_accuracy": accuracy}


# Create Dataset class
class FallDetectionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [3]:
X_train_fall, y_train_fall = load_np_array("train_fall")
X_train_notfall, y_train_notfall = load_np_array("train_notfall")
X_test_fall, y_test_fall = load_np_array("test_fall")
X_test_notfall, y_test_notfall = load_np_array("test_notfall")

# Combine data
X_train = np.concatenate((X_train_fall, X_train_notfall), axis=0)
y_train = np.concatenate((y_train_fall, y_train_notfall), axis=0)
X_test = np.concatenate((X_test_fall, X_test_notfall), axis=0)
y_test = np.concatenate((y_test_fall, y_test_notfall), axis=0)

# Shuffle data
train_indices = np.arange(X_train.shape[0])
np.random.shuffle(train_indices)
X_train = X_train[train_indices]
y_train = y_train[train_indices]

test_indices = np.arange(X_test.shape[0])
np.random.shuffle(test_indices)
X_test = X_test[test_indices]
y_test = y_test[test_indices]

X_train_text = convert_to_string(X_train)
X_test_text = convert_to_string(X_test)

# Debugging: Print shapes and example data
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("\nExample X_train_text:", X_train_text[:2])
print("Example y_train:", y_train[:2])

X_train shape: (5824, 40, 3)
y_train shape: (5824,)
X_test shape: (2912, 40, 3)
y_test shape: (2912,)

Example X_train_text: ['1.1855469 0.5124512 0.19116211 1.0292969 0.5078125 0.35766602 0.9499512 0.7338867 0.80615234 1.1665039 0.8579102 1.0546875 1.2587892 0.7905274 1.1374512 1.0974121 0.3955078 1.1479492 0.28881836 0.20166017 1.142334 0.001220703 -0.5805664 1.279297 -0.56884766 -1.6184083 1.4101563 -0.5275879 -1.7941896 0.98657227 -0.72875977 -1.9460449 0.7573242 -0.7702637 -1.5083008 0.4157715 -0.5095215 -1.6040039 0.4375 -0.39453128 -1.4926758 0.23291016 -0.22192383 -1.0761719 0.26586914 -0.20288086 -1.3786621 0.047607422 -0.27270508 -0.9104004 -0.087890625 -0.2775879 -1.0041504 -0.07666016 -0.2932129 -0.8710938 0.011962892 -0.34057617 -0.7502442 0.13989258 -0.34057617 -0.7502442 0.13989258 -0.41137698 -0.56762695 0.23486328 -0.30639648 -0.35546875 0.42626953 -0.16040039 -0.24780275 0.7351074 -0.24316406 -0.33593753 0.7641602 -0.16113281 -0.072509766 0.80322266 -0.060058594 -0.34

In [4]:
# Tokenizer and Model
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
max_length = 40

# Split train data into train and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42  # Adjust random_state for reproducibility
)

# Convert numerical data to strings
X_train_text_split = convert_to_string(X_train_split)
X_val_text = convert_to_string(X_val)

# Create Dataset class
train_dataset_split = FallDetectionDataset(X_train_text_split, y_train_split, tokenizer, max_length)
val_dataset = FallDetectionDataset(X_val_text, y_val, tokenizer, max_length)

# Create DataLoader
train_loader_split = DataLoader(train_dataset_split, batch_size=512, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=512)


# Create DataLoader
train_dataset = FallDetectionDataset(X_train_text, y_train, tokenizer, max_length)
test_dataset = FallDetectionDataset(X_test_text, y_test, tokenizer, max_length)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=50,  # Updated number of epochs
    per_device_train_batch_size=512,  # Updated batch size
    per_device_eval_batch_size=512,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,  # Load best model at the end
    # fp16=True,  # Enable mixed precision training
    gradient_accumulation_steps=2,  # Simulate larger batch size
    learning_rate=1e-4  # Updated learning rate
)

# Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_split,  # Updated to use split data
    eval_dataset=val_dataset,  # Updated to use validation data
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]  # Updated early stopping
)

# Train the model
trainer.train()

# Evaluate the model and print results
results = trainer.evaluate()




Epoch,Training Loss,Validation Loss
1,No log,0.692365
2,0.692900,0.685325
3,0.692900,0.665158
4,0.668500,0.622544
5,0.668500,0.604725
6,0.616800,0.596074
7,0.616800,0.601093


In [6]:
# Inspect the tokenization process
sample_text = X_train_text[0]
encoding = tokenizer.encode_plus(
    sample_text,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

print("Sample text:", sample_text)
print("Tokenized input_ids:", encoding['input_ids'])
print("Tokenized attention_mask:", encoding['attention_mask'])

# Inspect a few examples from the dataset
for i in range(2):
    print(f"\nExample {i+1}")
    print("Text:", X_train_text[i])
    print("Label:", y_train[i])
    encoding = tokenizer.encode_plus(
        X_train_text[i],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    print("Tokenized input_ids:", encoding['input_ids'].flatten().tolist())
    print("Tokenized attention_mask:", encoding['attention_mask'].flatten().tolist())

# Predict and evaluate
predictions = trainer.predict(test_dataset)  # Predict on test dataset
y_pred = np.argmax(predictions.predictions, axis=1)

# Ensure y_val and y_pred are numpy arrays
y_val = np.array(y_val)
y_pred = np.array(y_pred)

# Calculate evaluation metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')
f1 = f1_score(y_val, y_pred, average='weighted')
roc_auc = roc_auc_score(y_val, predictions.predictions[:, 1], average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# Print classification report
report = classification_report(y_val, y_pred, target_names=['Not Fall', 'Fall'], digits=4)
print("\nClassification Report:\n", report)


Sample text: 1.1855469 0.5124512 0.19116211 1.0292969 0.5078125 0.35766602 0.9499512 0.7338867 0.80615234 1.1665039 0.8579102 1.0546875 1.2587892 0.7905274 1.1374512 1.0974121 0.3955078 1.1479492 0.28881836 0.20166017 1.142334 0.001220703 -0.5805664 1.279297 -0.56884766 -1.6184083 1.4101563 -0.5275879 -1.7941896 0.98657227 -0.72875977 -1.9460449 0.7573242 -0.7702637 -1.5083008 0.4157715 -0.5095215 -1.6040039 0.4375 -0.39453128 -1.4926758 0.23291016 -0.22192383 -1.0761719 0.26586914 -0.20288086 -1.3786621 0.047607422 -0.27270508 -0.9104004 -0.087890625 -0.2775879 -1.0041504 -0.07666016 -0.2932129 -0.8710938 0.011962892 -0.34057617 -0.7502442 0.13989258 -0.34057617 -0.7502442 0.13989258 -0.41137698 -0.56762695 0.23486328 -0.30639648 -0.35546875 0.42626953 -0.16040039 -0.24780275 0.7351074 -0.24316406 -0.33593753 0.7641602 -0.16113281 -0.072509766 0.80322266 -0.060058594 -0.34423828 0.7402344 -0.07128906 -0.3820801 0.8305664 -0.07763672 -0.49731445 0.9152832 0.061523438 -0.36230472 0.9487

ValueError: Found input variables with inconsistent numbers of samples: [1165, 2912]

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Compute ROC curve and ROC area for the validation set
fpr, tpr, _ = roc_curve(y_val, predictions.predictions[:, 1])
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()


In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Compute confusion matrix
cm = confusion_matrix(y_val, y_pred, labels=[0, 1])  # Assuming 0 is 'Not Fall' and 1 is 'Fall'

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fall', 'Fall'], yticklabels=['Not Fall', 'Fall'])
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()
