In [None]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

# Read the training split and strip stray whitespace from its column names
train_data = pd.read_csv('raw182_Training_Relabeled_Auto_25.csv')
train_data.columns = train_data.columns.str.strip()

# Same treatment for the testing split
test_data = pd.read_csv('raw91_Testing_Relabeled_Auto_25.csv')
test_data.columns = test_data.columns.str.strip()

# Concatenate accelerometer data into a single string
def concatenate_accelerometer_data(data):
    """Add a ``text`` column holding the three accelerometer readings as one string.

    Each row's ``ms_accelerometer_x``, ``ms_accelerometer_y`` and
    ``ms_accelerometer_z`` values are formatted as strings and joined with
    single spaces, e.g. ``"0.1 -9.8 1.5"``.

    Args:
        data: DataFrame containing the three accelerometer columns. It is
            modified in place (a ``text`` column is added or overwritten).

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If any of the three accelerometer columns is missing.
    """
    # Walk the three columns in lock-step instead of DataFrame.apply(axis=1):
    # no per-row Series has to be built, and a frame with zero rows simply
    # yields an empty ``text`` column.
    data['text'] = [
        f"{x} {y} {z}"
        for x, y, z in zip(
            data['ms_accelerometer_x'],
            data['ms_accelerometer_y'],
            data['ms_accelerometer_z'],
        )
    ]
    return data

# Add the space-joined 'text' column to both splits
train_data, test_data = map(concatenate_accelerometer_data, (train_data, test_data))

# Load the pretrained 'bert-base-uncased' tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_data(data, max_length=64):
    """Encode ``data['text']`` with the module-level ``tokenizer`` into a TensorDataset.

    Args:
        data: DataFrame with a ``text`` column (strings to encode) and an
            ``outcome`` column (labels).
        max_length: Fixed token length of every encoded row; shorter inputs
            are padded and longer inputs are truncated to this length.

    Returns:
        TensorDataset of ``(input_ids, attention_mask, labels)`` with one
        entry per row of ``data``.
    """
    # Encode the whole column in one call. truncation=True guarantees every
    # row is exactly ``max_length`` tokens long; without it an over-long
    # sample is longer than the padded ones and the rows cannot be stacked
    # into a single tensor.
    encoded = tokenizer(
        data['text'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    labels = torch.tensor(data['outcome'].values)

    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels)

# Encode both splits into TensorDatasets
train_dataset, test_dataset = tokenize_data(train_data), tokenize_data(test_data)

# Wrap the datasets in DataLoaders; only the training batches are shuffled
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Fix the RNG so the newly initialised classifier head and the DataLoader
# shuffling order are repeatable from run to run
torch.manual_seed(42)

# Initialize BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # Binary classification
    output_attentions=False,
    output_hidden_states=False,
)

# Set up optimizer
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 keeps the deprecated class's default (torch defaults to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# Define training loop
def train(model, dataloader, optimizer, epochs=3):
    """Fine-tune ``model`` on ``dataloader`` and print the mean loss of every epoch.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose result exposes ``.loss``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches that also supports ``len()``.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of full passes over ``dataloader``.

    Returns:
        None. One line per epoch is printed with the average batch loss
        (``nan`` when the dataloader has no batches).
    """
    model.train()

    # Run every batch on whichever device the model already lives on, so the
    # loop works unchanged after the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    num_batches = len(dataloader)

    for epoch in range(epochs):
        total_loss = 0.0

        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            # Clear gradients left over from the previous step
            model.zero_grad()

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        # Guard the average: an empty dataloader would otherwise divide by zero
        avg_train_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch + 1}: Average training loss: {avg_train_loss}")

# Fine-tune on the training batches for three epochs
train(model=model, dataloader=train_dataloader, optimizer=optimizer, epochs=3)

# Function to predict with heuristic
def predict_with_heuristic(model, dataloader, threshold=0.5, window=10):
    """Predict with a sliding-window average over positive-class probabilities.

    The model is run over ``dataloader`` in order and the softmax probability
    of class 1 is collected per sample. Prediction ``k`` is 1 when the mean
    probability of samples ``k .. k + window - 1`` exceeds ``threshold``; it
    is paired with the true label of the last sample in that window.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose result exposes ``.logits``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches. Order matters, so it should not be shuffled.
        threshold: Mean probability above which a window is labelled 1.
        window: Number of consecutive samples averaged per prediction.

    Returns:
        ``(heuristic_predictions, true_labels)``: a list of 0/1 ints and the
        list of labels from position ``window - 1`` onward. Both are empty
        when fewer than ``window`` samples are available.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be at least 1, got {window}")

    model.eval()

    # Run every batch on whichever device the model already lives on
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = batch

            outputs = model(b_input_ids.to(device),
                            token_type_ids=None,
                            attention_mask=b_input_mask.to(device))

            # Column 1 of the softmax is the probability of the positive class
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)[:, 1]

            predictions.extend(probs.cpu().numpy())
            true_labels.extend(b_labels.cpu().numpy())

    # Apply heuristic: one prediction per full window of consecutive samples
    heuristic_predictions = [
        1 if np.mean(predictions[start:start + window]) > threshold else 0
        for start in range(len(predictions) - window + 1)
    ]

    # Drop the first window - 1 labels so each prediction lines up with the
    # label of the last sample in its window
    return heuristic_predictions, true_labels[window - 1:]

# Score the test split with the sliding-window heuristic
heuristic_predictions, true_labels = predict_with_heuristic(model=model, dataloader=test_dataloader)

# Report the accuracy of the windowed predictions against the aligned labels
accuracy = accuracy_score(y_true=true_labels, y_pred=heuristic_predictions)
print(f"Heuristic prediction accuracy: {accuracy:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Average training loss: 0.43720438703894615
Epoch 2: Average training loss: 0.4340018564835191
Epoch 3: Average training loss: 0.41829161811619997
Heuristic prediction accuracy: 0.9490


In [None]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

# Read the training split and strip stray whitespace from its column names
train_data = pd.read_csv('raw182_Training_Relabeled_Auto_25.csv')
train_data.columns = train_data.columns.str.strip()

# Same treatment for the testing split
test_data = pd.read_csv('raw91_Testing_Relabeled_Auto_25.csv')
test_data.columns = test_data.columns.str.strip()

# Sample balanced datasets
def sample_balanced_data(data, fall_count, no_fall_count, random_state=None):
    """Draw a fixed number of fall and no-fall rows from ``data``.

    Args:
        data: DataFrame with an ``outcome`` column (1 = fall, 0 = no fall).
        fall_count: Number of rows to draw where ``outcome == 1``.
        no_fall_count: Number of rows to draw where ``outcome == 0``.
        random_state: Optional seed forwarded to ``DataFrame.sample``; pass an
            int for a repeatable draw. ``None`` keeps the unseeded behaviour.

    Returns:
        DataFrame holding the sampled rows of both classes, sorted by index
        (for a freshly read CSV that is the original row order).

    Raises:
        ValueError: If a class has fewer rows than requested (raised by
            ``DataFrame.sample``, which draws without replacement).
    """
    fall_data = data[data['outcome'] == 1].sample(fall_count, random_state=random_state)
    no_fall_data = data[data['outcome'] == 0].sample(no_fall_count, random_state=random_state)

    # Restore index order instead of returning all falls followed by all
    # no-falls. Code that evaluates consecutive rows (the 10-row moving
    # average in predict_with_heuristic) would otherwise see two long
    # single-class runs and report an inflated accuracy.
    return pd.concat([fall_data, no_fall_data]).sort_index()

# Draw 4550 rows per class for training and 2275 rows per class for testing
train_data = sample_balanced_data(train_data, 4550, 4550)
test_data = sample_balanced_data(test_data, 2275, 2275)

# Concatenate accelerometer data into a single string
def concatenate_accelerometer_data(data):
    """Add a ``text`` column holding the three accelerometer readings as one string.

    Each row's ``ms_accelerometer_x``, ``ms_accelerometer_y`` and
    ``ms_accelerometer_z`` values are formatted as strings and joined with
    single spaces, e.g. ``"0.1 -9.8 1.5"``.

    Args:
        data: DataFrame containing the three accelerometer columns. It is
            modified in place (a ``text`` column is added or overwritten).

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If any of the three accelerometer columns is missing.
    """
    # Walk the three columns in lock-step instead of DataFrame.apply(axis=1):
    # no per-row Series has to be built, and a frame with zero rows simply
    # yields an empty ``text`` column.
    data['text'] = [
        f"{x} {y} {z}"
        for x, y, z in zip(
            data['ms_accelerometer_x'],
            data['ms_accelerometer_y'],
            data['ms_accelerometer_z'],
        )
    ]
    return data

# Add the space-joined 'text' column to both splits
train_data, test_data = map(concatenate_accelerometer_data, (train_data, test_data))

# Load the pretrained 'bert-base-uncased' tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_data(data, max_length=64):
    """Encode ``data['text']`` with the module-level ``tokenizer`` into a TensorDataset.

    Args:
        data: DataFrame with a ``text`` column (strings to encode) and an
            ``outcome`` column (labels).
        max_length: Fixed token length of every encoded row; shorter inputs
            are padded and longer inputs are truncated to this length.

    Returns:
        TensorDataset of ``(input_ids, attention_mask, labels)`` with one
        entry per row of ``data``.
    """
    # Encode the whole column in one call. truncation=True guarantees every
    # row is exactly ``max_length`` tokens long; without it an over-long
    # sample is longer than the padded ones and the rows cannot be stacked
    # into a single tensor.
    encoded = tokenizer(
        data['text'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    labels = torch.tensor(data['outcome'].values)

    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels)

# Encode both splits into TensorDatasets
train_dataset, test_dataset = tokenize_data(train_data), tokenize_data(test_data)

# Wrap the datasets in DataLoaders; only the training batches are shuffled
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Fix the RNG so the newly initialised classifier head and the DataLoader
# shuffling order are repeatable from run to run
torch.manual_seed(42)

# Initialize BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # Binary classification
    output_attentions=False,
    output_hidden_states=False,
)

# Set up optimizer
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# weight_decay=0.0 keeps the deprecated class's default (torch defaults to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# Define training loop
def train(model, dataloader, optimizer, epochs=3):
    """Fine-tune ``model`` on ``dataloader`` and print the mean loss of every epoch.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose result exposes ``.loss``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches that also supports ``len()``.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of full passes over ``dataloader``.

    Returns:
        None. One line per epoch is printed with the average batch loss
        (``nan`` when the dataloader has no batches).
    """
    model.train()

    # Run every batch on whichever device the model already lives on, so the
    # loop works unchanged after the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    num_batches = len(dataloader)

    for epoch in range(epochs):
        total_loss = 0.0

        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            # Clear gradients left over from the previous step
            model.zero_grad()

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        # Guard the average: an empty dataloader would otherwise divide by zero
        avg_train_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch + 1}: Average training loss: {avg_train_loss}")

# Fine-tune on the training batches for three epochs
train(model=model, dataloader=train_dataloader, optimizer=optimizer, epochs=3)

# Function to predict with heuristic
def predict_with_heuristic(model, dataloader, threshold=0.5, window=10):
    """Predict with a sliding-window average over positive-class probabilities.

    The model is run over ``dataloader`` in order and the softmax probability
    of class 1 is collected per sample. Prediction ``k`` is 1 when the mean
    probability of samples ``k .. k + window - 1`` exceeds ``threshold``; it
    is paired with the true label of the last sample in that window.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose result exposes ``.logits``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches. Order matters, so it should not be shuffled.
        threshold: Mean probability above which a window is labelled 1.
        window: Number of consecutive samples averaged per prediction.

    Returns:
        ``(heuristic_predictions, true_labels)``: a list of 0/1 ints and the
        list of labels from position ``window - 1`` onward. Both are empty
        when fewer than ``window`` samples are available.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be at least 1, got {window}")

    model.eval()

    # Run every batch on whichever device the model already lives on
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = batch

            outputs = model(b_input_ids.to(device),
                            token_type_ids=None,
                            attention_mask=b_input_mask.to(device))

            # Column 1 of the softmax is the probability of the positive class
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)[:, 1]

            predictions.extend(probs.cpu().numpy())
            true_labels.extend(b_labels.cpu().numpy())

    # Apply heuristic: one prediction per full window of consecutive samples
    heuristic_predictions = [
        1 if np.mean(predictions[start:start + window]) > threshold else 0
        for start in range(len(predictions) - window + 1)
    ]

    # Drop the first window - 1 labels so each prediction lines up with the
    # label of the last sample in its window
    return heuristic_predictions, true_labels[window - 1:]

# Score the test split with the sliding-window heuristic
heuristic_predictions, true_labels = predict_with_heuristic(model=model, dataloader=test_dataloader)

# Report the accuracy of the windowed predictions against the aligned labels
accuracy = accuracy_score(y_true=true_labels, y_pred=heuristic_predictions)
print(f"Heuristic prediction accuracy: {accuracy:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Average training loss: 0.6381284956346478
Epoch 2: Average training loss: 0.5808539657216323
Epoch 3: Average training loss: 0.5586982271127533
Heuristic prediction accuracy: 0.9795
