In [1]:
# !python -m venv env
# !source env/bin/activate  
!pip install torch transformers scikit-learn pandas sentencepiece
!pip install numpy==1.26.4 --force-reinstall
# needed for training
! pip install -U accelerate

import numpy as np
print(np.__version__)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl (20.6 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-1.26.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;

### Data Preprocessing
<a href="images/processing-step.png" target="_blank"> <img src="images/processing-step.png" alt="High-level overview of the Preprocessing Step" style="max-width: 740px;" /></a>


In [4]:
# load data and create mapping into new dataframe
# right now we are just using the user query 
### TODO files are static paths now.  we need to make them dynamic and maybe add a nice UI to select the file
import pandas as pd
import json

user_queries = pd.read_csv('user_queries.csv')
user_queries.head()

Unnamed: 0,question,json
0,Find all calls made using 3G technology.,"{'entityType': 'CDR', 'statements': [{'type': ..."
1,List all Reddit comments posted yesterday with...,"{'entityType': 'Web Activity', 'statements': [..."
2,Show me investigations that are either open or...,"{'entityType': 'Investigation', 'statements': ..."
3,Find all insights related to the witness Jane ...,"{'entityType': 'Insight', 'statements': [{'typ..."
4,List all web activities updated in the last da...,"{'entityType': 'Web Activity', 'statements': [..."


In [5]:
user_queries.describe(include="all")

Unnamed: 0,question,json
count,744,744
unique,742,721
top,Show me insights where the text includes 'witn...,"{'entityType': 'Phone', 'statements': [{'type'..."
freq,2,3


In [6]:
# prepare our data for training. we combine our user query with field description
import json
import re

def clean_json_string(json_string):
    # Remove any leading/trailing whitespace
    json_string = json_string.strip()
    
    # Ensure the string is enclosed in curly braces
    if not json_string.startswith('{'):
        json_string = '{' + json_string
    if not json_string.endswith('}'):
        json_string = json_string + '}'
    
    # Replace single quotes with double quotes, but not within values
    json_string = re.sub(r"(?<!\\)'", '"', json_string)
    
    # Remove any trailing commas before closing braces or brackets
    json_string = re.sub(r',\s*([\]}])', r'\1', json_string)
    
    return json.loads(json_string)


In [9]:
import json

def extract_label_from_json(json_str):
    """
    Extracts the 'entityType' and 'relationTargetType' from the given JSON string.
    If 'relationTargetType' is missing, the label will only include 'entityType'.
    """
    try:
        json_data = clean_json_string(json_str)
    except json.JSONDecodeError as e:
        # print(f"Error decoding JSON for query: {json_str}")
        # print(f"Error: {e}")
        json_data = {}

    entity_type = json_data.get('entityType', '')
    relation_type = json_data.get('statements', [{}])[0].get('parameters', {}).get('relationTargetType', [''])[0]

    # Combine entity and relation types into a single label
    label = entity_type if not relation_type else f"{entity_type}|{relation_type}"
    return label

# Apply the function to extract labels from the JSON column
user_queries['label'] = user_queries['json'].apply(extract_label_from_json)

# Inspect the DataFrame to ensure labels are extracted correctly
user_queries[['question', 'label']].head()


Unnamed: 0,question,label
0,Find all calls made using 3G technology.,CDR
1,List all Reddit comments posted yesterday with...,Web Activity
2,Show me investigations that are either open or...,Investigation
3,Find all insights related to the witness Jane ...,Insight
4,List all web activities updated in the last da...,Web Activity


In [11]:
# Encode the labels into numeric format
user_queries['label_numeric'], label_mapping = pd.factorize(user_queries['label'])

# Store the label mapping for future reference (useful during inference)
print("Label Mapping:", dict(enumerate(label_mapping)))

# Inspect the encoded labels
user_queries[['label', 'label_numeric']].head()


Label Mapping: {0: 'CDR', 1: 'Web Activity', 2: 'Investigation', 3: 'Insight', 4: 'Phone', 5: '', 6: 'Report', 7: 'Person', 8: 'Web Actor', 9: 'EVisa Request', 10: 'Web Activity|Web Actor', 11: 'CDR|Phone', 12: 'Phone|CDR'}


Unnamed: 0,label,label_numeric
0,CDR,0
1,Web Activity,1
2,Investigation,2
3,Insight,3
4,Web Activity,1


In [13]:
from transformers import AlbertTokenizer
import torch

# Initialize the ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

def tokenize_input(query):
    """
    Tokenizes the given input query using the ALBERT tokenizer.
    Returns input_ids and attention_mask as PyTorch tensors.
    """
    return tokenizer(
        query,
        truncation=True,        # Truncate sequences longer than max length
        padding='max_length',   # Pad shorter sequences to the max length
        max_length=128,         # Set a consistent sequence length
        return_tensors="pt"     # Return as PyTorch tensors
    )

# Tokenize all input queries
tokenized_data = user_queries['question'].apply(tokenize_input)

# Convert the tokenized data into input_ids and attention_masks
input_ids = torch.cat([x['input_ids'] for x in tokenized_data], dim=0)
attention_masks = torch.cat([x['attention_mask'] for x in tokenized_data], dim=0)

print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention Mask shape: {attention_masks.shape}")


Input IDs shape: torch.Size([744, 128])
Attention Mask shape: torch.Size([744, 128])


In [15]:
from sklearn.model_selection import train_test_split

# Extract labels as tensors
labels = torch.tensor(user_queries['label_numeric'].values)

# Split the data into train, validation, and test sets (80/10/10 split)
train_inputs, temp_inputs, train_labels, temp_labels = train_test_split(
    input_ids, labels, test_size=0.2, random_state=42
)

val_inputs, test_inputs, val_labels, test_labels = train_test_split(
    temp_inputs, temp_labels, test_size=0.5, random_state=42
)

print(f"Training set size: {train_inputs.shape[0]}")
print(f"Validation set size: {val_inputs.shape[0]}")
print(f"Test set size: {test_inputs.shape[0]}")


Training set size: 595
Validation set size: 74
Test set size: 75


### Training 
<a href="images/training-step.png" target="_blank"> <img src="images/training-step.png" alt="High-level overview of the Training Step" style="max-width: 740px;" /></a>

In [16]:
import torch
from torch.utils.data import Dataset

class QueryDataset(Dataset):
    """
    A custom Dataset class to handle the input_ids, attention_masks, and labels.
    """

    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)  # Total number of samples

    def __getitem__(self, idx):
        # Fetch the inputs, masks, and labels by index
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Create Dataset instances for train, validation, and test sets
train_dataset = QueryDataset(train_inputs, train_inputs, train_labels)
val_dataset = QueryDataset(val_inputs, val_inputs, val_labels)
test_dataset = QueryDataset(test_inputs, test_inputs, test_labels)


In [17]:
from transformers import AlbertForSequenceClassification, AlbertConfig

# Get the number of unique labels
num_labels = len(torch.unique(train_labels))

# Create a configuration object with the correct number of labels
config = AlbertConfig.from_pretrained("albert-base-v2", num_labels=num_labels)

# Load the ALBERT model for sequence classification
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", config=config)

# Move the model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=

In [18]:
# prepare results folder
import os
from datetime import datetime

# Create a unique output directory
base_output_dir = "./results"
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = os.path.join(base_output_dir, f"run_{current_time}")

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

In [19]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    save_total_limit=2,
    load_best_model_at_end=True,
    weight_decay=0.01, # Weight decay to reduce overfitting
    no_cuda=True,           
    logging_dir="./logs"   
)

# Initialize the Trainer instance
trainer = Trainer(
    model=model,                       # ALBERT model to be trained
    args=training_args,                # Training arguments
    train_dataset=train_dataset,       # Training dataset
    eval_dataset=val_dataset           # Validation dataset
)




In [20]:
print("Starting training...")
trainer.train()


Starting training...


100%|██████████| 222/222 [13:31<00:00,  3.66s/it]

{'train_runtime': 811.9126, 'train_samples_per_second': 2.199, 'train_steps_per_second': 0.273, 'train_loss': 2.238061440957559, 'epoch': 2.98}





TrainOutput(global_step=222, training_loss=2.238061440957559, metrics={'train_runtime': 811.9126, 'train_samples_per_second': 2.199, 'train_steps_per_second': 0.273, 'total_flos': 10610332942848.0, 'train_loss': 2.238061440957559, 'epoch': 2.9798657718120807})

In [21]:
should_save = False
if should_save:
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to {output_dir}")
else:
    print(f"No Trainer that's ok. We just won't save")


Model and tokenizer saved to ./results/run_20241015_153838


In [24]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np

def evaluate_and_print_metrics(trainer, dataset):
    """
    Evaluates the model on the provided dataset and prints key metrics.

    Parameters:
    - trainer: Hugging Face Trainer instance with the trained model.
    - dataset: Dataset to evaluate (usually the test set).

    Returns:
    - metrics: Dictionary containing accuracy, F1-score, and classification report.
    """

    # Step 1: Use the trainer to predict the labels for the dataset.
    predictions = trainer.predict(dataset)

    # Step 2: Extract logits and convert them to predicted labels.
    logits = predictions.predictions  # Raw model outputs
    predicted_labels = np.argmax(logits, axis=1)  # Predicted class indices

    # Step 3: Extract the true labels from the dataset.
    true_labels = np.array([item['labels'].item() for item in dataset])

    # Step 4: Calculate key metrics.
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    report = classification_report(true_labels, predicted_labels)

    # Print the metrics.
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score (Weighted): {f1:.4f}")
    print("\nClassification Report:\n", report)

    # Return the metrics for further analysis if needed.
    return {"accuracy": accuracy, "f1_score": f1, "classification_report": report}

# Example usage:
print("Evaluating the model on the test set...")
test_metrics = evaluate_and_print_metrics(trainer, test_dataset)


Evaluating the model on the test set...


100%|██████████| 38/38 [00:13<00:00,  2.75it/s]

Accuracy: 0.2667
F1-Score (Weighted): 0.1716

Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.70      0.53        20
           1       0.14      0.60      0.23        10
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         8
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         5
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         3

    accuracy                           0.27        75
   macro avg       0.05      0.11      0.06        75
weighted avg       0.13      0.27      0.17        75




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
# Set the model to evaluation mode
model.eval()

# Optional: Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def predict(query, tokenizer, model, label_mapping):
    """
    Perform inference on a given query and return the predicted label.

    Parameters:
    - query: The user query (string) to predict.
    - tokenizer: The ALBERT tokenizer instance.
    - model: The trained ALBERT model instance.
    - label_mapping: Dictionary to map numeric labels back to original class names.

    Returns:
    - predicted_label: The predicted label as a string.
    """

    # Step 2: Tokenize the input query
    inputs = tokenizer(
        query,
        truncation=True,            # Truncate if input is too long
        padding="max_length",        # Pad to the max length
        max_length=128,              # Ensure consistent length
        return_tensors="pt"          # Return as PyTorch tensors
    )

    # Move inputs to the appropriate device (GPU/CPU)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Step 3: Perform inference (disable gradient calculation for speed)
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # Step 4: Convert logits to predicted class index
    predicted_class_idx = torch.argmax(logits, dim=1).item()

    # Step 5: Map the predicted class index to the label
    predicted_label = label_mapping[predicted_class_idx]

    return predicted_label

# Example label mapping (this should match what you used during training)
label_mapping = {
    0: "CDR|Phone",
    1: "CDR",
    2: "Investigation",
    3: "Report",
    4: "Web Activity",
    5: "Person",
    6: "EVisa Request",
    7: "Insight"
}



Input Query: What SMS messages were sent from suspicious phones to 0549876543 containing the word 'urgent'?
Predicted Label: Person


In [30]:
# Example usage of the inference function
query = "who uses 3g"
predicted_label = predict(query, tokenizer, model, label_mapping)

print(f"Input Query: {query}")
print(f"Predicted Label: {predicted_label}")

Input Query: who uses 3g
Predicted Label: Insight
