In [26]:
# !python -m venv env
# !source env/bin/activate  
!pip install torch transformers scikit-learn pandas sentencepiece
!pip install numpy==1.26.4 --force-reinstall
# needed for training
! pip install -U accelerate

import numpy as np
print(np.__version__)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl (20.6 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Successfully installed numpy-1.26.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;

### Sample Case for Getting Familiar with Data and Model

In [None]:
# Sample JSON-like data (you'll replace this with your CSV data)
json_data = [
    {"entityType": "CDR", "relationTargetType": "Phone"},
    {"entityType": "Report", "relationTargetType": "Malware"}
]

# Example query from the user
query = "What SMS messages were sent from suspicious phones to 0549876543 containing 'urgent'?"


In [None]:
#Function to search for relevant entitties in teh JSON data
def find_matching_entities(query, json_data):
    matching_entities = []

    for record in json_data:
        entity_text = f"{record['entityType']} {record['relationTargetType']}"

        #encode inputs for model
        inputs = tokenizer(query, entity_text, return_tensors="pt")

        #run the model to get answer scores
        with torch.no_grad():
            outputs = model(**inputs)

        #get the start and end scores for the answer
        answer_start = torch.argmax(outputs.start_logits)
        answer_end = torch.argmax(outputs.end_logits) + 1

        #extract the answer
        predicted_entity = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end])
        )

        #if predicted entity is not empty, consider it a match
        if predicted_entity.strip():
            return record['entityType'], record['relationTargetType']

    return None
    # return list(set(matching_entities)) #remove duplicates

In [None]:
#Test example
matching_entities = list(set(find_matching_entities(query, json_data)))
print(matching_entities)

### Training using data sources

In [27]:
# load data and create mapping into new dataframe
# right now we are just using the user query 
### TODO files are static paths now.  we need to make them dynamic and maybe add a nice UI to select the file
import pandas as pd

fields_desc = pd.read_csv('fields_description.csv')
user_queries = pd.read_csv('user_queries.csv')

print(fields_desc.head())
print(user_queries.head())

# Create a dictionary mapping entity names to their field descriptions and properties
# This groups the data by entity_name and creates a nested dictionary structure for easy access to field information for each entity type
entity_to_field_mapping = fields_desc.groupby('entity_name').apply(lambda x: x[['field_name', 'description']].to_dict(orient='records')).to_dict()

print('test sample mapping')
print(entity_to_field_mapping.get('Phone', []))


  entity_name                   field_name field_type  \
0         CDR      ifc.ootb.CDR.callStatus     string   
1         CDR             ifc.CDR.caseCode     string   
2         CDR            ifc.CDR.chatTopic     string   
3         CDR  ifc.ootb.CDR.createDateTime       date   
4         CDR       ifc.ootb.CDR.direction     string   

                                         description  
0  Status of the call: "Successful", "Failed", "B...  
1            Unique code identifying a specific case  
2         Topic or subject of discussion in the chat  
3                  Date and time of record creation.  
4         Direction of the call (incoming, outgoing)  
                                            question  \
0           Find all calls made using 3G technology.   
1  List all Reddit comments posted yesterday with...   
2  Show me investigations that are either open or...   
3  Find all insights related to the witness Jane ...   
4  List all web activities updated in the last 

  entity_to_field_mapping = fields_desc.groupby('entity_name').apply(lambda x: x[['field_name', 'description']].to_dict(orient='records')).to_dict()


In [None]:
# prepare our data for training. we combine our user query with field description
import json
import re

def clean_json_string(json_string):
    # Remove any leading/trailing whitespace
    json_string = json_string.strip()
    
    # Ensure the string is enclosed in curly braces
    if not json_string.startswith('{'):
        json_string = '{' + json_string
    if not json_string.endswith('}'):
        json_string = json_string + '}'
    
    # Replace single quotes with double quotes, but not within values
    json_string = re.sub(r"(?<!\\)'", '"', json_string)
    
    # Remove any trailing commas before closing braces or brackets
    json_string = re.sub(r',\s*([\]}])', r'\1', json_string)
    
    return json_string

def prepare_data_for_training(user_query, fields_mapping):
    inputs, labels = [], []

    for _, row in user_queries.iterrows():
        query = row['question']
        cleaned_json_string = clean_json_string(row['json'])
        
        try:
            json_data = json.loads(cleaned_json_string)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for query: {query}")
            print(f"Error: {e}")
            continue  # Skip this row and continue with the next one
        
        # extract entity types and relation target types
        entity_type = json_data.get('entityType', '')
        relation_type = json_data.get('relationTargetType', '')

        # get the description for each entity type
        fields = fields_mapping.get(entity_type, [])
        field_descriptions = ';'.join([f"{field['field_name']}: {field['description']}" for field in fields])

        #combine query with descriptions
        input_text = f"Query: {query}. Entity: {entity_type}. Fields: {field_descriptions}"
        inputs.append(input_text)
        labels.append(entity_type if not relation_type else f"{entity_type}|{relation_type}")

    return inputs, labels

# prepare data
inputs, labels = prepare_data_for_training(user_queries, entity_to_field_mapping)

In [None]:
for i in range(3):
    print(f"Input {i+1}: {inputs[i]}")
    print(f"Label {i+1}: {labels[i]}")

In [None]:
# train test split
from sklearn.model_selection import train_test_split

train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    inputs, labels, test_size=0.2, stratify=labels, random_state=42
)

print(f"Training size: {len(train_inputs)}, Validation size: {len(val_inputs)}")

In [None]:
from transformers import AlbertTokenizer
import torch

model_name = "twmkn9/albert-base-v2-squad2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(train_inputs, truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(val_inputs, truncation=True, padding=True, return_tensors="pt")

# labels to tensors. we can't have names only machine values
# remove duplicates and iterate through to assign a number
unique_labels = list(set(labels))
train_labels_tensor = torch.tensor([unique_labels.index(lbl) for lbl in train_labels])
val_labels_tensor = torch.tensor([unique_labels.index(lbl) for lbl in val_labels])


In [None]:
# do our dimensions match?
print(f"Training encodings: {train_encodings['input_ids'].shape}, Labels: {train_labels_tensor.shape}")
print(f"Validation encodings: {val_encodings['input_ids'].shape}, Labels: {val_labels_tensor.shape}")

In [None]:
from torch.utils.data import Dataset

class EntityDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }
    


In [None]:
# Create dataset objects for training and validation
train_dataset = EntityDataset(train_encodings, train_labels_tensor)
val_dataset = EntityDataset(val_encodings, val_labels_tensor)

# check the first sample from the training dataset
print(train_dataset[0])


In [None]:
# prepare results folder
import os
from datetime import datetime

# Create a unique output directory
base_output_dir = "./results"
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = os.path.join(base_output_dir, f"run_{current_time}")

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

In [None]:
# handle states of our data
import pickle

# Save training dataset
with open("train_dataset.pkl", "wb") as f:
    pickle.dump(train_dataset, f)

# Save validation dataset
with open("val_dataset.pkl", "wb") as f:
    pickle.dump(val_dataset, f)

In [None]:
# model and training run
# check for local first
from transformers import AlbertForSequenceClassification, AlbertTokenizer
from transformers import TrainingArguments, Trainer
import pickle
import torch
import os

# output_dir = "./results/run_20241014_163059"

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    save_total_limit=2,
    load_best_model_at_end=True,
    weight_decay=0.01,
    no_cuda=True,
)

# Assuming Training is already done and we have models locally or configuration vars
if os.path.exists(output_dir) and os.path.exists(output_dir + "/spiece.model"):
    print(f"Loading model from {output_dir}")
    model = AlbertForSequenceClassification.from_pretrained(output_dir)
    tokenizer = AlbertTokenizer.from_pretrained(output_dir)

    print(f"Model loaded from {output_dir}")
    print(f"Model: {model}")

    # Reload training dataset
    with open("train_dataset.pkl", "rb") as f:
        train_dataset = pickle.load(f)

    # Reload validation dataset
    with open("val_dataset.pkl", "rb") as f:
        val_dataset = pickle.load(f)

    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )
else:
    print(f"Training new model")
    model = AlbertForSequenceClassification.from_pretrained(model_name)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()


print(model)

In [None]:
should_save = False
if should_save:
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to {output_dir}")
else:
    print(f"No Trainer that's ok. We just won't save")


In [None]:
# evaluation
from sklearn.metrics import classification_report, f1_score

def evaluate_model(trainer, val_dataset, unique_labels):
    # get predictions
    preds = trainer.predict(val_dataset)

    # covert ML output to labels
    preds_labels = torch.argmax(torch.tensor(preds.predictions), dim=1).numpy()

    # extract TRUE labels
    true_labels = [val_dataset[i]['labels'].item() for i in range(len(val_dataset))]

    #compute F1 score
    f1 = f1_score(true_labels, preds_labels, average='weighted')

    # print report
    print(f"weighted f1 score: {f1}")
    print("Classification Report:\n")
    print(classification_report(true_labels, preds_labels, target_names=unique_labels))


In [None]:
evaluate_model(trainer, val_dataset, unique_labels)

In [None]:
# inference on trained model
def infer(model, tokenizer, query, unique_labels):
    # Tokenize the input query
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True)

    # Perform inference using the model
    outputs = model(**inputs)

    # Get the predicted label ID
    predicted_label_id = torch.argmax(outputs.logits, dim=1).item()

    # Convert the label ID back to the original label name
    predicted_label = unique_labels[predicted_label_id]

    return predicted_label


In [None]:
# Test the inference function with a sample query
sample_query = "Find all calls made using 4G technology."
predicted_entity = infer(model, tokenizer, sample_query, unique_labels)

print(f"Predicted Entity for Query: {sample_query}")
print(f"Predicted Entity Type: {predicted_entity}")

In [None]:
!pip install numpy==1.26.4 --force-reinstall
!pip show numpy

In [None]:
# Check the number of unique labels in your dataset
print(f"Unique Labels: {unique_labels}")
print(f"Number of Classes (num_labels): {len(unique_labels)}")

# Convert labels to integers from 0 to len(unique_labels) - 1
label_ids = torch.tensor([unique_labels.index(lbl) for lbl in labels])

# Verify label IDs are within range
print(f"Label IDs: {label_ids}")
print(f"Max Label ID: {label_ids.max()}, Expected: {len(unique_labels) - 1}")
