- Can handle batch inference
- Tokenizes the prompts before inference, to speed up inference
- Use an hf dataset during tokenization because it supports multi-processing
- Sorts rows by text length so that batches have a similar max length
- Manually pads the inputs


In [1]:
!pip install -q bitsandbytes
!pip install -q accelerate

In [2]:
import pandas as pd
import numpy as np
import os
import json
import time

import gc
import re
from tqdm import tqdm

from datasets import Dataset

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BatchEncoding
import torch


import torch
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          #BitsAndBytesConfig, 
                          AutoConfig)

# Don't Show Warning Messages
import warnings
warnings.filterwarnings('ignore')

print(f"CUDA Version: {torch.version.cuda}")
print(f"Pytorch {torch.__version__}")

CUDA Version: 12.3
Pytorch 2.4.0


In [43]:
# Note:
# Use cuda:0 for the device because the DriveData
# environment has only one T4 GPU.

MODEL_PATH = '../input/exp27-youth-download-llama-3-1-8b-8bit/Llama-3.1-8B-Instruct-8bit'

# Use for testing on a small sample of the data
RUN_TEST = True
NUM_SAMPLES = 100

BATCH_SIZE = 2

NUM_FOLDS = 5

# Run inference on fold 0
TEST_FOLD = 0

DEVICE = "cuda:0"
#DEVICE = "cpu"

In [4]:
# Set a seed value

import torch, random

# Ensure that all GPU operations are deterministic 
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

seed_val = 1024

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [5]:
base_path = "../input/driven-data-youth-mental-health-narratives/youth-mental-health-narratives/automated-abstraction-track/"

In [6]:
import os

os.listdir('../input/exp27-youth-download-llama-3-1-8b-8bit/Llama-3.1-8B-Instruct-8bit')

['model.safetensors.index.json',
 'config.json',
 'model-00001-of-00002.safetensors',
 'model-00002-of-00002.safetensors',
 'tokenizer.json',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'generation_config.json']

In [7]:
# Check the type and quantity of GPUs

if torch.cuda.is_available():
    print('Num CPUs:', os.cpu_count())
    print('Num GPUs:', torch.cuda.device_count())
    print('GPU Type:', torch.cuda.get_device_name(0))

Num CPUs: 4
Num GPUs: 2
GPU Type: Tesla T4


## Helper functions

In [8]:
def timer(start_time):

    # End timing
    end_time = time.time()
    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    # round to one decimal place
    elapsed_time = round(elapsed_time, 1)
    
    return elapsed_time

"""
# Start timing
start_time = time.time()

# Some code

# Get the inference time
elapsed_time = timer(start_time)
print(f"Time taken: {elapsed_time} seconds")
"""

'\n# Start timing\nstart_time = time.time()\n\n# Some code\n\n# Get the inference time\nelapsed_time = timer(start_time)\nprint(f"Time taken: {elapsed_time} seconds")\n'

In [9]:
# Ref: https://github.com/drivendataorg/youth-mental-health-runtime/blob/main/src/scoring.py

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score


def average_f1(predictions: pd.DataFrame, labels: pd.DataFrame):
    """Score a set of predictions using the competition metric. F1 is averaged
    across all target variables. For categorical variables, micro-averaged
    F1 score is used.

    Args:
        predictions (pd.DataFrame): Dataframe of predictions, with one column
            for each target variable. The index should be the uid.
        labels (pd.DataFrame): Dataframe of ground truth values, with one column
            for each target variable. The index should be the uid.
    """
    # Check that there are 23 target variables
    assert predictions.shape[1] == 23

    # Check that column order and row order are the same
    assert (predictions.columns == labels.columns).all()
    assert (predictions.index == labels.index).all()

    # All values should be integers
    assert (predictions.dtypes == int).all()

    CATEGORICAL_VARS = ["InjuryLocationType", "WeaponType1"]
    BINARY_VARS = np.setdiff1d(labels.columns, CATEGORICAL_VARS)

    # Calculate F1 score averaged across binary variables
    binary_f1 = f1_score(
        labels[BINARY_VARS],
        predictions[BINARY_VARS],
        average="macro",
    )
    f1s = [binary_f1]

    # Calculate F1 score for each categorical variable
    for cat_col in CATEGORICAL_VARS:
        f1s.append(f1_score(labels[cat_col], predictions[cat_col], average="micro"))

    return np.average(f1s, weights=[len(BINARY_VARS), 1, 1])


def calculate_f1(df_predictions, df_labels):
    """
    Note: 
    1. Set UID as the index
    2. The dataframes should only contain the binary columns
    """
    #predictions = pd.read_csv(predictions_path, index_col="uid")
    #labels = pd.read_csv(labels_path, index_col="uid")

    score = average_f1(df_predictions, df_labels)
    
    #print(f"Variable-averaged F1 score: {score:.4f}")
    
    return score

In [10]:
import numpy as np

def biased_coin(p_zero=0.5):
    """
    Returns 0 or 1, with the probability of returning 0 controlled by `p_zero`.

    Args:
        p_zero (float): Probability of returning 0. Must be between 0 and 1.

    Returns:
        int: 0 or 1
    """
    return np.random.choice([0, 1], p=[p_zero, 1 - p_zero])

# Example usage:
for _ in range(10):
    print(biased_coin(p_zero=0.7))  # 70% chance of 0, 30% chance of 1


0
1
0
0
0
1
0
0
0
0


In [11]:
def assign_batch_numbers(df, batch_size):
    """
    Assigns a batch number to each row in the DataFrame based on the specified batch size.

    Args:
    - df (pd.DataFrame): The input dataframe to which batch numbers will be assigned.
    - batch_size (int): The number of rows in each batch.

    Returns:
    - pd.DataFrame: A dataframe with an additional 'batch_number' column.
    """
    # Create a new column 'batch_number' where batch number is assigned to each row
    df['batch_number'] = (df.index // batch_size) #+ 1
    return df

# Example Usage
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace'],
    'Age': [25, 30, 20, 35, 28, 40, 22]
}

df = pd.DataFrame(data)

# Assign batch numbers with a batch size of 3
df_with_batches = assign_batch_numbers(df, batch_size=3)

num_batches = df_with_batches['batch_number'].nunique()

print(num_batches)

df_with_batches.head(10)


3


Unnamed: 0,Name,Age,batch_number
0,Alice,25,0
1,Bob,30,0
2,Charlie,20,0
3,David,35,1
4,Eva,28,1
5,Frank,40,1
6,Grace,22,2


## Load the data

In [12]:
path = base_path + 'train_features_X4juyT6.csv'
df_train_features = pd.read_csv(path)

path = base_path + 'train_labels_JxtENGl.csv'
df_train_labels = pd.read_csv(path)


# Use merge instead of concat
# just in case the order of the
# dataframes is different.

df_data = pd.merge(df_train_features, df_train_labels, on='uid')

print(df_data.shape)

#df_data.head()

(4000, 26)


In [13]:
df_data.columns

Index(['uid', 'NarrativeLE', 'NarrativeCME', 'DepressedMood',
       'MentalIllnessTreatmentCurrnt', 'HistoryMentalIllnessTreatmnt',
       'SuicideAttemptHistory', 'SuicideThoughtHistory',
       'SubstanceAbuseProblem', 'MentalHealthProblem', 'DiagnosisAnxiety',
       'DiagnosisDepressionDysthymia', 'DiagnosisBipolar', 'DiagnosisAdhd',
       'IntimatePartnerProblem', 'FamilyRelationship', 'Argument',
       'SchoolProblem', 'RecentCriminalLegalProblem', 'SuicideNote',
       'SuicideIntentDisclosed', 'DisclosedToIntimatePartner',
       'DisclosedToOtherFamilyMember', 'DisclosedToFriend',
       'InjuryLocationType', 'WeaponType1'],
      dtype='object')

## Pre-process the data

## Identify rows where the two narrative columns are identical

In [14]:
def check_for_duplication(row):
    
    NarrativeLE = row['NarrativeLE'].strip()
    NarrativeCME = row['NarrativeCME'].strip()
    
    if NarrativeLE == NarrativeCME:
        return 'is_duplicated'
    else:
        return 'not_duplicated'
    
df_data['dup_narratives'] = df_data.apply(check_for_duplication, axis=1)

df_data['dup_narratives'].value_counts()

dup_narratives
not_duplicated    3900
is_duplicated      100
Name: count, dtype: int64

In [15]:
df = df_data[df_data['dup_narratives'] == 'is_duplicated']

df = df.reset_index(drop=True)

df.shape

(100, 27)

In [16]:
df.loc[2, 'NarrativeLE']

"V was a XX XX who hanged himself at his girlfriend's residence. V had been fired the day prior from his job. V left a suicide note with goodbye and love messages for family. V had an unspecified history of suicide attempts. No further circumstances."

In [17]:
df.loc[2, 'NarrativeCME']

"V was a XX XX who hanged himself at his girlfriend's residence. V had been fired the day prior from his job. V left a suicide note with goodbye and love messages for family. V had an unspecified history of suicide attempts. No further circumstances."

In [18]:
# Remove the rows where the narratives are duplicated
df_data = df_data[df_data['dup_narratives'] == 'not_duplicated']

# Drop the 'dup_narratives' column
df_data = df_data.drop('dup_narratives', axis=1)

df_data = df_data.reset_index(drop=True)

df_data.shape

(3900, 26)

## Get the string length
Later we will use this when doing batching.

In [19]:
def get_total_text_length(row):
    
    text1_len = len(row['NarrativeLE'])
    text2_len = len(row['NarrativeCME'])
    
    total = text1_len + text2_len
    
    return total

df_data['text_length'] = df_data.apply(get_total_text_length, axis=1)

print(df_data['text_length'].max())
print(df_data['text_length'].mean())
print(df_data['text_length'].min())

8275
1770.8687179487179
386


In [20]:
# Example

df = df_data[df_data['text_length'] <= 386]

df = df.reset_index(drop=True)


print(df.loc[0, 'NarrativeLE'])
print()
print(df.loc[0, 'NarrativeCME'])

Victim is a XX XX. Victim was diagnosed with bi polar disorder. Victim was homeless. Victim was found hanging by the neck in a loading bay near a grain elevator. Manner of death is suicide.

Victim1, a XX XX XX was found hanging in a loading bay adjacent to a grain elevator.  Toxicology reports positive for Benzoylecgonine, Amphetamines and Methamphetamine.  Manner of death is suicide.


## Create the 5 folds

In [21]:
df_data.shape

(3900, 27)

In [22]:
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=101)

for fold, ( _, val_) in enumerate(skf.split(X=df_data, y=df_data.DepressedMood)):
      df_data.loc[val_ , "fold"] = fold
        
df_data['fold'].value_counts()

fold
3.0    780
0.0    780
2.0    780
1.0    780
4.0    780
Name: count, dtype: int64

## Save the five folds

In [23]:
path = 'five_folds.csv'
df_data.to_csv(path, index=False)

In [24]:
!ls

five_folds.csv


## How to sort and reorder a dataframe

In [25]:
# Save the original order
orig_order = list(df_data['uid'])

#orig_order

In [26]:
# Sort by text_length

df_sorted = df_data.sort_values(by='text_length', ascending=True)

df_sorted.head(2)

Unnamed: 0,uid,NarrativeLE,NarrativeCME,DepressedMood,MentalIllnessTreatmentCurrnt,HistoryMentalIllnessTreatmnt,SuicideAttemptHistory,SuicideThoughtHistory,SubstanceAbuseProblem,MentalHealthProblem,...,RecentCriminalLegalProblem,SuicideNote,SuicideIntentDisclosed,DisclosedToIntimatePartner,DisclosedToOtherFamilyMember,DisclosedToFriend,InjuryLocationType,WeaponType1,text_length,fold
1033,bjnj,Victim is a XX XX. Victim was diagnosed with b...,"Victim1, a XX XX XX was found hanging in a loa...",0,0,0,0,0,1,1,...,0,0,0,0,0,0,6,6,386,1.0
1961,cqmf,Officers were dispatched to a residence regard...,The victim was a XX XX who died from a self-in...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,5,388,1.0


In [27]:
# Go back to the original order

# Set the "uid" column as the index
df_data = df_data.set_index('uid')

# Go back to the orginal row order
df_data = df_data.reindex(orig_order)

# Reset the index while keeping the "uid" column
df_data = df_data.reset_index(drop=False)

#df_data.head(3)

In [28]:
#orig_order

## Inference helper functions

In [29]:
df.head()

Unnamed: 0,uid,NarrativeLE,NarrativeCME,DepressedMood,MentalIllnessTreatmentCurrnt,HistoryMentalIllnessTreatmnt,SuicideAttemptHistory,SuicideThoughtHistory,SubstanceAbuseProblem,MentalHealthProblem,...,SchoolProblem,RecentCriminalLegalProblem,SuicideNote,SuicideIntentDisclosed,DisclosedToIntimatePartner,DisclosedToOtherFamilyMember,DisclosedToFriend,InjuryLocationType,WeaponType1,text_length
0,bjnj,Victim is a XX XX. Victim was diagnosed with b...,"Victim1, a XX XX XX was found hanging in a loa...",0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,6,6,386


In [30]:
def tokenize_batch(df):
    """
    Inputs:
    df - A batch of prompts in a pandas dataframe
    
    Outputs:
    inputs - a a dict containing the tokenized batch
    """
    
    prompt_list = list(df['prompts'])
    
    # Get the num CPU cores available
    num_workers = os.cpu_count()
    
    # Tokenize the prompts
    inputs = tokenizer(prompt_list, 
                       return_tensors="pt",
                        #padding=True, 
                       #num_workers=num_workers
                      )
    
    return inputs

In [31]:
def run_slm_with_batches(inputs):

    # Send the inputs to the device
    inputs = inputs.to(DEVICE)

    # Generate the outputs from prompt
    generate_ids = model.generate(**inputs, 
                                  max_new_tokens=512,
                                  do_sample=False,
                                  temperature=0.1
                                 )
    
    # Decode the generated output
    generated_text_list = tokenizer.batch_decode(generate_ids, 
                                        skip_special_tokens=False,
                                        clean_up_tokenization_spaces=False)
    
    
    # Extract the answer
    # -------------------
    
    response_list = []
    
    for i in range(0, len(generated_text_list)):
        
        response = generated_text_list[i]
        
        # Extract the answer
        # Split
        response = response.split('<|assistant|>')[1]

        # Remove leading and trailing spaces
        response = response.strip()
        
        response_list.append(response)
        
    # Create a new column containing the raw responses   
    df['raw_responses'] = response_list
        
    return df
    

In [32]:
# Post process the responses

def post_process_responses(df):
    
    master_key_list = [
    "DepressedMood",
    "MentalIllnessTreatmentCurrnt", 
    "HistoryMentalIllnessTreatmnt",
    "SuicideAttemptHistory", 
    "SuicideThoughtHistory",
    "SubstanceAbuseProblem", 
    "MentalHealthProblem", 
    "DiagnosisAnxiety",
    "DiagnosisDepressionDysthymia", 
    "DiagnosisBipolar", 
    "DiagnosisAdhd",
    "IntimatePartnerProblem", 
    "FamilyRelationship", 
    "Argument",
    "SchoolProblem", 
    "RecentCriminalLegalProblem", 
    "SuicideNote",
    "SuicideIntentDisclosed", 
    "DisclosedToIntimatePartner",
    "DisclosedToOtherFamilyMember", 
    "DisclosedToFriend",
    "InjuryLocationType",
    "WeaponType1"
    ]

    
    uid_list = []

    DepressedMood_list = []
    MentalIllnessTreatmentCurrnt_list = []
    HistoryMentalIllnessTreatmnt_list = []
    SuicideAttemptHistory_list = []
    SuicideThoughtHistory_list = []
    SubstanceAbuseProblem_list = []
    MentalHealthProblem_list = []

    DiagnosisAnxiety_list = []
    DiagnosisDepressionDysthymia_list = []
    DiagnosisBipolar_list = []
    DiagnosisAdhd_list = []

    IntimatePartnerProblem_list = []
    FamilyRelationship_list = []
    Argument_list = []
    SchoolProblem_list = []
    RecentCriminalLegalProblem_list = []

    SuicideNote_list = []
    SuicideIntentDisclosed_list = []
    DisclosedToIntimatePartner_list = []
    DisclosedToOtherFamilyMember_list = []
    DisclosedToFriend_list = []

    InjuryLocationType_list = []
    WeaponType1_list = []

    
    for i in range(0, len(df)):
        
        try:
            uid = df.loc[i,'uid']

            response = df.loc[i, 'raw_responses']
            #response = data_dict['response']
            
            

            # Remove the end token
            response = response.replace('<|end|><|endoftext|>', "")
            response = response.replace('<|end|>', "")
            response = response.replace('```json', "")
            response = response.replace('```', "")


            # The model can output text in addition to JSON.
            # Use regular expression to extract JSON part from the response.
            # Regular expression to match one set of curly braces and everything inside
            pattern = r'\{[^}]*\}'

            # Find and extract the first match
            match = re.search(pattern, response)

            if match:
                response = match.group()  # Output: {extract this}

            response_json = json.loads(response)

            # Check if all keys are in the response an
            # that they can be accessed with an error.
            # This steps ensures that all arrays have
            # the same length
            check_list = []
            pred_json_key_list = response_json.keys()
            for item in master_key_list:
                if item not in pred_json_key_list:
                    check_list.append(0)


            if len(check_list) == 0:

                uid_list.append(uid)

                DepressedMood_list.append(response_json['DepressedMood'])
                MentalIllnessTreatmentCurrnt_list.append(response_json['MentalIllnessTreatmentCurrnt'])
                HistoryMentalIllnessTreatmnt_list.append(response_json['HistoryMentalIllnessTreatmnt'])
                SuicideAttemptHistory_list.append(response_json['SuicideAttemptHistory'])
                SuicideThoughtHistory_list.append(response_json['SuicideThoughtHistory'])
                SubstanceAbuseProblem_list.append(response_json['SubstanceAbuseProblem'])
                MentalHealthProblem_list.append(response_json['MentalHealthProblem'])

                DiagnosisAnxiety_list.append(response_json['DiagnosisAnxiety'])
                DiagnosisDepressionDysthymia_list.append(response_json['DiagnosisDepressionDysthymia'])
                DiagnosisBipolar_list.append(response_json['DiagnosisBipolar'])
                DiagnosisAdhd_list.append(response_json['DiagnosisAdhd'])

                IntimatePartnerProblem_list.append(response_json['IntimatePartnerProblem'])
                FamilyRelationship_list.append(response_json['FamilyRelationship'])
                Argument_list.append(response_json['Argument'])
                SchoolProblem_list.append(response_json['SchoolProblem'])
                RecentCriminalLegalProblem_list.append(response_json['RecentCriminalLegalProblem'])

                SuicideNote_list.append(response_json['SuicideNote'])
                SuicideIntentDisclosed_list.append(response_json['SuicideIntentDisclosed'])
                DisclosedToIntimatePartner_list.append(response_json['DisclosedToIntimatePartner'])
                DisclosedToOtherFamilyMember_list.append(response_json['DisclosedToOtherFamilyMember'])
                DisclosedToFriend_list.append(response_json['DisclosedToFriend'])

                InjuryLocationType_list.append(response_json['InjuryLocationType'])
                WeaponType1_list.append(response_json['WeaponType1'])


            else:
                print("There as an issue with JSON keys.")
                print("Predicting all ones...")
                print(response)
                
                uid_list.append(uid)

                DepressedMood_list.append('yes')
                MentalIllnessTreatmentCurrnt_list.append('yes')
                HistoryMentalIllnessTreatmnt_list.append('yes')
                SuicideAttemptHistory_list.append('yes')
                SuicideThoughtHistory_list.append('yes')
                SubstanceAbuseProblem_list.append('yes')
                MentalHealthProblem_list.append('yes')

                DiagnosisAnxiety_list.append('yes')
                DiagnosisDepressionDysthymia_list.append('yes')
                DiagnosisBipolar_list.append('yes')
                DiagnosisAdhd_list.append('yes')

                IntimatePartnerProblem_list.append('yes')
                FamilyRelationship_list.append('yes')
                Argument_list.append('yes')
                SchoolProblem_list.append('yes')
                RecentCriminalLegalProblem_list.append('yes')

                SuicideNote_list.append('yes')
                SuicideIntentDisclosed_list.append('yes')
                DisclosedToIntimatePartner_list.append('yes')
                DisclosedToOtherFamilyMember_list.append('yes')
                DisclosedToFriend_list.append('yes')

                InjuryLocationType_list.append(1)
                WeaponType1_list.append(5)

        
        except Exception as e:
            
            print('--Exception error--')
            print(e)

            print(f"uid: {uid}")
            print('Possible json error...')
            print('Predicting all ones')

            print(response)

            uid_list.append(uid)

            DepressedMood_list.append('yes')
            MentalIllnessTreatmentCurrnt_list.append('yes')
            HistoryMentalIllnessTreatmnt_list.append('yes')
            SuicideAttemptHistory_list.append('yes')
            SuicideThoughtHistory_list.append('yes')
            SubstanceAbuseProblem_list.append('yes')
            MentalHealthProblem_list.append('yes')

            DiagnosisAnxiety_list.append('yes')
            DiagnosisDepressionDysthymia_list.append('yes')
            DiagnosisBipolar_list.append('yes')
            DiagnosisAdhd_list.append('yes')

            IntimatePartnerProblem_list.append('yes')
            FamilyRelationship_list.append('yes')
            Argument_list.append('yes')
            SchoolProblem_list.append('yes')
            RecentCriminalLegalProblem_list.append('yes')

            SuicideNote_list.append('yes')
            SuicideIntentDisclosed_list.append('yes')
            DisclosedToIntimatePartner_list.append('yes')
            DisclosedToOtherFamilyMember_list.append('yes')
            DisclosedToFriend_list.append('yes')

            InjuryLocationType_list.append(1)
            WeaponType1_list.append(5)


    data = {
            "uid": uid_list,

            "DepressedMood": DepressedMood_list,
            "MentalIllnessTreatmentCurrnt": MentalIllnessTreatmentCurrnt_list,
            "HistoryMentalIllnessTreatmnt": HistoryMentalIllnessTreatmnt_list,
            "SuicideAttemptHistory": SuicideAttemptHistory_list,
            "SuicideThoughtHistory": SuicideThoughtHistory_list,
            "SubstanceAbuseProblem": SubstanceAbuseProblem_list,
            "MentalHealthProblem": MentalHealthProblem_list,

            "DiagnosisAnxiety": DiagnosisAnxiety_list,
            "DiagnosisDepressionDysthymia": DiagnosisDepressionDysthymia_list,
            "DiagnosisBipolar": DiagnosisBipolar_list,
            "DiagnosisAdhd": DiagnosisAdhd_list,

            "IntimatePartnerProblem": IntimatePartnerProblem_list,
            "FamilyRelationship": FamilyRelationship_list,
            "Argument": Argument_list,
            "SchoolProblem": SchoolProblem_list,
            "RecentCriminalLegalProblem": RecentCriminalLegalProblem_list,

            "SuicideNote": SuicideNote_list,
            "SuicideIntentDisclosed": SuicideIntentDisclosed_list,
            "DisclosedToIntimatePartner": DisclosedToIntimatePartner_list,
            "DisclosedToOtherFamilyMember": DisclosedToOtherFamilyMember_list,
            "DisclosedToFriend": DisclosedToFriend_list,

            "InjuryLocationType": InjuryLocationType_list,
            "WeaponType1": WeaponType1_list 
            }

    df_preds = pd.DataFrame(data)
    
    return df_preds
    
    

In [33]:
def tokenize_prompts(df):
    
    """
    Input: Pandas dataframe with a column named prompts.
    Output: Pandas dataframe with the input_ids and att_mask
    for each prompt.
    
    """
    
    # Convert pandas DataFrame to Huggingface Dataset
    dataset = Dataset.from_pandas(df)
    
    # Get the num CPU cores available
    num_workers = os.cpu_count()

    # Tokenize using the Huggingface dataset `map` function, without truncation
    def tokenize_function(examples):
        
        return tokenizer(examples['prompts'], 
                         padding=False,  # Add padding
                         truncation=False)  # Disable truncation
    
    # Apply tokenization across the dataset in parallel
    tokenized_dataset = dataset.map(tokenize_function, 
                                    batched=True, 
                                    num_proc=num_workers)
    
    # Convert Hugging Face Dataset to pandas DataFrame
    df = tokenized_dataset.to_pandas()
    
    return df

## Create the system message

In [34]:
system_message = f"""
You are an expert legal assistant.

You will be provided with a narrative about a person who has committed suicide. 
Your task is to answer questions about the content of the narrative. You must be able to support your
answers with facts from the narrative.

<categories>

### Answer only "yes" or "no":

1. DepressedMood: Was the victim was perceived to be depressed at the time?
2. MentalIllnessTreatmentCurrnt: Was the victim currently undergoing medical treatment for a mental health or substance abuse problem?
3. HistoryMentalIllnessTreatmnt: Did the victim have a history of ever being treated for a mental health or substance abuse problem?
4. SuicideAttemptHistory: Did the victim have a history of attempting suicide?
5. SuicideThoughtHistory: Did the victim have a history of suicidal thoughts or plans?
6. SubstanceAbuseProblem: Did the victim struggle with a substance abuse problem?
7. MentalHealthProblem: Did the victim have a mental health condition at the time?

8. DiagnosisAnxiety: Was the victim ever diagnosed with Anxiety?
9. DiagnosisDepressionDysthymia: Was the victim ever diagnosed with Dysthymia?
10. DiagnosisBipolar: Was the victim ever diagnosed as being Bipolar?
11. DiagnosisAdhd: Was the victim ever diagnosed with Adhd?

12. IntimatePartnerProblem: Did problems with a current or former intimate partner appear to have contributed to the victim's suicide?
13. FamilyRelationship: Did relationship problems with a family member (other than an intimate partner) appear to have contributed to the victim's suicide? 
14. Argument: Did an argument or conflict appear to have contributed to the victim's suicide?
15. SchoolProblem: Did problems at or related to school appear to have contributed to the victim's suicide? 
16. RecentCriminalLegalProblem: Did criminal legal problem(s) appear to have contributed to the victim's suicide?

17. SuicideNote: Did the victim leave a suicide note?
18. SuicideIntentDisclosed: Did the victim disclose their thoughts and/or plans to die by suicide to someone else within the last month? 
19. DisclosedToIntimatePartner: Did the victim disclose their intent to commit suicide to a previous or current intimate partner?
20. DisclosedToOtherFamilyMember: Did the victim disclose their intent to commit suicide to another family member?
21. DisclosedToFriend:  Did the victim disclose their intent to commit suicide to a friend?

### Answer by returning an integer:

22. InjuryLocationType: Where did the suicide take place?
    (Choose only one option and return only the number e.g. 1)
    1. House/apartment,
    2. Motor vehicle (excluding school bus and public transportation)
    3. Natural area (e.g., field, river, beaches, woods)
    4. Park, playground, public use area
    5. Street/road, sidewalk, alley
    6. Other
23. WeaponType1: What type of weapon/means did the victim use to commit suicide?
    (Choose only one option and return only the number e.g. 1)
    1. Blunt instrument
    2. Drowning
    3. Fall
    4. Fire or burns
    5. Firearm
    6. Hanging, strangulation, suffocation
    7. Motor vehicle including buses, motorcycles
    8. Other transport vehicle, eg, trains, planes, boats
    9. Poisoning
    10. Sharp instrument
    11. Other (e.g. taser, electrocution, nail gun)
    12. Unknown
</categories>
 
Format your response as JSON, without any prefixes, with the following 23 keys:
'DepressedMood','MentalIllnessTreatmentCurrnt', 'HistoryMentalIllnessTreatmnt',
'SuicideAttemptHistory', 'SuicideThoughtHistory',
'SubstanceAbuseProblem', 'MentalHealthProblem', 'DiagnosisAnxiety',
'DiagnosisDepressionDysthymia', 'DiagnosisBipolar', 'DiagnosisAdhd',
'IntimatePartnerProblem', 'FamilyRelationship', 'Argument',
'SchoolProblem', 'RecentCriminalLegalProblem', 'SuicideNote',
'SuicideIntentDisclosed', 'DisclosedToIntimatePartner',
'DisclosedToOtherFamilyMember', 'DisclosedToFriend',
'InjuryLocationType', 'WeaponType1'

### Example output:
{{
"DepressedMood": "<your-answer>",
"MentalIllnessTreatmentCurrnt": "<your-answer>", 
"HistoryMentalIllnessTreatmnt": "<your-answer>",
"SuicideAttemptHistory": "<your-answer>", 
"SuicideThoughtHistory": "<your-answer>",
"SubstanceAbuseProblem": "<your-answer>", 
"MentalHealthProblem": "<your-answer>", 
"DiagnosisAnxiety": "<your-answer>",
"DiagnosisDepressionDysthymia": "<your-answer>", 
"DiagnosisBipolar": "<your-answer>", 
"DiagnosisAdhd": "<your-answer>",
"IntimatePartnerProblem": "<your-answer>", 
"FamilyRelationship": "<your-answer>", 
"Argument": "<your-answer>",
"SchoolProblem": "<your-answer>", 
"RecentCriminalLegalProblem": "<your-answer>", 
"SuicideNote": "<your-answer>",
"SuicideIntentDisclosed": "<your-answer>", 
"DisclosedToIntimatePartner": "<your-answer>",
"DisclosedToOtherFamilyMember": "<your-answer>", 
"DisclosedToFriend": "<your-answer>",
"InjuryLocationType": "<integer>",
"WeaponType1": "<integer>"
}}
"""

## Create the prompts

In [35]:
def create_prompts(row):
    
    NarrativeLE = row['NarrativeLE']
    NarrativeCME = row['NarrativeCME']

    user_query = f"Law enforcement report: {NarrativeLE}\nMedical examiner report: {NarrativeCME}\nFormat your response as JSON, without any prefixes."

    # Create the prompt template
    prompt = f"<|system|>{system_message}<|end|><|user|>{user_query}<|end|><|assistant|>"
    
    return prompt

df_data['prompts'] = df_data.apply(create_prompts, axis=1)

#df_data.head(3)

In [36]:
#df_data.loc[0, 'prompts']

## Tokenize all the prompts

We will use a hf dataset instead of pandas to take advantage of multi-processing.

In [37]:
# Start timing
start_time = time.time()


# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Tokenize all the prompts, without adding padding.
df_data = tokenize_prompts(df_data)

print(df_data.shape)

# Get the inference time
elapsed_time = timer(start_time)
elapsed_time = elapsed_time/60
print(f"Time taken: {elapsed_time} minutes")

df_data.head(2)

Map (num_proc=4):   0%|          | 0/3900 [00:00<?, ? examples/s]

(3900, 31)
Time taken: 0.16833333333333333 minutes


Unnamed: 0,uid,NarrativeLE,NarrativeCME,DepressedMood,MentalIllnessTreatmentCurrnt,HistoryMentalIllnessTreatmnt,SuicideAttemptHistory,SuicideThoughtHistory,SubstanceAbuseProblem,MentalHealthProblem,...,DisclosedToIntimatePartner,DisclosedToOtherFamilyMember,DisclosedToFriend,InjuryLocationType,WeaponType1,text_length,fold,prompts,input_ids,attention_mask
0,aaaf,V (XX XX) shot himself in a motor vehicle.The ...,V (XX XX) shot himself in a motor vehicle.The ...,0,0,0,0,1,0,0,...,0,1,0,2,5,628,3.0,<|system|>\nYou are an expert legal assistant....,"[128000, 27, 91, 9125, 91, 397, 2675, 527, 459...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,aaby,V was XXXX. V was found in the basement of his...,V was XXXX. V was found in the basement of hi...,0,0,0,0,0,0,0,...,0,0,0,1,6,778,0.0,<|system|>\nYou are an expert legal assistant....,"[128000, 27, 91, 9125, 91, 397, 2675, 527, 459...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [38]:
def create_tokenized_batch_fast(df_tok):
    
    """
    Input:
    A batch as a pandas dataframe containing input_ids 
    and attention_mask for each prompt. 
    These are lists and they are not padded.
    
    Output:
    A tokenized batch that's ready for batch inference.
    """

    # 32000
    pad_token = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

    # Create a column with the length of each list
    def get_lengths(x):
        length = len(x)
        return length

    df_tok['length'] = df_tok['input_ids'].apply(get_lengths)

    # Get the length of the longest prompt in the batch
    max_len = df_tok['length'].max()
    

    # Pad the input_ids with the pad token
    def pad_input_ids(x):

        pad_token = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
        padded_list = np.pad(x, (0, max_len - len(x)), 
                              mode='constant', constant_values=pad_token)

        return padded_list

    df_tok['padded_input_ids'] = df_tok['input_ids'].apply(pad_input_ids)


    # Pad the attention_masks with 0
    def pad_attention_mask(x):

        padded_list = np.pad(x, (0, max_len - len(x)), 
                              mode='constant', constant_values=0)

        return padded_list

    df_tok['padded_attention_mask'] = df_tok['attention_mask'].apply(pad_input_ids)


    tokenized_batch = {
        "input_ids": torch.tensor(list(df_tok['padded_input_ids'])),
        "attention_mask": torch.tensor(list(df_tok['padded_attention_mask']))
    }

    return tokenized_batch

In [39]:
# Example of tokeninizing one batch

i = 0

# Reorder by text length
df_sorted = df_data.sort_values(by='text_length', ascending=True)
df_sorted = df_sorted.reset_index(drop=True)

# Assign a batch number to every row
batch_size = 5
df_sorted = assign_batch_numbers(df_sorted, batch_size)

# Get the batch
df = df_sorted[df_sorted['batch_number'] == i]
df = df.reset_index(drop=True)

print(df.shape)

df.head(1)

(5, 32)


Unnamed: 0,uid,NarrativeLE,NarrativeCME,DepressedMood,MentalIllnessTreatmentCurrnt,HistoryMentalIllnessTreatmnt,SuicideAttemptHistory,SuicideThoughtHistory,SubstanceAbuseProblem,MentalHealthProblem,...,DisclosedToOtherFamilyMember,DisclosedToFriend,InjuryLocationType,WeaponType1,text_length,fold,prompts,input_ids,attention_mask,batch_number
0,bjnj,Victim is a XX XX. Victim was diagnosed with b...,"Victim1, a XX XX XX was found hanging in a loa...",0,0,0,0,0,1,1,...,0,0,6,6,386,1.0,<|system|>\nYou are an expert legal assistant....,"[128000, 27, 91, 9125, 91, 397, 2675, 527, 459...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [40]:
tokenized_batch = create_tokenized_batch_fast(df)

tokenized_batch

{'input_ids': tensor([[128000,     27,     91,  ..., 128009, 128009, 128009],
         [128000,     27,     91,  ..., 128009, 128009, 128009],
         [128000,     27,     91,  ...,  78191,     91,     29],
         [128000,     27,     91,  ...,     91,     29, 128009],
         [128000,     27,     91,  ..., 128009, 128009, 128009]],
        dtype=torch.int32),
 'attention_mask': tensor([[1, 1, 1,  ..., 9, 9, 9],
         [1, 1, 1,  ..., 9, 9, 9],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 9],
         [1, 1, 1,  ..., 9, 9, 9]], dtype=torch.int8)}

## Run Inference

In [41]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Initialize the model and tokenizer

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map=DEVICE,
    torch_dtype=torch.bfloat16,
    #trust_remote_code=True
    local_files_only=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [44]:
# Start timing
start_time = time.time()


#--------------------------
# Select the fold
#--------------------------


# Select the fold
df_data = df_data[df_data['fold'] == TEST_FOLD]
df_data = df_data.reset_index(drop=True)

if RUN_TEST == True:
    #################
    # For testing only

    num_samples = NUM_SAMPLES

    df_data = df_data[0:num_samples]
    df_data = df_data.reset_index(drop=True)

    ################
    
    
#--------------------------
# Create batches
#--------------------------
    
# Save the original order
orig_order = list(df_data['uid'])

# Reorder by text length
df_sorted = df_data.sort_values(by='text_length', ascending=True)
df_sorted = df_sorted.reset_index(drop=True)

# Assign a batch number to every row
batch_size = BATCH_SIZE
df_sorted = assign_batch_numbers(df_sorted, batch_size)
    
# Get the number of batches by getting
# the number of unique batch numbers.
batch_num_list = list(df_sorted['batch_number'].unique())
num_batches = len(batch_num_list)


#--------------------------
# Tokenize each batch
#--------------------------


tokenized_batch_list = []

# Tokenizing is done on the CPU.
# Tokening batches outside the main inference loop
# sppeds up inference because the GPU does not need to
# wait tokenized batches from the CPU.

# Each batch has a different padded length.
# Doing this one batch at time means batches
# wth shorter lengths will be processed faster.
print('Tokenizing all the batches...')
for i in tqdm(range(0, num_batches)):
    
    # Get the batch
    df = df_sorted[df_sorted['batch_number'] == i]
    df = df.reset_index(drop=True)
    
    #tokenized_batch = tokenize_batch(df)
    tokenized_batch = create_tokenized_batch_fast(df)

    tokenized_batch_list.append(tokenized_batch)
    

#--------------------------
# Run inference
#--------------------------


print('Running inference on each batch...')
for i in tqdm(range(0, num_batches)):
    
    inputs = tokenized_batch_list[i]
    
    # Convert the dictionary to a BatchEncoding object
    inputs = BatchEncoding(inputs)
    
    inputs = inputs.to(DEVICE)
        
        
    df_response = run_slm_with_batches(inputs)
    
    if i == 0:
        df_fin = df_response
    else:
        df_fin = pd.concat([df_fin, df_response])
        
    torch.cuda.empty_cache()
    gc.collect()
    gc.collect()
    
        

# Get the inference time
elapsed_time = timer(start_time)
elapsed_time = elapsed_time/60
print(f"Time taken: {elapsed_time} minutes")

Tokenizing all the batches...


100%|██████████| 50/50 [00:00<00:00, 226.68it/s]


Running inference on each batch...


  0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 1/50 [02:21<1:55:51, 141.87s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 2/50 [03:25<1:16:47, 95.99s/it] Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 2/50 [04:02<1:37:00, 121.25s/it]


KeyboardInterrupt: 

In [None]:
print(df_fin.shape)

#df_fin.head()

In [None]:
prompt = "Hello. How are you"
inputs = tokenizer(prompt, return_tensors="pt")

type(inputs)

## Post process the raw responses

In [None]:
# Create a labels dataframe

cols_to_drop = ["NarrativeLE", 
                "NarrativeCME", 
                "text_length", 
                "fold", "prompts"]

df_labels = df_data.drop(cols_to_drop, axis=1)

df_labels = df_labels.set_index('uid')

df_labels.head()

In [None]:
# Chane the row order back to the original order
# that existed before sorting by text length.

# Set the "uid" column as the index
df_fin = df_fin.set_index('uid')

# Go back to the orginal row order
df_fin1 = df_fin.reindex(orig_order)

# Reset the index while keeping the "uid" column
df_fin1 = df_fin1.reset_index(drop=False)

#df_fin1.head()

In [None]:
df_labels.head()

In [None]:
# Post process the response
df_preds = post_process_responses(df_fin1)


# Save the raw preds for analysis later
path = "df_raw_preds.csv"
df_preds.to_csv(path, index=False)


# Change the values from yes or no
# to 0 or 1

# Define the mapping function
def yes_no_to_binary(x):
    if type(x) == str:
        x = x.lower()
        if x == 'yes':
            return 1
        elif x == 'no':
            return 0
        else:
            return 0 # In case the model outputs "unknown" or "none"
    else:
        return x

    
df_preds = df_preds.set_index('uid')

# Apply the function to the entire dataframe
df_preds = df_preds.applymap(yes_no_to_binary)

# Make sure all values are int and not NaN
df_preds = df_preds.round().astype(int)

# Check that there are 23 target variables
assert df_preds.shape[1] == 23

# Check that column order and row order are the same
assert (df_preds.index == df_labels.index).all()

# All values should be integers
assert (df_preds.dtypes == int).all()

df_preds.head()

## Evaluate the predictions

In [None]:
# [1] Get the baseline accuracy aand f1 score if
# the model predicted all zeros for
# the binary features and predicted the majority class for
# 'InjuryLocationType' (1) and 'WeaponType1' (5)

# Convert to numpy
np_labels = df_labels.to_numpy()
np_preds = df_preds.to_numpy()


# Create a matrix with all values 0
np_preds_baseline = np_preds * 0

"""
# Biased probabilities

num_rows = df_labels.shape[0]
num_cols = df_labels.shape[1]

# Define the probabilities for 0 and 1
probabilities = [0.2, 0.8]  # 80% chance of 0, 20% chance of 1

# Create the array with biased probabilities
np_preds_baseline = np.random.choice([0, 1], 
                                     size=(num_rows, num_cols), 
                                     p=probabilities)
"""

# Set the second last column to 1 (InjuryLocationType)
np_preds_baseline[:, -2] = 1
# Set the last column to 5 (WeaponType1)
np_preds_baseline[:, -1] = 5

# Get the total number of entries in the dataframe
total = np_labels.size

# Element-wise comparison
matches = np_labels == np_preds_baseline

# Count number of matches
num_matches = np.sum(matches)

# Accuracy
acc = num_matches/total

# Calculate f1 score
excluded_list = ['InjuryLocationType', 'WeaponType1']

# Get a list of columns
col_list = list(df_preds.columns)

# Make a copy of df_preds
df_preds_zeros = df_preds.copy()

# Set column values to 0
# Remember the 'uid' column is the index
for item in col_list:
    if item not in excluded_list:
        # Set all values to 0
        #df_preds_zeros[item] = 0
        
        # The biased_coin function returns either 0 or 1.
        # p_zero is the probability that the function will return 0.
        df_preds_zeros[item] = biased_coin(p_zero=0.0)

# Set values to the majority class
df_preds_zeros['InjuryLocationType'] = 1
df_preds_zeros['WeaponType1'] = 5

f1_score_avg = calculate_f1(df_preds_zeros, df_labels)

print(f"F1 score (pred ones): {f1_score_avg}")
print(f"Accuracy (pred zeros): {acc}")
print(f"Total: {total}")
print(f"Num matches: {num_matches}")

In [None]:
# [2] Get the model pred accuracy and f1 score

# Convert to numpy
np_labels = df_labels.to_numpy()
np_preds_model = df_preds.to_numpy()

# Get the total number of entries in the dataframe
total = np_labels.size

# Element-wise comparison
matches = np_labels == np_preds_model

# Count number of matches
num_matches = np.sum(matches)


# Accuracy
acc = num_matches/total

f1_score_avg = calculate_f1(df_preds, df_labels)

print(f"F1 score: {f1_score_avg}")
print(f"Accuracy: {acc}")
print(f"Total: {total}")
print(f"Num matches: {num_matches}")

## Save the preds

In [None]:
# Save the dataframes for analysis later

path = "df_preds.csv"
df_preds.to_csv(path, index=False)

!ls

## Check if df_preds meets the submission requirements

In [None]:
path = base_path + 'submission_format_OfwLSFE.csv'
df_sample = pd.read_csv(path)

df_sample = df_sample[0:NUM_SAMPLES]

df_sample = df_sample.set_index('uid')

print(df_sample.shape)

df_sample.head()

In [None]:
df_preds.index = df_sample.index

df_preds.head()

In [None]:
## Ensure the range of the predicted values is correct

# predictions["InjuryLocationType"] = 1 (1 to 6)
# predictions["WeaponType1"] = 5 (1 to 12)

def check_col_InjuryLocationType(x):
    
    if x < 1:
        return 1
    elif x > 6:
        return 1
    else:
        return x
    
df_preds['InjuryLocationType'] = df_preds['InjuryLocationType'].apply(check_col_InjuryLocationType)
    

In [None]:
# predictions["InjuryLocationType"] = 1 (1 to 6)
# predictions["WeaponType1"] = 5 (1 to 12)

def check_col_WeaponType1(x):
    
    if x < 1:
        return 5
    elif x > 12:
        return 5
    else:
        return x
    
df_preds['WeaponType1'] = df_preds['WeaponType1'].apply(check_col_WeaponType1)
 

In [None]:
# Check that the binary features only
# have values that are either 1 or 0

def check_binary_col(x):
    
    answer_list = [0, 1]
    
    if x not in answer_list:
        return 0
    else:
        return x
    
    
col_list = ['DepressedMood',
       'MentalIllnessTreatmentCurrnt', 'HistoryMentalIllnessTreatmnt',
       'SuicideAttemptHistory', 'SuicideThoughtHistory',
       'SubstanceAbuseProblem', 'MentalHealthProblem', 'DiagnosisAnxiety',
       'DiagnosisDepressionDysthymia', 'DiagnosisBipolar', 'DiagnosisAdhd',
       'IntimatePartnerProblem', 'FamilyRelationship', 'Argument',
       'SchoolProblem', 'RecentCriminalLegalProblem', 'SuicideNote',
       'SuicideIntentDisclosed', 'DisclosedToIntimatePartner',
       'DisclosedToOtherFamilyMember', 'DisclosedToFriend']

# Check every binary column
for col in col_list:
    
    df_preds[col] = df_preds[col].apply(check_binary_col)
    
df_preds.head()

In [None]:
# Check that there are 23 target variables
assert df_preds.shape[1] == 23

# Check that column order and row order are the same
assert (df_preds.index == df_sample.index).all()

# All values should be integers
assert (df_preds.dtypes == int).all()

In [None]:
# Columns are in the correct order
assert (df_sample.columns == df_preds.columns).all().all()

# All columns are of type int
assert (df_preds.dtypes == int).all()

In [None]:
# Variables have values within the expected range
assert df_preds.iloc[:, 0:-2].isin([0, 1]).all().all()
assert (df_preds["InjuryLocationType"].isin(range(1, 7))).all()
assert (df_preds["WeaponType1"].isin(range(1, 13))).all()

In [None]:
# Create a requirements.txt file

!pip freeze > requirements.txt