# LLM Classifiers

**Goal:** Given a sentence as input, classify it as either a prediction or non-prediction.

In [1]:
import os
import sys

import pandas as pd

from tqdm import tqdm

# The kernel's working directory is used as the notebook's location.
notebook_dir = os.getcwd()
# Register the parent folder on sys.path (presumably where the project
# modules imported below live -- confirm against the repository layout).
parent_dir = os.path.join(notebook_dir, '../')
sys.path.append(parent_dir)

# import log_files
from data_processing import DataProcessing
from classification_models import EvaluationMetric
from text_generation_models import TextGenerationModelFactory

In [2]:
# Display settings: allow up to 800 characters per cell and remove the
# limits on how many rows/columns are shown.
for option_name, option_value in (
    ('max_colwidth', 800),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

## Load Data

In [3]:
# Banner line written to the cell output to mark the start of data loading.
print("======= LOAD DATA =======")



In [4]:
# File locations of the test split (features and labels) under ../data/.
base_data_path = os.path.join(notebook_dir, '../data/')
combine_data_path = os.path.join(base_data_path, 'combined_generated_fin_phrase_bank')
X_test_set_path, y_test_set_path = (
    os.path.join(combine_data_path, csv_name)
    for csv_name in ('x_test_set-v1.csv', 'y_test_set-v1.csv')
)

In [5]:
# Read the test features and discard the 'Unnamed: 0' column in one
# expression, then preview the first seven rows.
X_test_df = (
    DataProcessing.load_from_file(X_test_set_path, 'csv')
    .drop(columns=['Unnamed: 0'])
)
X_test_df.head(7)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings
0,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,1,[-1.22568220e-01 2.48624131e-01 -4.82655168e-02 -1.09789923e-01\n -8.58348906e-02 3.63291465e-02 -1.27964811e-02 7.17335939e-02\n -1.18364163e-01 2.27930522e+00 -3.32476139e-01 -1.46164745e-02\n 1.25499114e-01 7.25146085e-02 6.57393187e-02 -5.76901138e-02\n -2.05907249e-03 1.51917112e+00 -3.29480022e-01 -4.57408801e-02\n 2.22120117e-02 1.05339624e-01 -4.22692373e-02 -1.23566784e-01\n -5.79119613e-03 6.18153140e-02 -5.48806489e-02 2.96797305e-02\n -1.07008472e-01 -9.98812243e-02 -9.39770564e-02 3.87728140e-02\n -5.82127571e-02 7.14930072e-02 9.65250358e-02 -3.75642404e-02\n 7.18023349e-03 6.21852428e-02 7.68979266e-02 -1.69069305e-01\n 1.44475931e-03 7.71993026e-02 3.41709815e-02 -1.10992551e-01\n -1.81387588e-02 -8.61631632e-02 -2.34903637e-02 6.00658916e-02\n -3....,[-0.14791287 -0.12622888 -0.9597454 -1.5068511 -1.6452323 1.2114074\n -0.1530719 0.91836745 -2.1379716 1.7602677 -0.42004758 -0.5548834\n 1.1031741 1.2786088 1.1086601 -0.413204 0.30347356 2.9280567\n -3.59768 -0.35298532 -0.12096298 0.67678666 -0.6748305 -0.4839827\n -0.71752447 -0.10067806 1.1003746 0.07561237 -2.1959317 -2.6175833\n -1.565941 0.42524984 -0.871213 -0.02143001 0.83343935 -0.2905648\n 0.4480813 0.1682285 1.4640632 -2.2469096 -0.02232814 0.45951518\n -0.6981451 -1.4169854 -0.70628285 -1.45749 0.5126216 1.2621828\n -1.1238234 2.1485727 2.5964377 -1.829683 1.5837685 -0.04823298\n 1.5030391 0.06060527 -0.7219292 -2.4253752 -1.7692105 -0.9184389\n -0.16441473 -0.5413762 -0.07175377 1.7040485 1.2071314 -0....
1,"The American Heart Association predicts on November 1, 2029, the obesity rates at the national level may rise.",1,[-1.44551620e-01 3.84713501e-01 4.10299003e-02 -1.51987616e-02\n -1.00560952e-02 -6.73255771e-02 1.77811161e-02 4.57233824e-02\n 8.39890912e-02 2.01573181e+00 -3.67580205e-01 -5.70408031e-02\n 1.22891583e-01 6.99080601e-02 1.97983291e-02 -5.66798002e-02\n 4.76624072e-03 1.21926570e+00 -1.45055830e-01 -3.63234729e-02\n 1.35041941e-02 -3.28723826e-02 -2.80004255e-02 -1.84945613e-01\n 3.26813050e-02 8.54878053e-02 -1.49334148e-01 4.50391369e-03\n 3.68039422e-02 1.91185996e-01 2.18118466e-02 1.32588789e-01\n 8.98898989e-02 1.37999743e-01 1.21656165e-01 6.33542910e-02\n -1.06478065e-01 8.26832876e-02 -4.30808812e-02 1.46262258e-01\n -2.19029468e-02 4.71463166e-02 -2.72156689e-02 -2.27241516e-02\n 8.14019889e-02 -8.10244754e-02 -7.50189573e-02 -1.04098104e-01\n 6....,[-0.41808617 1.70707 0.48293787 0.04548473 -0.63167155 -0.43579143\n 0.42126647 0.5806888 0.5016804 0.8586611 -0.9072807 -1.092698\n 1.0650555 1.2340899 0.6353464 -0.39396188 0.42907965 0.8229404\n -0.52147204 -0.19341081 -0.3010331 -1.3742509 -0.43909413 -1.3607903\n -0.12759547 0.21505134 -0.35859266 -0.35160506 0.23767695 1.5916128\n 0.55531317 1.9194982 1.4566973 0.9181272 1.3417342 1.4131705\n -1.0620697 0.4907245 -0.60323673 2.487476 -0.44338858 -0.01337191\n -1.5358676 0.08269311 1.17976 -1.3706315 -0.20932262 -0.7469172\n 0.5761304 -0.17463458 -1.0923884 0.02983202 0.9992851 -0.7784116\n 0.1491646 0.0532804 1.4128524 1.1143323 -0.23840122 -0.9047452\n -0.31026492 1.8315382 1.4846493 0.20744875 -1.2032983 -0.0...
2,"On 2025-06-01, Meteorologist Emily Chen speculates that the temperature at Dallas will likely increase.",1,[-1.68004587e-01 3.62666279e-01 -5.09679969e-03 -1.52042462e-02\n -3.66210565e-02 -1.76503416e-02 2.71140393e-02 1.10233976e-02\n -1.17255524e-01 1.37631774e+00 -2.74166286e-01 -1.44476595e-03\n -4.53882962e-02 -9.81117859e-02 -1.77818686e-01 2.31416486e-02\n -6.66156188e-02 1.01895070e+00 -1.28533006e-01 -3.99891064e-02\n 7.87591264e-02 1.45874396e-01 1.31233614e-02 -1.26720756e-01\n 1.07190073e-01 1.09603144e-01 -1.19070552e-01 -7.85431415e-02\n 1.93831511e-02 -6.62952056e-03 -1.78378751e-03 1.02619015e-01\n -1.82438586e-02 8.49216431e-02 5.77713624e-02 9.70404297e-02\n -3.63013409e-02 1.87408879e-01 3.40722017e-02 -1.12317458e-01\n 5.41595705e-02 -3.05524506e-02 1.15260363e-01 4.48842719e-02\n 1.96017008e-02 7.55868405e-02 -1.14714839e-01 -1.33361071e-01\n 9....,[-0.7063202 1.4100657 -0.2622988 0.04539472 -0.9869847 0.35360813\n 0.59656656 0.13019477 -2.1235096 -1.3285851 0.38927436 -0.38790524\n -1.394971 -1.6356308 -1.4006314 1.126294 -0.8845593 -0.58312404\n -0.24587017 -0.25552362 1.0483826 1.2783142 0.24031584 -0.5290378\n 1.0149059 0.53668714 0.10887118 -1.7608601 -0.05711947 -1.2690471\n 0.12304088 1.4421564 -0.242973 0.16827968 0.04961883 1.9818693\n -0.12964809 2.1383736 0.72615266 -1.3948377 0.92834866 -1.2359707\n 0.40845376 1.2313589 0.00880243 1.2765502 -0.7654839 -1.1050483\n 1.0373546 1.3464414 0.5825578 1.6416067 -0.5531219 -0.08676253\n -1.0779505 0.5230468 0.73252934 -0.7134861 0.7503297 -0.11281329\n -0.25929123 -0.15576474 0.56197953 1.0467868 -0.13869132 ...
3,"In 08/2024, Coach Michael Brown envisions that the touchdown rate at the New England Patriots has some probability to remain stable.",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-01 -4.13006060e-02\n 1.10420622e-01 -4.95679379e-02 1.91853363e-02 3.95639287e-03\n 1.09274819e-01 1.97997952e+00 -1.85330868e-01 -4.54250947e-02\n 1.08167037e-01 2.54211240e-02 -9.49313119e-03 -6.84713572e-02\n 1.55237708e-02 1.00765920e+00 -6.00808859e-03 -4.15705629e-02\n -7.41642118e-02 -1.50733357e-02 1.34443464e-02 -8.06241706e-02\n -7.69194737e-02 1.14793696e-01 -2.15540364e-01 6.73782602e-02\n 3.61875258e-02 1.65809989e-02 -4.40017954e-02 2.61284299e-02\n 4.48011309e-02 -9.68891475e-03 3.49024460e-02 9.65972338e-03\n -2.66898591e-02 7.46985758e-03 -6.69648051e-02 -3.72406021e-02\n 7.12334365e-02 6.75264001e-02 7.85919055e-02 -7.79033080e-03\n 4.30065207e-02 -5.40831983e-02 -1.17614612e-01 -2.35915799e-02\n -4....,[ 9.94810045e-01 -9.19510782e-01 -2.08958673e+00 -3.82872701e-01\n 9.79735315e-01 -1.53601065e-01 4.47641909e-01 3.84470709e-02\n 8.31526935e-01 7.36362994e-01 1.62228131e+00 -9.45445299e-01\n 8.49802375e-01 4.74268407e-01 3.33566934e-01 -6.18540466e-01\n 6.27050281e-01 -6.62382126e-01 1.79785383e+00 -2.82320827e-01\n -2.11394024e+00 -1.11011660e+00 2.45618850e-01 1.29460186e-01\n -1.80819011e+00 6.05915606e-01 -1.38124096e+00 7.15332150e-01\n 2.27245882e-01 -9.33393896e-01 -6.50393546e-01 2.23857194e-01\n 7.47982144e-01 -1.16830730e+00 -4.12921190e-01 5.06683528e-01\n -1.94259407e-03 -6.92609370e-01 -1.01476967e+00 -2.67634422e-01\n 1.23626447e+00 3.07310998e-01 -9.19480622e-02 3.36418450e-01\n 4.52263981e-01 -9.15246308e-01 -8.06111395e-01 2.38351554e-01\n -1....
4,The World Health Organization forecasts that the prevalence of chronic illnesses at urban health centers in Africa will potentially decrease in Q2 2028.,1,[-5.03914356e-02 2.15420738e-01 -1.12491660e-03 -5.48182838e-02\n -7.30580278e-03 -8.38688090e-02 -3.31079178e-02 1.37264416e-01\n 7.59466290e-02 2.07987285e+00 -4.77793008e-01 -8.09150860e-02\n 1.46376118e-01 5.20625291e-03 1.18821286e-01 7.48783117e-03\n 2.80007478e-02 1.24036634e+00 -1.96656108e-01 -9.84601304e-02\n -1.73127651e-02 1.05665714e-01 7.32700946e-03 -1.54086307e-01\n 9.32330862e-02 -5.02766855e-03 -1.62296761e-02 4.41843234e-02\n 5.19741364e-02 7.68252015e-02 -8.63392949e-02 -1.01145485e-03\n -1.59870088e-02 -3.05441557e-03 6.90341443e-02 1.68825258e-02\n 3.01963221e-02 9.55348555e-03 -2.98405183e-03 1.11071460e-01\n 5.51053137e-02 1.80116713e-01 -3.86907905e-02 -6.98386356e-02\n -1.26219615e-02 -1.10061742e-01 -1.02911144e-01 9.44003556e-03\n 7....,[ 0.7391309 -0.5735213 -0.19812785 -0.60471135 -0.59488565 -0.6986834\n -0.5345811 1.7691242 0.3967683 1.0780686 -2.4369988 -1.3953531\n 1.4083679 0.12900542 1.6555448 0.82815605 0.8566638 0.97105145\n -1.382168 -1.2462938 -0.93830127 0.6816257 0.14455377 -0.91995955\n 0.8008925 -0.9921894 1.6973933 0.32174596 0.49438855 -0.06218739\n -1.426017 -0.20841168 -0.20749934 -1.07458 0.2774166 0.6286209\n 0.75389063 -0.65982765 0.08765337 1.959122 0.94540447 2.078932\n -1.6924646 -0.71778107 -0.60175365 -1.8614447 -0.6001076 0.64260477\n -0.42163628 -1.3814554 -2.561416 -1.3002754 2.1068804 -2.3705075\n 0.52491736 1.2900217 2.3356287 0.17898723 0.69993216 -0.9291909\n 0.01578976 0.5728379 -0.21111177 -1.0165362 -0.0221873 0...
5,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",1,[ 2.75615864e-02 1.79783881e-01 -1.21603109e-01 -1.75067633e-01\n 9.83876437e-02 -2.84420680e-02 7.04421336e-03 -1.00048631e-01\n -7.77818933e-02 2.37184715e+00 -2.96325952e-01 4.51707616e-02\n 7.32708871e-02 3.28064598e-02 -9.75645557e-02 -1.34704620e-01\n -1.46236047e-02 1.19266689e+00 -1.20833769e-01 -2.11537201e-02\n 4.45679054e-02 2.58689113e-02 -2.11139992e-02 -4.58938554e-02\n -1.82265490e-02 6.69947565e-02 -9.96832177e-02 1.51548507e-02\n 1.27175581e-02 -8.47315788e-02 -2.66740546e-02 2.18095612e-02\n 6.70658797e-02 1.45969734e-01 1.41264692e-01 6.51224284e-04\n 4.73446399e-02 -1.93405268e-03 3.11274361e-02 -1.58938453e-01\n 2.45406702e-02 6.88584819e-02 1.01522245e-01 -1.50205001e-01\n -1.53623046e-02 4.58216853e-03 -6.17812127e-02 -9.76008475e-02\n 3....,[ 1.697164 -1.0535955 -2.1446092 -2.5781236 0.81879115 0.18211456\n 0.21959545 -1.3118026 -1.6085852 2.0768263 0.08170533 0.2030413\n 0.33966735 0.60040736 -0.57380164 -1.8799998 0.07224872 0.63623625\n -0.1174464 0.06363511 0.34133762 -0.50254226 -0.32532284 0.6255889\n -0.9082055 -0.03159778 0.4083359 -0.1708655 -0.16991533 -2.3985007\n -0.3329491 0.15506867 1.0979444 1.0307212 1.7383305 0.35459948\n 0.9817363 -0.8405612 0.6754128 -2.0948052 0.39419135 0.32827145\n 0.22097439 -2.0832043 -0.6536761 0.07636706 -0.02385446 -0.6674014\n 0.10471608 1.1564453 -0.03240437 -0.64257437 -0.21797852 1.4896353\n 0.58393174 -0.34075728 0.2807242 -0.95936525 0.8074142 -0.94765747\n 1.8481007 1.4760867 0.9476612 0.77331966 0.91316414 ...
6,Business analyst John Lee forecasts that the stock prices at Tesla potentially decrease in Q3 of 2027.,1,[-2.22333133e-01 3.22087437e-01 1.42390534e-04 -2.18413901e-02\n 9.39540714e-02 -7.34969378e-02 -4.11278866e-02 -2.74882093e-02\n 1.74191091e-02 1.52008379e+00 -2.78916240e-01 -2.06297962e-03\n 1.11551337e-01 -7.08137602e-02 3.56459692e-02 -8.93433914e-02\n -9.73468423e-02 1.07365060e+00 -1.60745785e-01 -9.21461061e-02\n 1.05825521e-01 1.22747384e-01 2.82821562e-02 -1.95028305e-01\n -3.94854397e-02 1.89628348e-01 -1.89112768e-01 1.43933043e-01\n 4.67721671e-02 1.47432074e-01 -3.41654308e-02 2.31039850e-03\n 1.68670043e-02 1.55604839e-01 3.69217210e-02 1.53632299e-03\n 7.36413673e-02 1.64716244e-01 -6.01373985e-02 -3.98584083e-02\n -5.29247187e-02 7.24112317e-02 1.47015095e-01 1.28496990e-01\n 2.34927218e-02 -2.07573306e-02 -1.13614216e-01 -1.67543307e-01\n 2....,[-1.3740114 0.8634164 -0.17765288 -0.06352746 0.759491 -0.5338618\n -0.68522006 -0.3697838 -0.3667099 -0.83680415 0.32334653 -0.39574236\n 0.8992763 -1.1693901 0.79861903 -1.0160631 -1.4501053 -0.19917087\n -0.78318155 -1.139305 1.6080922 0.93511415 0.4907557 -1.5048233\n -1.2341847 1.604015 -0.9730295 2.0144176 0.4063603 0.9588764\n -0.47019118 -0.15550308 0.30890736 1.1668388 -0.37207994 0.36954197\n 1.3311347 1.7813501 -0.8971298 -0.30693814 -1.0028458 0.38417435\n 0.8417996 2.6519375 0.08252735 -0.35194334 -0.7500636 -1.5233831\n -0.07112313 -1.3734363 0.8497959 -1.2029432 0.32492235 -0.01931417\n 0.04110937 -1.073092 -1.0687623 -1.2955794 -0.30558684 -2.4666204\n -1.3809797 -1.1326911 0.30132827 0.6193158 1.6925582 ...


In [6]:
# Read the test labels, discard the 'Unnamed: 0' column, and print the
# first seven rows.
y_test_df = (
    DataProcessing.load_from_file(y_test_set_path, 'csv')
    .drop(columns=['Unnamed: 0'])
)
print(f"\t{y_test_df.head(7)}")

	   Sentence Label
0               1
1               1
2               1
3               1
4               1
5               1
6               1


## Load Prompt

In [7]:
# prediction_properties = PredictionProperties.get_prediction_properties()
# prediction_requirements = PredictionProperties.get_requirements()
# system_identity_prompt = "You are an expert at identifying specific types of sentences by knowing the sentence format."
# prediction_examples_prompt = """Some examples of predictions in the PhraseBank dataset are
#     1. According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
#     2. According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .
#     3. Its board of directors will propose a dividend of EUR0 .12 per share for 2010 , up from the EUR0 .08 per share paid in 2009 .
# """
# non_prediction_examples_prompt = """Some examples of non-predictions in the PhraseBank dataset are
#     1. Net sales increased to EUR193 .3 m from EUR179 .9 m and pretax profit rose by 34.2 % to EUR43 .1 m. ( EUR1 = USD1 .4 )
#     2. Net sales surged by 18.5 % to EUR167 .8 m. Teleste said that EUR20 .4 m , or 12.2 % , of the sales came from the acquisitions made in 2009 .
#     3. STORA ENSO , NORSKE SKOG , M-REAL , UPM-KYMMENE Credit Suisse First Boston ( CFSB ) raised the fair value for shares in four of the largest Nordic forestry groups .
# """
# # goal_prompt = "Given the above, identify the prediction."

# base_prompt = f"""{system_identity_prompt} The sentence format is based on: 
    
#     {prediction_properties}
#     Enforce: {prediction_requirements}
#     Know: {prediction_examples_prompt}
#     Know: {non_prediction_examples_prompt}

# """
# base_prompt

In [8]:
# Role/background text prepended to every classification request.
# "I am going to the store." is listed as a NON-prediction so that this text
# agrees with the few-shot examples in llm_certifier, which label that
# sentence 0; listing it as a prediction gave the model contradictory guidance.
prompt_1 = """ 

Role: 
You are a linguist expert. You are acting as a prediction detector. Your task is to identify if a given sentence is a prediction about the future.

Background:
A prediction is a statement about what someone thinks will happen in the future.
Examples of predictions:
- "It will rain tomorrow."
- "The stock market is expected to rise next quarter."
- “Lakers will win the championship.”

Examples of non-predictions:
- "I am going to the store."

A prediction may contain: source, target, date, outcome.
"""

## Models

In [9]:
# Factory used to build one wrapper object per model identifier.
tgmf = TextGenerationModelFactory()

# Groq Cloud (https://console.groq.com/docs/overview)
llama_318b_instant_generation_model, llama_3370b_versatile_generation_model = (
    tgmf.create_instance(groq_model_id)
    for groq_model_id in ('llama-3.1-8b-instant', 'llama-3.3-70b-versatile')
)

# Models queried for every sentence; the commented line is a single-model variant.
# models  = [llama_318b_instant_generation_model]
models = [llama_318b_instant_generation_model, llama_3370b_versatile_generation_model]


# NaviGator
# llama_31_70b_instruct = tgmf.create_instance('llama-3.1-70b-instruct') 
# mistral_small_31 = tgmf.create_instance('mistral-small-3.1') 
# models = [llama_31_70b_instruct, mistral_small_31]

In [10]:
import json
import re

def parse_json_response(response):
    """Parse JSON response from LLM to extract label and reasoning.

    Everything from the first ``{`` to the last ``}`` in ``response`` is
    treated as the JSON payload, so text around the object is ignored.

    Args:
        response: Raw text returned by the model.

    Returns:
        A ``(label, reasoning)`` tuple read from the ``"label"`` and
        ``"reasoning"`` keys; a missing key yields ``None`` in its slot.
        ``(None, None)`` is returned when no JSON object is found, when the
        payload cannot be decoded, or when ``response`` is not a string.
    """
    try:
        # Greedy match across newlines: first "{" through the last "}".
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match is None:
            # Callers unpack the result, so a 2-tuple must come back even
            # when the model produced no JSON at all.
            return None, None
        data = json.loads(json_match.group())
    except (TypeError, ValueError) as e:
        # TypeError: response is not str/bytes; ValueError covers
        # json.JSONDecodeError for malformed payloads.
        print(f"Error parsing JSON: {e}")
        return None, None
    return data.get('label'), data.get('reasoning')

In [11]:
def llm_certifier(data: str, base_prompt: str, model):
    """Ask ``model`` whether the sentence ``data`` is a prediction.

    The request combines ``base_prompt``, the sentence, the required JSON
    answer format and four labelled examples.

    Args:
        data: Sentence to classify.
        base_prompt: Background text inserted at the start of the request.
        model: Object providing ``user(prompt)`` to wrap the prompt as a
            message and ``chat_completion(messages)`` to obtain the answer.

    Returns:
        Tuple ``(raw_text_llm_generation, label, reasoning)``: the value
        returned by ``model.chat_completion`` followed by the label and
        reasoning that ``parse_json_response`` extracted from it.
    """
    # The continuation lines keep their original leading spaces because that
    # whitespace is part of the text sent to the model.
    prompt = f""" Given this: {base_prompt}. Also given the sentence '{data}', your task is to analyze the sentence and determine if it is a prediction. If prediction, generate label as 1 and if non-prediction generate label as 0.
        Respond ONLY with valid JSON in this exact format:
        {{"label": 0, "reasoning": "your explanation here"}}
        Examples:
        - "It will rain tomorrow." → {{"label": 1, "reasoning": "Contains the future tense words 'will' and 'tomorrow'"}}
        - "The stock market is expected to rise next quarter." → {{"label": 1, "reasoning": "Contains future tense words 'is expected'"}}
        - "I am going to the store." → {{"label": 0, "reasoning": "Does not contain a future tense word"}}
        - "Lakers will win the championship." → {{"label": 1, "reasoning": "Contains the future tense word 'will'"}}
        """
    input_prompt = model.user(prompt)
    raw_text_llm_generation = model.chat_completion([input_prompt])

    # Parse the JSON response
    label, reasoning = parse_json_response(raw_text_llm_generation)

    return raw_text_llm_generation, label, reasoning

In [12]:
# Banner line written to the cell output to mark the start of LLM labelling.
print("======= PROMPT + MODEL -> LABEL and REASONING =======")



In [13]:
# content : meta :: text : meta_data
# Every model is queried for every test sentence; each answer is stored as a
# (text, raw_response, llm_label, llm_reasoning, llm_name) tuple.
results = []
for idx, text in X_test_df['Base Sentence'].items():
    print('\t', idx, text)
    for model in models:
        raw_response, llm_label, llm_reasoning = llm_certifier(text, prompt_1, model)
        model_name = model.__name__()
        print('\t\t', model_name, '--- Label:', llm_label, '--- Reasoning:', llm_reasoning)
        results.append((text, raw_response, llm_label, llm_reasoning, model_name))


	 0 With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
		 llama-3.1-8b-instant --- Label: 1 --- Reasoning: Contains future tense words 'would increase', 'would improve', and 'therefore increase' which indicate a prediction about future events
		 llama-3.3-70b-versatile --- Label: 1 --- Reasoning: Contains the future tense words 'would' and implies a future outcome of increased capacity, improved use of raw materials, and increased production profitability
	 1 The American Heart Association predicts on November 1, 2029, the obesity rates at the national level may rise.
		 llama-3.1-8b-instant --- Label: 1 --- Reasoning: Contains the future tense word 'will' and a specific date 'November 1, 2029' indicating a prediction about the future
		 llama-3.3-70b-versatile --- Label: 1 --- Reasoning: Contains the future tense words 'predicts' a

In [14]:
# Show the collected (text, raw_response, llm_label, llm_reasoning, llm_name) tuples.
results

[('With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .',
  '{"label": 1, "reasoning": "Contains future tense words \'would increase\', \'would improve\', and \'therefore increase\' which indicate a prediction about future events"}',
  1,
  "Contains future tense words 'would increase', 'would improve', and 'therefore increase' which indicate a prediction about future events",
  'llama-3.1-8b-instant'),
 ('With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .',
  '{"label": 1, "reasoning": "Contains the future tense words \'would\' and implies a future outcome of increased capacity, improved use of raw materials, and increased production profitability"}',
  1,
  "Contains the future t

In [15]:
# Tabulate the result tuples: one row per (sentence, model) answer, in the
# order the answers were appended.
result_columns = ['text', 'raw_response', 'llm_label', 'llm_reasoning', 'llm_name']
results_with_llm_label_df = pd.DataFrame(results, columns=result_columns)
results_with_llm_label_df

Unnamed: 0,text,raw_response,llm_label,llm_reasoning,llm_name
0,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,"{""label"": 1, ""reasoning"": ""Contains future tense words 'would increase', 'would improve', and 'therefore increase' which indicate a prediction about future events""}",1,"Contains future tense words 'would increase', 'would improve', and 'therefore increase' which indicate a prediction about future events",llama-3.1-8b-instant
1,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,"{""label"": 1, ""reasoning"": ""Contains the future tense words 'would' and implies a future outcome of increased capacity, improved use of raw materials, and increased production profitability""}",1,"Contains the future tense words 'would' and implies a future outcome of increased capacity, improved use of raw materials, and increased production profitability",llama-3.3-70b-versatile
2,"The American Heart Association predicts on November 1, 2029, the obesity rates at the national level may rise.","{""label"": 1, ""reasoning"": ""Contains the future tense word 'will' and a specific date 'November 1, 2029' indicating a prediction about the future""}",1,"Contains the future tense word 'will' and a specific date 'November 1, 2029' indicating a prediction about the future",llama-3.1-8b-instant
3,"The American Heart Association predicts on November 1, 2029, the obesity rates at the national level may rise.","{""label"": 1, ""reasoning"": ""Contains the future tense words 'predicts' and 'may rise', and includes a specific future date 'November 1, 2029', indicating a statement about a future outcome""}",1,"Contains the future tense words 'predicts' and 'may rise', and includes a specific future date 'November 1, 2029', indicating a statement about a future outcome",llama-3.3-70b-versatile
4,"On 2025-06-01, Meteorologist Emily Chen speculates that the temperature at Dallas will likely increase.","{""label"": 1, ""reasoning"": ""Contains the future tense words 'will likely increase' and specifies a target (temperature at Dallas) and a date (2025-06-01)""}",1,Contains the future tense words 'will likely increase' and specifies a target (temperature at Dallas) and a date (2025-06-01),llama-3.1-8b-instant
5,"On 2025-06-01, Meteorologist Emily Chen speculates that the temperature at Dallas will likely increase.","{""label"": 1, ""reasoning"": ""Contains the future tense words 'will likely increase' and a specific date '2025-06-01', indicating a prediction about a future event""}",1,"Contains the future tense words 'will likely increase' and a specific date '2025-06-01', indicating a prediction about a future event",llama-3.3-70b-versatile
6,"In 08/2024, Coach Michael Brown envisions that the touchdown rate at the New England Patriots has some probability to remain stable.","{""label"": 1, ""reasoning"": ""Contains the future tense phrase 'envision that' and a specific date '08/2024' which indicates a prediction about the future.""}",1,Contains the future tense phrase 'envision that' and a specific date '08/2024' which indicates a prediction about the future.,llama-3.1-8b-instant
7,"In 08/2024, Coach Michael Brown envisions that the touchdown rate at the New England Patriots has some probability to remain stable.","{""label"": 1, ""reasoning"": ""Contains the future tense words 'envisions' and a specific date '08/2024', indicating a prediction about the future outcome of the touchdown rate at the New England Patriots""}",1,"Contains the future tense words 'envisions' and a specific date '08/2024', indicating a prediction about the future outcome of the touchdown rate at the New England Patriots",llama-3.3-70b-versatile
8,The World Health Organization forecasts that the prevalence of chronic illnesses at urban health centers in Africa will potentially decrease in Q2 2028.,"{""label"": 1, ""reasoning"": ""Contains the future tense word 'will', a target (prevalence of chronic illnesses), a date (Q2 2028), and an outcome (potentially decrease), which are all indicators of a prediction.""}",1,"Contains the future tense word 'will', a target (prevalence of chronic illnesses), a date (Q2 2028), and an outcome (potentially decrease), which are all indicators of a prediction.",llama-3.1-8b-instant
9,The World Health Organization forecasts that the prevalence of chronic illnesses at urban health centers in Africa will potentially decrease in Q2 2028.,"{""label"": 1, ""reasoning"": ""Contains the future tense words 'will' and 'forecasts', and a specific date 'Q2 2028', indicating a prediction about the future""}",1,"Contains the future tense words 'will' and 'forecasts', and a specific date 'Q2 2028', indicating a prediction about the future",llama-3.3-70b-versatile


In [16]:
def get_llm_labels(df, model_name):
    """Return the ``llm_label`` values of the rows produced by one model.

    Args:
        df: Frame with ``llm_name`` and ``llm_label`` columns.
        model_name: ``llm_name`` value whose rows are selected.

    Returns:
        The ``llm_label`` Series of the matching rows; the rows keep the
        index labels they had in ``df``.
    """
    return df.loc[df['llm_name'] == model_name, 'llm_label']

# Pull out each model's label column from the combined results frame and
# print both Series.
llama_instant_labels, llama_versatile_labels = (
    get_llm_labels(results_with_llm_label_df, llm_name)
    for llm_name in ('llama-3.1-8b-instant', 'llama-3.3-70b-versatile')
)
print(f"\tllama-3.1-8b-instant: {llama_instant_labels}")
print(f"\tllama-3.3-70b-versatile: {llama_versatile_labels}")

	llama-3.1-8b-instant: 0     1
2     1
4     1
6     1
8     1
10    0
12    1
14    1
16    1
18    1
20    0
22    0
24    0
26    1
28    0
30    1
32    1
34    1
36    1
38    1
40    1
Name: llm_label, dtype: int64
	llama-3.3-70b-versatile: 1     1
3     1
5     1
7     1
9     1
11    0
13    1
15    1
17    1
19    1
21    0
23    0
25    0
27    1
29    0
31    1
33    1
35    1
37    1
39    1
41    1
Name: llm_label, dtype: int64


In [17]:
# Side-by-side table: sentence, ground-truth label and each model's label.
# The per-model label Series carry the row index of the combined results
# frame (0, 2, 4, ... / 1, 3, 5, ...), so they are attached by position as
# NumPy arrays rather than aligned by index.
sentences_with_truth_df = pd.concat([X_test_df['Base Sentence'], y_test_df], axis=1)
sentences_with_truth_df.columns = ['Sentence', 'Actual Label']
model_predictions_df = sentences_with_truth_df.assign(**{
    'Instant': llama_instant_labels.to_numpy().ravel(),
    'Versatile Label': llama_versatile_labels.to_numpy().ravel(),
})
model_predictions_df

# print(f"{model_predictions_df}")

Unnamed: 0,Sentence,Actual Label,Instant,Versatile Label
0,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,1,1,1
1,"The American Heart Association predicts on November 1, 2029, the obesity rates at the national level may rise.",1,1,1
2,"On 2025-06-01, Meteorologist Emily Chen speculates that the temperature at Dallas will likely increase.",1,1,1
3,"In 08/2024, Coach Michael Brown envisions that the touchdown rate at the New England Patriots has some probability to remain stable.",1,1,1
4,The World Health Organization forecasts that the prevalence of chronic illnesses at urban health centers in Africa will potentially decrease in Q2 2028.,1,1,1
5,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",1,0,0
6,Business analyst John Lee forecasts that the stock prices at Tesla potentially decrease in Q3 of 2027.,1,1,1
7,"Dr. David Kim predicts on 21 August 2024, the average salary at Google may rise.",1,1,1
8,"On 08/20/2024, Analyst Kevin White speculates the win rate at the Los Angeles Rams will likely increase.",1,1,1
9,"The corporate tax rate in the healthcare sector should stay the same in 2026, according to policy analyst Emily Wong.",1,1,1


## Evaluation

In [18]:
# Banner line written to the cell output to mark the start of evaluation.
print("======= EVALUATION/RESULTS =======")



In [19]:
# EvaluationMetric (imported from classification_models) provides the
# eval_classification_report call used in the cells that follow.
get_metrics = EvaluationMetric()
get_metrics

<classification_models.EvaluationMetric at 0x32a8885d0>

In [20]:
# Report for the llama-3.1-8b-instant labels against the test labels.
# NOTE(review): argument order is presumably (true labels, predicted labels) -- confirm in EvaluationMetric.
metrics = get_metrics.eval_classification_report(y_test_df, llama_instant_labels)
metrics

              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      0.94      0.97        17

    accuracy                           0.95        21
   macro avg       0.90      0.97      0.93        21
weighted avg       0.96      0.95      0.95        21



In [21]:
# Report for the llama-3.3-70b-versatile labels against the test labels.
# This rebinds `metrics`, replacing the report from the previous cell.
# NOTE(review): argument order is presumably (true labels, predicted labels) -- confirm in EvaluationMetric.
metrics = get_metrics.eval_classification_report(y_test_df, llama_versatile_labels)
metrics

              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      0.94      0.97        17

    accuracy                           0.95        21
   macro avg       0.90      0.97      0.93        21
weighted avg       0.96      0.95      0.95        21

