In [1]:
import pandas as pd
import numpy as np
from ruamel.yaml import YAML
from langchain.prompts import PromptTemplate
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

from utils.loader import llm, emb_fn, strings
from utils.utils import prepare_data, set_all_seeds, combine_gradient_prompt, save_predictions
from tqdm import tqdm

In [2]:
# Seed for the whole run, kept in one named constant so it can be changed in a
# single place. It is handed to the project helper `set_all_seeds`
# (imported from utils.utils; its implementation is not shown here).
SEED = 42
set_all_seeds(SEED)

### Data

In [3]:
# Data directory, relative to the project root instead of a user-specific
# absolute path. Assumes the kernel's working directory is the project root,
# as the later cell that reads 'data/spam_100.csv' already does — TODO confirm.
DATA_DIR = 'data'

# Project helper from utils.utils (implementation not shown here); presumably
# it derives the 100- and 1000-row subsets from the full CSV — verify against
# its definition.
prepare_data('spam', f'{DATA_DIR}/spam.csv', [100, 1000])

# Load the 100-row file and drop the three unnamed columns it carries.
df = pd.read_csv(f'{DATA_DIR}/spam_100.csv').drop(
    columns=[f'Unnamed: {i}' for i in range(2, 5)]
)
# `v2` is the column whose values are sent to the LLM as the message text.
messages = df['v2']

### Model

In [4]:
# Wrap the project's prediction prompt (taken from `strings`) so that each
# message can be substituted into its `message` placeholder.
template = strings['templates']['prediction_template']
template = PromptTemplate(template=template, input_variables=['message'])

# Collect one raw LLM answer per message, in the same order as `messages`.
# Nothing is written to `df` inside the loop: assigning a scalar to
# `df['predictions']` here would broadcast the label of the current message to
# every row. The column is built from `preds` once the loop has finished.
preds = []
for message in tqdm(messages):
    prediction = llm.invoke(template.format(message=message)).content
    preds.append(prediction)

100%|██████████| 100/100 [00:53<00:00,  1.86it/s]


In [5]:
# Encode the raw LLM answers: exactly 'No' -> 0 (ham), anything else -> 1 (spam).
# The column is rebuilt from `preds` every time, so re-running the cell always
# yields the same values.
df['predictions'] = [0 if answer == 'No' else 1 for answer in preds]

# Encode the true labels: 'ham' -> 0, anything else -> 1.
# Guarded by a dtype check so that a re-run does not compare the already
# encoded 0/1 values against 'ham' and turn every label into 1.
if not pd.api.types.is_numeric_dtype(df['v1']):
    df['v1'] = (df['v1'] != 'ham').astype('int64')

In [3]:
# Reload the 100-row sample without its three unnamed columns.
df = pd.read_csv('data/spam_100.csv').drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
)

# Attach the previously saved predictions as the `preds` column.
# NOTE(review): the directory is spelled 'predistions' — confirm that this
# matches the folder name on disk.
preds_df = pd.read_csv('data/predistions/first_predictions.csv')
df['preds'] = preds_df['predictions']

# Binary target: 1 for 'spam', 0 for every other label.
df['v1'] = df['v1'].eq('spam').astype('int64')

In [5]:
# Inspect the frame: message text (`v2`), encoded true label (`v1`) and the
# predictions loaded from disk (`preds`).
df

Unnamed: 0,v1,v2,preds
0,0,"Funny fact Nobody teaches volcanoes 2 erupt, t...",0
1,0,I sent my scores to sophas and i had to do sec...,0
2,1,We know someone who you know that fancies you....,1
3,0,Only if you promise your getting out as SOON a...,0
4,1,Congratulations ur awarded either еЈ500 of CD ...,1
...,...,...,...
95,0,Mum say we wan to go then go... Then she can s...,0
96,0,"Did you say bold, then torch later. Or one tor...",0
97,0,Not yet chikku..wat abt u?,0
98,0,What was she looking for?,0


In [4]:
# Build the text block via the project helper `combine_gradient_prompt`
# (utils.utils, implementation not shown here) from the true labels,
# predictions and message text, then print it.
# NOTE(review): the captured output is headed "examples where models answer is
# incorrect" yet lists rows with True == Predicted, each of them twice —
# verify the filtering inside the helper.
gradient_prompt = combine_gradient_prompt(
    df=df,
    true_col='v1',
    pred_col='preds',
    text_col='v2',
)
print(gradient_prompt)

There are examples where models answer is incorrect:
0) Funny fact Nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife Natural disasters just happens
True: ham
Predicted: ham
------------------------------
0) Funny fact Nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife Natural disasters just happens
True: ham
Predicted: ham
------------------------------
1) I sent my scores to sophas and i had to do secondary application for a few schools. I think if you are thinking of applying, do a research on cost also. Contact joke ogunrinde, her school is one me the less expensive ones
True: ham
Predicted: ham
------------------------------
1) I sent my scores to sophas and i had to do secondary application for a few schools. I think if you are thinking of applying, do a research on cost also. Contact joke ogunrinde, her school is one me the less expensive ones
True: ham
Predicted

### Metrics

In [66]:
# The frame in scope can carry the model output under either name: the
# prediction cell writes 'predictions', while the cell that reloads saved
# predictions writes 'preds'. Resolve the column that is actually present so
# the cell does not fail with a KeyError when run top to bottom.
pred_col = 'preds' if 'preds' in df.columns else 'predictions'
y_true, y_pred = df['v1'], df[pred_col]

# Binary classification metrics; both columns are expected to be 0/1 encoded
# by the preceding cells.
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
print(f'Accuracy = {accuracy:.2f}\nRecall = {recall:.2f}\nPrecision = {precision:.2f}\nF1 = {f1:.2f}')

Accuracy = 0.82
Recall = 1.00
Precision = 0.40
F1 = 0.57
