In [56]:
from datasets import load_dataset_builder, load_dataset, get_dataset_infos, get_dataset_config_names, list_datasets
from langchain.prompts import load_prompt
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.chat_models import ChatOpenAI
from operator import itemgetter
from sklearn.model_selection import train_test_split
import yaml
import pandas as pd
import numpy as np

In [57]:
# list_datasets()

# Data

In [92]:
def load_dataset_by_name(name):
    """Load ``data/{name}_data.csv`` as a text/label classification dataset.

    The first CSV column is renamed to ``text`` and the second to ``class``;
    any further columns keep their original names. Rows with a missing value
    in any column are dropped.

    Args:
        name: Dataset name used to build the path ``data/{name}_data.csv``.

    Returns:
        A DataFrame whose first two columns are named ``text`` and ``class``.

    Raises:
        ValueError: If the CSV has fewer than two columns.
    """
    df = pd.read_csv(f"data/{name}_data.csv")
    if df.shape[1] < 2:
        raise ValueError(
            f"data/{name}_data.csv must have at least two columns, found {df.shape[1]}"
        )
    # Assign a fresh column index instead of writing into `df.columns.values`:
    # that mutates the Index's backing array in place, which pandas does not
    # support (label lookups can go stale, and the array may be read-only).
    df.columns = ["text", "class", *df.columns[2:]]
    return df.dropna()

In [133]:
def split_datasets(df_dict, test_size=0.2, random_state=42):
    """Split every dataset in ``df_dict`` into a train and a test part.

    Args:
        df_dict: Mapping of dataset name to the DataFrame to split.
        test_size: Forwarded to ``train_test_split`` as ``test_size``
            (previously this argument was ignored and 0.2 was always used).
        random_state: Forwarded to ``train_test_split`` as ``random_state``;
            the default of 42 matches the value that used to be hard-coded.

    Returns:
        A ``(train, test)`` tuple of dicts, each keyed by the same dataset
        names as ``df_dict``.
    """
    df_dict_train = {}
    df_dict_test = {}

    for dataset, df in df_dict.items():
        df_train, df_test = train_test_split(
            df, test_size=test_size, random_state=random_state
        )
        df_dict_train[dataset] = df_train
        df_dict_test[dataset] = df_test
    return df_dict_train, df_dict_test

def construct_example_strings(df):
    """Render every row of every dataset as a ``text[class]`` example string.

    Args:
        df: Despite the name, a mapping of dataset name to a DataFrame that
            has ``text`` and ``class`` columns.

    Returns:
        A dict with the same keys, each mapped to a list with one formatted
        string per row, in row order.
    """
    def _format_row(row):
        # The label goes in square brackets directly after the text.
        return f"{row['text']}[{row['class']}]"

    return {
        key: frame.apply(_format_row, axis=1).tolist()
        for key, frame in df.items()
    }

In [134]:
# Names of the datasets to load; load_dataset_by_name turns each into a DataFrame.
my_datasets = [
    "lowercase",
    "imdb_numbers",
    "imdb_digits",
    "backpack",
]
df_dict = dict(zip(my_datasets, map(load_dataset_by_name, my_datasets)))

# Hold out a test part per dataset; only the train part is turned into
# few-shot example strings.
dict_train, dict_test = split_datasets(df_dict)
dict_examples = construct_example_strings(dict_train)

# Model

In [147]:
def get_examples(x):
    """Build the few-shot examples text for the dataset named in ``x``.

    Args:
        x: Chain input dict. ``x['dataset']`` selects the entry of the
            module-level ``dict_examples``; the optional ``x['n_examples']``
            limits the output to the first that many examples.

    Returns:
        The selected example strings joined with newlines, as one string.

    Raises:
        KeyError: If ``x`` has no ``'dataset'`` key or the dataset is not in
            ``dict_examples``.
    """
    examples = dict_examples[x['dataset']]

    n_examples = x.get('n_examples')
    if n_examples is not None:
        examples = examples[:n_examples]

    # Return the joined text. The join result used to be discarded, so the
    # raw list was returned instead.
    return "\n".join(examples)

In [148]:
# model
# Short aliases for the OpenAI chat model identifiers used in this notebook.
model_types = {"gpt-4": "gpt-4-1106-preview", "gpt-3":"gpt-3.5-turbo"}
# temperature=0 makes the classifications as repeatable as the API allows, so
# re-running the evaluation cells gives comparable results.
model = ChatOpenAI(model=model_types['gpt-4'], temperature=0)

In [154]:
# prompt
# Path of the prompt definition file handed to load_prompt (YAML, going by the extension).
template_path = "templates/classification_2.yaml"
# NOTE(review): the chain below passes `input`, `dataset` and `examples` to this
# template — confirm the file declares matching input variables.
prompt_template = load_prompt(template_path)

In [155]:
# Stage 1: keep the incoming dict and add an `examples` entry computed by get_examples.
_with_examples = RunnablePassthrough.assign(examples=RunnableLambda(get_examples))

# Stages 2-4: fill the prompt, call the chat model, reduce the reply to a plain string.
chain = _with_examples | prompt_template | model | StrOutputParser()

In [156]:
# Evaluate on the first n_test rows of one dataset's held-out split.
test_dataset_name = "backpack"
n_test = 20
test = dict_test[test_dataset_name].head(n_test)



In [157]:
# input = "\n".join([f"{i+1}. {sentence}" for i, sentence in enumerate(test['text'].tolist())])
# def classify_many():
#     # input = "\n".join([f"{i+1}. {sentence}" for i, sentence in enumerate(test['text'].tolist())])
#     inputs = {"input": input, "dataset": test_dataset_name, "n_examples": 100}
#     pred = chain.invoke(inputs)
#     return pred

# pred = classify_many()

# def classify_one(sentence):
#     inputs = {"input": sentence, "dataset": test_dataset_name, "n_examples": 100}
#     pred = chain.invoke(inputs)
#     return pred

In [158]:
# Classify each held-out row with the chain, one call per row, and collect
# (text, ground truth, prediction) triples.
results = []
for text, ground_truth in zip(test['text'], test['class']):
    inputs = {"input": text, "dataset": test_dataset_name}
    pred = chain.invoke(inputs)
    results.append((text, ground_truth, pred))

    # One block per example, followed by an empty line.
    print(text, f"predicted: {pred}", f"actual: {ground_truth}", "", sep="\n")

results_df = pd.DataFrame(results, columns=["text", "ground_truth", "prediction"])

Sweatshirt, reusable water bottle, granola bars, multitool, lighter
predicted: False
actual:  "True"

Mittens, scarf, beanie, thermos, chocolate bar
predicted: "True"
actual:  "True"

Chalk, climbing shoes, harness, carabiners, belay device
predicted: True
actual:  "False"

Cycling jersey, helmet, sunglasses, tire repair kit, energy bar
predicted: True
actual:  "True"

Beanie, thermal flask, trail map, compass, snack
predicted: "True"
actual:  "True"

Board games, dice, rulebook, scorepad, pencils
predicted: False
actual:  "False"

Telescope, star chart, notebook, pen, thermos
predicted: False
actual:  "False"

Bandana, flashlight, pocket knife, fire starter, energy bars
predicted: True
actual:  "True"

Magic tricks, deck of cards, rabbit, top hat, wand
predicted: False
actual:  "False"

Rain jacket, portable charger, earphones, diary, pen
predicted: "True"
actual:  "True"



In [158]:
# Show the table of (text, ground_truth, prediction) rows collected by the evaluation loop.
# NOTE(review): the saved output under this cell shows different sentences and a
# different prediction format than the loop output above, and both cells carry the
# same execution count — it looks like a stale result from another run; confirm
# with Restart Kernel -> Run All.
results_df

Unnamed: 0,text,ground_truth,prediction
0,Tea tastes BETTER with cookies.,False,Classification: True\n\nExplanation: The sente...
1,Travel broadens the MIND.,False,Classification: True\n\nExplanation: The sente...
2,Children LOVE to play games.,False,Classification: True\n\nExplanation: The sente...
3,LIFE IS full OF SURPRISES.,False,Classification: True\n\nExplanation: This sent...
4,LEARNING TO COOK can be fun.,False,Classification: True\n\nExplanation: The sente...
5,Good COMMUNICATION skills are crucial.,False,Classification: False\n\nExplanation: The sent...
6,Learning new languages is INTERESTING.,False,Classification: False\n\nExplanation: The sent...
7,Healthy EATING IS important for WELL-BEING.,False,Classification: True\n\nExplanation: The sente...
8,i enjoy cooking new recipes.,True,Classification: True\n\nExplanation: The sente...
9,Photography is an ART.,False,Classification: False\n\nExplanation: The sent...
