Importing all the things that are used in this notebook.

In [None]:
from google.colab import drive

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from huggingface_hub import login
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import hf_hub_download
import pandas as pd

!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from tqdm import tqdm
tqdm.pandas()


### Mounting Google Drive and logging in to Hugging Face

Mount Google Drive so that the data can be stored after each stage of the experiment.

Also authenticate with Hugging Face using an access token (created in your Hugging Face account settings) so that datasets and models can be downloaded. Note that you may have to accept the terms of a dataset or model on its Hugging Face page before you can use it.

In [None]:
# Mount Google Drive at /content/drive; the stage cells below write their result CSVs under /content/drive/MyDrive/riseNLP/.
drive.mount('/content/drive')

In [None]:

# Read the Hugging Face access token from the environment; hf_token is None when the variable is unset.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")


# NOTE(review): with token=None, login() presumably falls back to an interactive prompt — confirm against the huggingface_hub docs.
login(token=hf_token)

### Download the model
You can switch to any other causal language model on the Hugging Face Hub that loads with `AutoModelForCausalLM` by changing `model_id`.

In [None]:

# Hub identifier of the causal LM used in every stage of the experiment; change it to evaluate another model.
model_id = "Qwen/Qwen2.5-3B"

# NOTE(review): trust_remote_code=True permits code shipped in the model repository to run locally —
# confirm it is actually required for the chosen model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # device placement is left to the loader
    torch_dtype=torch.float16,  # weights are loaded in half precision
    trust_remote_code=True
)


### Download the dataset, some analysis, and find classification classes

In [None]:

# Define folder path
# NOTE(review): nothing visible in this notebook reads from or writes to data_path — confirm it is still needed.
data_path = "/home/data"

# Create the folder if it doesn't exist
os.makedirs(data_path, exist_ok=True)


def _download_split(filename):
    """Download one CSV of the exeter/sub_claim subset of ClimateEval and return its local path."""
    return hf_hub_download(
        repo_id="murathankurfali/ClimateEval",
        subfolder='exeter/sub_claim',
        filename=filename,
        repo_type="dataset",
    )


test_file_path = _download_split("test.csv")
train_file_path = _download_split("training.csv")
val_file_path = _download_split("validation.csv")

test_csv = pd.read_csv(test_file_path)
train_csv = pd.read_csv(train_file_path)
val_csv = pd.read_csv(val_file_path)

print(train_csv.columns)
print(train_csv['sub_claim'].value_counts())
print(train_csv['sub_claim_code'].value_counts())
# Show every distinct (code, description) pair once; printing one tuple per training row floods the output.
print(list(train_csv[['sub_claim_code', 'sub_claim']].drop_duplicates().itertuples(index=False, name=None)))
print(f'len of codes are: {len(train_csv["sub_claim_code"])} and len of strings are: {len(train_csv["sub_claim"])}')
print(f'len of test is: {test_csv.shape[0]}')


# creating the sorted list of class labels
counts = train_csv["sub_claim_code"].value_counts().sort_index()

# Description attached to the first training row seen for each code
code_to_description = train_csv.drop_duplicates("sub_claim_code").set_index("sub_claim_code")["sub_claim"]

combined = [(code, code_to_description[code]) for code in counts.index]

# "code: description" strings, sorted by code; these are listed as the categories in every prompt
classes_strings = [f"{code}: {desc}" for code, desc in combined]
# Display sorted result
print("\n".join(classes_strings))

print(classes_strings)
claim_codes = train_csv['sub_claim_code'].unique()
print(claim_codes)




### Classify a given text

Given optional examples, the list of classes, and a query text, `classify_text` builds the prompt, calls `generate_response` (which uses the LLM to generate the response), and looks for a class code in the generated response.

In [None]:
# Generate text
def generate_response(prompt, max_new_tokens=32):
    """Generate a continuation of ``prompt`` with the loaded causal LM.

    Args:
        prompt: Text passed to the tokenizer and the model.
        max_new_tokens: Upper bound on the number of generated tokens. Defaults
            to 32, the value that used to be hard-coded.

    Returns:
        The decoded continuation only (the prompt tokens are removed), with
        special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Request greedy decoding explicitly: otherwise the decoding mode comes from the
    # model's own generation config, and a sampling default would make the
    # classification results differ from run to run.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # The returned sequence starts with the prompt tokens; keep only what was generated after them.
    prompt_length = inputs.input_ids.shape[-1]
    generated_tokens = outputs[0][prompt_length:]

    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


def classify_text(text, examples = None):
    """Classify ``text`` into one of the known sub-claim codes using the LLM.

    Builds a prompt that lists ``classes_strings`` (optionally preceded by
    few-shot ``examples``), generates a response with ``generate_response`` and
    extracts a class code from it.

    Args:
        text: The text to classify.
        examples: Optional list of example strings placed before the task
            description. ``None`` produces the zero-shot prompt.

    Returns:
        The entry of ``claim_codes`` that appears EARLIEST in the response
        (case-insensitive), or ``"Unknown"`` when no code appears at all.
    """
    categories = ', '.join(classes_strings)

    if examples is None:
        prompt = f"Classify the following text into one of these categories: {categories}.\nText: \"{text}\"\n Classification result is:"
    else:
        prompt = f"Given the examples: {', '.join(examples)}\n Classify the following text into one of these categories: {categories}.\nText: \"{text}\"\n Classification result is:"

    response = generate_response(prompt).lower()

    predicted_code = "Unknown" # Default value
    best_position = len(response)
    best_length = 0

    # A base LM frequently keeps writing after its answer and mentions further codes.
    # Choosing the code by its position in the response (not by the iteration order of
    # claim_codes) returns what the model answered first; on a tie the longer code wins
    # so that a code is not shadowed by a shorter code that is its prefix.
    for code in claim_codes:
        needle = str(code).lower()
        if not needle:
            continue
        position = response.find(needle)
        if position == -1:
            continue
        if position < best_position or (position == best_position and len(needle) > best_length):
            predicted_code = code
            best_position = position
            best_length = len(needle)

    return predicted_code




# Smoke test: run generate_response once on a free-form prompt and show the raw continuation.
print(generate_response('tell me the truth, is this prompt a test?'))

### Example prompting and metrics
`examplePromptCreator` turns a list of examples into a list of prompt strings. The generated strings are then passed to `classify_text`.

`calculate_metrics` prints sklearn's `classification_report` for the given true and predicted labels.

In [None]:
def examplePromptCreator(rawExamples):
    """Render every example record as one few-shot prompt fragment.

    Args:
        rawExamples: Iterable of mappings that provide the keys ``text``,
            ``sub_claim_code`` and ``sub_claim``; any other keys are ignored.

    Returns:
        A list with one formatted string per record, in input order.
    """
    return [
        f" Example text: \"{record['text']}\" classification result is: {record['sub_claim_code']}: {record['sub_claim']} "
        for record in rawExamples
    ]

def calculate_metrics(y_true, y_pred):
    """Print sklearn's classification report over the gold classes.

    The report is restricted to the classes that occur in ``y_true``. Passing
    them through ``labels`` keeps the call valid when ``y_pred`` contains values
    that never occur in ``y_true`` (for example the "Unknown" fallback of the
    classifier): with only ``target_names`` given, sklearn raises a ValueError
    as soon as the number of distinct labels differs from the number of names.
    Such predictions still count as errors for the gold class of their row.

    Args:
        y_true: Gold labels (pandas Series or any sequence).
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The report is printed.
    """
    gold_labels = sorted(pd.Series(y_true).unique())
    # zero_division=0: a gold class that was never predicted gets precision 0 without a warning per class.
    print(classification_report(y_true, y_pred, labels=gold_labels, zero_division=0))


# Performing three stages of experiment
Note that at the end of each stage the results are added to `test_csv` and saved to Google Drive. To resume the experiment from a later stage, uncomment the line that reads the corresponding CSV (in the last cell before that stage) and continue from there.

# First stage: Zero-shot

In [None]:

# Create the results folder BEFORE the long inference run, so that a missing directory
# fails immediately and not after all predictions have been computed.
zero_shot_results_path = "/content/drive/MyDrive/riseNLP/zeroShotTest.csv"
os.makedirs(os.path.dirname(zero_shot_results_path), exist_ok=True)

test_csv['predicted_sub_claim'] = test_csv['text'].progress_apply(classify_text)

# Persist the predictions first: an error while printing or scoring must not discard them.
test_csv.to_csv(zero_shot_results_path, index=False)

print("Comparing actual and predicted sub_claims (first 5 rows):")
print(test_csv[['text', 'sub_claim', 'predicted_sub_claim']].head())



y_true = test_csv["sub_claim_code"]      # true labels column
y_pred = test_csv["predicted_sub_claim"]   # predicted labels column

calculate_metrics(y_true, y_pred)

In [None]:

#test_csv = pd.read_csv("/content/drive/MyDrive/riseNLP/zeroShotTest.csv")

# Second stage: Few shot

In [None]:
#FEW SHOT PROMPT

# The demonstrations are drawn from the TRAINING split. Taking them from test_csv would put
# test texts together with their gold labels into the very prompt that is used to classify
# those rows, which inflates the few-shot scores.
FEW_SHOT_SEED = 42  # fixed seed so the "random" demonstrations are the same on every run

unique_examples = (
    train_csv
    .dropna(subset=["text"])                        # never show an empty text as a demonstration
    .sample(frac=1, random_state=FEW_SHOT_SEED)     # shuffle, so the row kept per class is random
    .drop_duplicates(subset="sub_claim_code")       # keep exactly one example per class
    .sort_values("sub_claim_code")                  # stable order of the demonstrations in the prompt
)

# One record per class with exactly the keys examplePromptCreator reads
examples = unique_examples[["text", "sub_claim", "sub_claim_code"]].to_dict("records")

random_few_shot_examples = examplePromptCreator(examples)

def classify_text_with_extra_shots_random(text):
    """Classify ``text`` with the shared ``random_few_shot_examples`` as demonstrations.

    Returns whatever ``classify_text`` returns for that prompt.
    """
    return classify_text(text, examples=random_few_shot_examples)


# Create the results folder BEFORE the long inference run, so that a missing directory
# fails immediately and not after all predictions have been computed.
random_shots_results_path = "/content/drive/MyDrive/riseNLP/testwithResultsRandomShots.csv"
os.makedirs(os.path.dirname(random_shots_results_path), exist_ok=True)

print("Now testing with random few shots!")
test_csv['predicted_sub_claim_random'] = test_csv['text'].progress_apply(classify_text_with_extra_shots_random)

test_csv.to_csv(random_shots_results_path, index=False)


In [None]:
#test_csv = pd.read_csv("/content/drive/MyDrive/riseNLP/testwithResultsRandomShots.csv")

# Third stage: RAG

In [None]:
# Number of retrieved training examples placed in the prompt for each query
rag_top_k_number= 5


# Sentence encoder used for the training texts here and for the query text in retrieve_top_k_classes
text_encoder_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed all training texts once up front and keep them as a NumPy array.
# NOTE(review): row i presumably corresponds to the i-th row of train_csv — confirm the encoder preserves input order.
text_embeddings = text_encoder_model.encode(train_csv["text"].tolist(), convert_to_tensor=True).cpu().numpy()

def retrieve_top_k_classes(query_text, k=5):
    """Return the ``k`` training examples most similar to ``query_text``.

    The query is embedded with ``text_encoder_model`` and compared by cosine
    similarity against the precomputed ``text_embeddings``.

    Args:
        query_text: Text to find similar training examples for.
        k: Number of examples to return. Values <= 0 yield an empty list.

    Returns:
        A list of dicts ordered from most to least similar, each with the keys
        ``text``, ``sub_claim_code``, ``sub_claim`` and ``similarity_score``.
    """
    # A slice of [-0:] selects the WHOLE array, so non-positive k must be handled explicitly.
    if k <= 0:
        return []

    query_embedding = text_encoder_model.encode([query_text], convert_to_tensor=True).cpu().numpy()

    similarity_scores = cosine_similarity(query_embedding, text_embeddings)[0]

    # argsort is ascending: take the last k positions and reverse them (best match first)
    top_k_positions = np.argsort(similarity_scores)[-k:][::-1]

    results = []
    for position in top_k_positions:
        # argsort yields row POSITIONS of text_embeddings, so train_csv is read positionally
        # with .iloc; .loc would look the number up as an index LABEL and break (or pick the
        # wrong row) as soon as train_csv does not carry the default 0..n-1 index.
        results.append({
            "text": train_csv["text"].iloc[position],
            "sub_claim_code": train_csv["sub_claim_code"].iloc[position],
            'sub_claim': train_csv["sub_claim"].iloc[position],
            "similarity_score": similarity_scores[position]
        })
    return results



def classify_text_with_extra_shots_rag(text):
    """Classify ``text`` using retrieved training examples as demonstrations.

    The ``rag_top_k_number`` most similar training examples are fetched with
    ``retrieve_top_k_classes``, rendered by ``examplePromptCreator`` and passed
    to ``classify_text``, whose result is returned.
    """
    retrieved = retrieve_top_k_classes(text, k=rag_top_k_number)
    return classify_text(text, examples=examplePromptCreator(retrieved))



# .copy() makes the filtered frame independent, so the column assignment below cannot
# trigger pandas' SettingWithCopyWarning.
test_csv = test_csv.dropna(subset=["text"]).copy()

# Create the results folder BEFORE the long inference run, so that a missing directory
# fails immediately and not after all predictions have been computed.
rag_results_path = "/content/drive/MyDrive/riseNLP/testwithResultsRag.csv"
os.makedirs(os.path.dirname(rag_results_path), exist_ok=True)

print("Now testing with rag few shots!")
test_csv['predicted_sub_claim_rag'] = test_csv['text'].progress_apply(classify_text_with_extra_shots_rag)

test_csv.to_csv(rag_results_path, index=False)


# Example usage of the retriever. Kept as comments: as a bare triple-quoted string at the end
# of the cell it was echoed as the cell's output.
#
# query = "What is the future of renewable energy?"
# top_matches = retrieve_top_k_classes(query, k=5)
#
# # Display the results (ranks start at 1)
# for i, match in enumerate(top_matches, 1):
#     print(f"\nTop {i}")
#     print("Matched Text:", match["text"])
#     print("Text Class:", match["sub_claim_code"])
#     print("Claim:", match["sub_claim"])
#     print(f"Similarity Score: {match['similarity_score']:.4f}")




In [None]:
#test_csv = pd.read_csv("/content/drive/MyDrive/riseNLP/testwithResultsRag.csv")

# Results
Now that `test_csv` holds the predictions of every stage in their own columns, the cell below reports the metrics of all three stages for comparison.

In [None]:
# One (banner, prediction column) pair per stage, in the order the stages were run
stage_reports = (
    ("Results of zero shot!--------------------------------------", "predicted_sub_claim"),
    ("Results of few random shot!--------------------------------------", "predicted_sub_claim_random"),
    ("Results of few shots with RAG!--------------------------------------", "predicted_sub_claim_rag"),
)

for banner, prediction_column in stage_reports:
    print(banner)
    y_true = test_csv["sub_claim_code"]      # true labels column
    y_pred = test_csv[prediction_column]     # predicted labels column of this stage
    calculate_metrics(y_true, y_pred)