Importing all the things that are used in this notebook.

In [None]:
from google.colab import drive

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from huggingface_hub import login
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import hf_hub_download
import pandas as pd

!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from tqdm import tqdm
tqdm.pandas()


Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

### Mounting Google drive, and huggingFace

Mount Google Drive so that the data can be stored there after each stage of the experiment.

Also authenticate with Hugging Face using an access token (created in your Hugging Face account settings) so that the dataset and models can be downloaded from the Hub. Note that you may have to accept the terms of a dataset or a model on its Hugging Face page before you can use it.

In [None]:
# Mount Google Drive at /content/drive; later cells write their result CSVs below this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Read the Hugging Face access token from the environment; os.environ.get yields None when the variable is unset.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")


# NOTE(review): with token=None, login appears to fall back to an interactive prompt (the saved output shows a widget) — confirm.
login(token=hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Download the model
You can switch to any `AutoModelForCausalLM`-compatible model on Hugging Face by changing `model_id`.

In [None]:

# Hub identifier of the causal language model that generate_response uses.
model_id = "Qwen/Qwen2.5-3B"

# NOTE(review): trust_remote_code=True allows code shipped in the model repository to run locally;
# confirm this is intended before pointing model_id at an arbitrary repository.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Weights are requested in float16 and device placement is delegated via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

### Download the dataset, some analysis, and find classification classes

In [None]:

# Define folder path
# NOTE(review): data_path is created here but not referenced by the other visible cells.
data_path = "/home/data"

# Create the folder if it doesn't exist
os.makedirs(data_path, exist_ok=True)


# All three splits live in the same dataset repository and subfolder on the Hub.
dataset_repo_id = "murathankurfali/ClimateEval"
dataset_subfolder = 'exeter/sub_claim'

test_file_path = hf_hub_download(repo_id=dataset_repo_id, subfolder=dataset_subfolder, filename="test.csv", repo_type="dataset")
train_file_path = hf_hub_download(repo_id=dataset_repo_id, subfolder=dataset_subfolder, filename="training.csv", repo_type="dataset")
val_file_path = hf_hub_download(repo_id=dataset_repo_id, subfolder=dataset_subfolder, filename="validation.csv", repo_type="dataset")

test_csv = pd.read_csv(test_file_path)
train_csv = pd.read_csv(train_file_path)
val_csv = pd.read_csv(val_file_path)

print(train_csv.columns)
print(train_csv['sub_claim'].value_counts())
print(train_csv['sub_claim_code'].value_counts())
# Show only a handful of (code, description) pairs; printing every training row floods the output.
print(list(zip(train_csv['sub_claim_code'].head(), train_csv['sub_claim'].head())))
print(f'len of codes are: {len(train_csv["sub_claim_code"])} and len of strings are: {len(train_csv["sub_claim"])}')
print(f'len of test is: {test_csv.shape[0]}')


# creating the sorted list of class labels
counts = train_csv["sub_claim_code"].value_counts().sort_index()

# First description seen for each code in the training split.
code_to_description = train_csv.drop_duplicates("sub_claim_code").set_index("sub_claim_code")["sub_claim"]

# (code, description) pairs ordered by code; the per-class counts themselves are not needed here.
combined = [(code, code_to_description[code]) for code in counts.index]

# "code: description" strings that are later listed in the classification prompt.
classes_strings = []
# Display sorted result
for code, desc in combined:
    print(f"{code}: {desc}")
    classes_strings.append(f"{code}: {desc}")

print(classes_strings)
# Codes in order of first appearance in the training split; used to parse the model's answer.
claim_codes = train_csv['sub_claim_code'].unique()
print(claim_codes)




test.csv:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

training.csv:   0%|          | 0.00/7.85M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/877k [00:00<?, ?B/s]

Index(['text', 'sub_claim_code', 'sub_claim'], dtype='object')
sub_claim
No claim                                                                                  16302
Climate-related science is unreliable/uncertain/unsound (data, methods & models)           1373
Climate movement is unreliable/alarmist/corrupt                                            1014
It’s natural cycles/variation                                                               788
Climate hasn’t warmed/changed over the last (few) decade(s)                                 483
Extreme weather isn’t increasing/has happened before/isn’t linked to climate change         427
There’s no evidence for greenhouse effect/carbon dioxide driving climate change             339
Species/plants/reefs aren’t showing climate impacts/are benefiting from climate change      337
Ice/permafrost/snow cover isn’t melting                                                     333
Climate policies (mitigation or adaptation) are harmful        

'\nfiles = ["train.csv", "validation.csv", "test.csv"]  # or whatever is present\n\nfor file in files:\n    r = requests.get( \'https://huggingface.co/datasets/murathankurfali/ClimateEval/tree/main/exeter/sub_claim/\'+ file)\n    with open(data_path+file, "wb") as f:\n        f.write(r.content)\n'

### Classify a given text

Given a text to classify (and, optionally, a list of examples), `classify_text` builds a prompt containing the examples, the list of classes, and the query text. It then calls `generate_response`, which uses the LLM to generate an answer, and looks for a class code in the generated response.

In [None]:
# Generate text
def generate_response(prompt, max_new_tokens=32):
    """Generate a continuation of ``prompt`` with the globally loaded model.

    Args:
        prompt: Text that is tokenized and fed to the model.
        max_new_tokens: Maximum number of tokens to generate. Defaults to 32,
            the value that was previously hard-coded.

    Returns:
        The decoded text of the newly generated tokens only; the prompt tokens
        are stripped and special tokens are skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the padding id explicitly so generate() does not print its
    # "Setting `pad_token_id` to `eos_token_id`" notice on every single call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=pad_token_id)

    # The returned sequence starts with the prompt tokens; keep only what follows them.
    input_length = inputs.input_ids.shape[-1]
    generated_tokens = outputs[0][input_length:]

    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response


def classify_text(text, examples = None):
    """Classify ``text`` into one of the known claim codes by prompting the LLM.

    Args:
        text: The passage to classify.
        examples: Optional list of already formatted example strings that are
            placed in front of the instruction. ``None`` produces a prompt
            without examples.

    Returns:
        The entry of ``claim_codes`` that occurs earliest in the model's
        response (case-insensitive), or ``"Unknown"`` if no code occurs in it.
    """
    categories = ', '.join(classes_strings)

    if examples is None:
        prompt = f"Classify the following text into one of these categories: {categories}.\nText: \"{text}\"\n Classification result is:"
    else:
        prompt = f"Given the examples: {', '.join(examples)}\n Classify the following text into one of these categories: {categories}.\nText: \"{text}\"\n Classification result is:"

    response_lower = generate_response(prompt).lower()

    predicted_code = "Unknown" # Default value
    best_key = None

    # Pick the code the model wrote first, not whichever code happens to come
    # first in claim_codes: the response may go on to mention further codes.
    for code in claim_codes:
        needle = str(code).lower()
        position = response_lower.find(needle)
        if position == -1:
            continue
        # At the same position prefer the longer code so that a code which is a
        # prefix of another one cannot shadow it.
        key = (position, -len(needle))
        if best_key is None or key < best_key:
            best_key = key
            predicted_code = code

    return predicted_code




# Quick smoke test: print the model's continuation for an arbitrary prompt.
print(generate_response('tell me the truth, is this prompt a test?'))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


 I'm sorry, but I'm not sure what you mean by "this prompt." Could you please provide more context or information about the prompt you're referring to


### Example prompting and metrics
`examplePromptCreator` turns a list of labelled examples into a list of prompt strings. The generated strings are then passed to the `classify_text` function.

`calculate_metrics` prints sklearn's `classification_report` for the true and predicted labels.

In [None]:
def examplePromptCreator(rawExamples):
    """Format labelled examples as prompt snippets.

    Args:
        rawExamples: Iterable of mappings that provide the keys ``text``,
            ``sub_claim_code`` and ``sub_claim``.

    Returns:
        A list with one formatted string per example, in input order.
    """
    return [
        f" Example text: \"{sample['text']}\" classification result is: {sample['sub_claim_code']}: {sample['sub_claim']} "
        for sample in rawExamples
    ]

def calculate_metrics(y_true, y_pred):
    """Print sklearn's classification report for the given labels.

    The report covers every label that occurs in either ``y_true`` or
    ``y_pred``. Deriving the row names from ``y_true`` alone breaks as soon as
    a prediction falls outside the gold labels (for example the ``"Unknown"``
    fallback), because the number of names no longer matches the number of
    labels found in the data.

    Args:
        y_true: Gold labels (e.g. a pandas Series of claim codes).
        y_pred: Predicted labels, aligned with ``y_true`` by position.
    """
    # Compare everything as strings so that a missing prediction (NaN) does not
    # make the label list unsortable.
    true_labels = pd.Series(y_true).astype(str)
    pred_labels = pd.Series(y_pred).astype(str)
    labels = sorted(set(true_labels) | set(pred_labels))

    # zero_division=0 reports 0.0 for classes without predictions or support
    # instead of emitting one warning per affected metric.
    print(classification_report(true_labels, pred_labels, labels=labels, zero_division=0))


# Performing three stages of experiment
Note that at the end of each stage the results are added to `test_csv` and saved to Google Drive. To resume the experiment from a later stage, uncomment the line that reads the CSV (in the last cell before that stage) and continue from there.

# First stage: Zero-shot

In [None]:

# Zero-shot: classify every test text without any examples in the prompt.
test_csv['predicted_sub_claim'] = test_csv['text'].progress_apply(classify_text)

# Save the predictions right away so that a failure in the reporting code below
# cannot discard the result of the long-running classification above.
zero_shot_results_path = "/content/drive/MyDrive/riseNLP/zeroShotTest.csv"
os.makedirs(os.path.dirname(zero_shot_results_path), exist_ok=True)
test_csv.to_csv(zero_shot_results_path, index=False)

print("Comparing actual and predicted sub_claims (first 5 rows):")
print(test_csv[['text', 'sub_claim', 'predicted_sub_claim']].head())



y_true = test_csv["sub_claim_code"]      # true labels column
y_pred = test_csv["predicted_sub_claim"]   # predicted labels column

calculate_metrics(y_true, y_pred)

  0%|          | 0/2904 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 2/2904 [00:01<26:07,  1.85it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 3/2904 [00:02<36:16,  1.33it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 4/2904 [00:02<37:18,  1.30it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 5/2904 [00:03<37:43,  1.28it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 6/2904 [00:04<43:26,  1.11it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 7/2904 [00:05<42:13,  1.14it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 8/2904 [00:06<44:15,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 9/2904 [00:08<52:06,  1

Comparing actual and predicted sub_claims (first 5 rows):
                                                text  \
0  The discussion is far from over. The true mind...   
1  NASA's James Hansen claims extreme weather eve...   
2  Figure 1. Lake Baikal paleoclimate record from...   
3  Nuccitellis answer to this unexceptionable and...   
4  It makes you wonder what created all that CO2 ...   

                                           sub_claim predicted_sub_claim  
0  Climate-related science is unreliable/uncertai...                 1_4  
1                                           No claim                 1_7  
2                                           No claim                 1_1  
3                      It’s natural cycles/variation                 1_1  
4                      It’s natural cycles/variation                 5_1  
              precision    recall  f1-score   support

         0_0       0.89      0.54      0.68      1754
         1_1       0.07      0.96      0.13   

In [None]:

#test_csv = pd.read_csv("/content/drive/MyDrive/riseNLP/zeroShotTest.csv")

# Second stage: Few shot

In [None]:
#FEW SHOT PROMPT

# One labelled example per class, taken from the TRAINING split. Drawing the
# examples from test_csv would put gold-labelled test rows into the prompt that
# is then used to classify those very rows (test-set leakage).
# Despite the "random" naming, this is the first training row of each class.
unique_examples = train_csv.dropna(subset=["text"]).drop_duplicates(subset="sub_claim_code")

# Records with exactly the keys examplePromptCreator reads.
examples = unique_examples[["text", "sub_claim", "sub_claim_code"]].to_dict("records")

random_few_shot_examples = examplePromptCreator(examples)

def classify_text_with_extra_shots_random(text):
    """Classify ``text`` with the fixed per-class few-shot examples in the prompt."""
    return classify_text(text, examples = random_few_shot_examples)


print("Now testing with random few shots!")
test_csv['predicted_sub_claim_random'] = test_csv['text'].progress_apply(classify_text_with_extra_shots_random)

# Make sure the target folder exists before writing the result of the long run.
random_shots_results_path = "/content/drive/MyDrive/riseNLP/testwithResultsRandomShots.csv"
os.makedirs(os.path.dirname(random_shots_results_path), exist_ok=True)
test_csv.to_csv(random_shots_results_path, index=False)


Now testing with random few shots!


  0%|          | 0/2904 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 2/2904 [00:02<1:07:11,  1.39s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 3/2904 [00:05<1:25:24,  1.77s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 4/2904 [00:07<1:37:12,  2.01s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 5/2904 [00:10<1:52:28,  2.33s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 6/2904 [00:13<1:58:54,  2.46s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 7/2904 [00:15<1:54:17,  2.37s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 8/2904 [00:17<1:53:42,  2.36s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 9/2904 [0

In [None]:
#test_csv = pd.read_csv("/content/drive/MyDrive/riseNLP/testwithResultsRandomShots.csv")

# Third stage: RAG

In [None]:
# Number of retrieved training examples placed in each RAG prompt.
rag_top_k_number = 5

# Sentence encoder used for both the training texts and the queries.
text_encoder_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every training text once; row i of text_embeddings belongs to the i-th training text.
train_texts = train_csv["text"].tolist()
train_embeddings_tensor = text_encoder_model.encode(train_texts, convert_to_tensor=True)
text_embeddings = train_embeddings_tensor.cpu().numpy()

def retrieve_top_k_classes(query_text, k=5):
    """Return the ``k`` training rows whose text is most similar to ``query_text``.

    Similarity is the cosine similarity between the query embedding and the
    precomputed ``text_embeddings``.

    Args:
        query_text: Text to find neighbours for.
        k: Number of rows to return. Values <= 0 yield an empty list.

    Returns:
        A list of dicts with the keys ``text``, ``sub_claim_code``,
        ``sub_claim`` and ``similarity_score``, most similar first.
    """
    # Slicing with [-0:] would select every row, so handle non-positive k explicitly.
    if k <= 0:
        return []

    query_embedding = text_encoder_model.encode([query_text], convert_to_tensor=True).cpu().numpy()

    similarity_scores = cosine_similarity(query_embedding, text_embeddings)[0]

    # Indices of the k highest scores, best first.
    top_k_indices = np.argsort(similarity_scores)[::-1][:k]

    results = []
    for idx in top_k_indices:
        # argsort yields row POSITIONS in text_embeddings, so look the row up by
        # position (.iloc) rather than by index label (.loc).
        row = train_csv.iloc[idx]
        results.append({
            "text": row["text"],
            "sub_claim_code": row["sub_claim_code"],
            'sub_claim': row["sub_claim"],
            "similarity_score": similarity_scores[idx]
        })
    return results



def classify_text_with_extra_shots_rag(text):
    """Classify ``text`` using its nearest training examples as few-shot examples.

    The ``rag_top_k_number`` most similar training rows are retrieved, rendered
    as example strings and handed to ``classify_text``.
    """
    retrieved_rows = retrieve_top_k_classes(text, k=rag_top_k_number)
    return classify_text(text, examples = examplePromptCreator(retrieved_rows))



# Rows without text cannot be embedded; drop them and take an explicit copy so
# the column assignment below operates on an independent frame.
test_csv = test_csv.dropna(subset=["text"]).copy()

print("Now testing with rag few shots!")
test_csv['predicted_sub_claim_rag'] = test_csv['text'].progress_apply(classify_text_with_extra_shots_rag)

# Make sure the target folder exists before writing the result of the long run.
rag_results_path = "/content/drive/MyDrive/riseNLP/testwithResultsRag.csv"
os.makedirs(os.path.dirname(rag_results_path), exist_ok=True)
test_csv.to_csv(rag_results_path, index=False)


# Example usage of retrieve_top_k_classes, kept as comments: a bare string
# literal at the end of the cell would be echoed as the cell's output.
#
# query = "What is the future of renewable energy?"
# top_matches = retrieve_top_k_classes(query, k=5)
#
# # Display the results (ranks start at 1)
# for i, match in enumerate(top_matches, 1):
#     print(f"\nTop {i}")
#     print("Matched Text:", match["text"])
#     print("Text Class:", match["sub_claim_code"])
#     print("Claim:", match["sub_claim"])
#     print(f"Similarity Score: {match['similarity_score']:.4f}")




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Now testing with rag few shots!


  0%|          | 0/2898 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 2/2898 [00:01<37:16,  1.30it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 3/2898 [00:02<46:34,  1.04it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 4/2898 [00:03<47:40,  1.01it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 5/2898 [00:04<48:53,  1.01s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 6/2898 [00:05<49:13,  1.02s/it]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 7/2898 [00:06<42:51,  1.12it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 8/2898 [00:07<45:52,  1.05it/s]Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
  0%|          | 9/2898 [00:09<52:53,  1

'\n# Example usage\nquery = "What is the future of renewable energy?"\ntop_matches = retrieve_top_k_classes(query, k=5)\n\n# Display the results\nfor i, match in enumerate(top_matches, 3):\n    print(f"\nTop {i}")\n    print("Matched Text:", match["text"])\n    print("Text Class:", match["sub_claim_code"])\n    print("Claim:", match["sub_claim"])\n    print(f"Similarity Score: {match[\'similarity_score\']:.4f}")\n\n'

In [None]:
#test_csv = pd.read_csv("/content/drive/MyDrive/riseNLP/testwithResultsRag.csv")

# Results
With the predictions of every stage saved in their corresponding columns of `test_csv`, we compare the results of the three stages below.

In [None]:
# Each entry pairs the heading to print with the test_csv column holding that stage's predictions.
result_sections = [
    ("Results of zero shot!--------------------------------------", "predicted_sub_claim"),
    ("Results of few random shot!--------------------------------------", "predicted_sub_claim_random"),
    ("Results of few shots with RAG!--------------------------------------", "predicted_sub_claim_rag"),
]

for section_title, prediction_column in result_sections:
    print(section_title)
    y_true = test_csv["sub_claim_code"]      # true labels column
    y_pred = test_csv[prediction_column]     # predicted labels column
    calculate_metrics(y_true, y_pred)

Results of zero shot!--------------------------------------
              precision    recall  f1-score   support

         0_0       0.90      0.54      0.68      1753
         1_1       0.07      0.96      0.13        51
         1_2       0.50      0.14      0.22        21
         1_3       0.14      0.60      0.23        30
         1_4       0.15      0.44      0.23        68
         1_6       0.33      0.04      0.07        26
         1_7       0.00      0.00      0.00        64
         2_1       0.54      0.10      0.18       124
         2_3       0.00      0.00      0.00        48
         3_1       0.41      0.27      0.33        26
         3_2       0.00      0.00      0.00        49
         3_3       0.00      0.00      0.00        46
         4_1       0.15      0.78      0.25        64
         4_2       0.20      0.06      0.09        34
         4_4       0.00      0.00      0.00        39
         4_5       0.00      0.00      0.00        36
         5_1       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
