In [None]:
!pip install -q accelerate bitsandbytes rich transformers --progress-bar off

In [None]:
import torch
from transformers import pipeline
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig

In [None]:
import pickle

In [None]:
# Run identifier; the path-configuration cell uses it to name the output directory and the log file.
experiment_name = "2024.05.09-01_Zephyr-7b-beta_4bit_zero-shot"

In [None]:
from google.colab import drive
from IPython.display import Image, display

# Google Drive mount point and the project data folder beneath it.
mount_point = "/content/gdrive"
base_path = f"{mount_point}/MyDrive/Colab/SDG/data"

# Spreadsheet with the texts to classify (test split).
input_data_path = f"{base_path}/input/sdg_17_labels_classification_dataset_1020_texts_TEST_2023.12.11.xlsx"
#input_data_path_train = base_path + "/input/sdg_17_labels_classification_dataset_4760_texts_TRAIN_2023.12.11.xlsx"

#top7_train_data_sim_to_each_test_data_filename = base_path + "/input/top7_train_data_sim_to_each_test_data_but_with_different_classes_with_2023.12.11_input_data.pickle"

# Per-experiment destinations: a directory for the result pickles and one log file.
output_data_path = f"{base_path}/../Mistral/data/output/{experiment_name}"
log_filename = f"{base_path}/../Mistral/logs/{experiment_name}.log"

# Mount (or re-mount) Google Drive so the paths above become reachable.
drive.mount(mount_point, force_remount=True)

Mounted at /content/gdrive


In [None]:
# Hugging Face Hub id of the checkpoint under evaluation.
pretrained_model_name_or_path = "HuggingFaceH4/zephyr-7b-beta"

# bitsandbytes settings: 4-bit NF4 weights, double quantization, bfloat16 compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Load the causal LM with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path,
    quantization_config=quantization_config,
)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# @title
import pandas as pd
data = pd.read_excel(input_data_path)

## Run the model

In [None]:
def generate_query_prompt(query: str) -> str:
  """Wrap a user query in the chat markup sent to the model.

  The returned string consists of a fixed system turn ("You are a text
  classifier."), a user turn containing `query` followed by a period, and
  an opening assistant turn left empty for the model to complete.
  """
  system_turn = "<|system|>\nYou are a text classifier.</s>\n"
  user_turn = f"<|user|>\n{query}.</s>\n"
  assistant_turn = "<|assistant|>\n"
  return system_turn + user_turn + assistant_turn

In [None]:
# Select the whole dataset; the single-example cell that follows reads its first row.
data_sel = data

In [None]:
# Build and display the prompt for one example: the first row of the selection.
first_row = data_sel.iloc[0]
text_id = first_row.text_id
# Prompt text: newlines removed, curly braces turned into parentheses.
text = first_row.text.replace('\n','').replace('{','(').replace('}',')')
sdg_true = first_row.sdg

# `text` no longer contains newlines at this point, so it is printed as is.
print("EXPECTED_SDG:", sdg_true, "\nTEXT:", text, '\n\n')

# Zero-shot instruction: the label dictionary, the output constraints and the input text.
template = f"""Classify the following input text within triple quotes according to the following Sustainable Development Goals (SDGs) dictionary of labels:

"SDG-1": "End poverty in all its forms everywhere."
"SDG-2": "End hunger, achieve food security and improved nutrition and promote sustainable agriculture."
"SDG-3": "Ensure healthy lives and promote well-being for all at all ages."
"SDG-4": "Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all."
"SDG-5": "Achieve gender equality and empower all women and girls."
"SDG-6": "Ensure availability and sustainable management of water and sanitation for all."
"SDG-7": "Ensure access to affordable, reliable, sustainable and modern energy for all."
"SDG-8": "Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all."
"SDG-9": "Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation."
"SDG-10": "Reduce inequality within and among countries."
"SDG-11": "Make cities and human settlements inclusive, safe, resilient and sustainable."
"SDG-12": "Ensure sustainable consumption and production patterns."
"SDG-13": "Take urgent action to combat climate change and its impacts."
"SDG-14": "Conserve and sustainably use the oceans, seas and marine resources for sustainable development."
"SDG-15": "Protect, restore and promote sustainable use of terrestrial ecosystems, sustainably manage forests, combat desertification, halt and reverse land degradation, and halt biodiversity loss."
"SDG-16": "Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels."
"SDG-0": "Other."

Choose ONLY ONE label for each input text.

DO NOT inlcude the input text in your answer.

The input text is:
'''
{text}
'''
"""

prompt = generate_query_prompt(query=template)
print(prompt)

EXPECTED_SDG: 0 
TEXT: L train resumes service after derailing from track in Canarsie, Brooklyn  WABC-TVService restored after subway train derails in Brooklyn: officials  PIX11 New York NewsNo injuries in Brooklyn subway derailment  New York Daily NewsView Full Coverage on Google News. 


<|system|>
You are a text classifier.</s>
<|user|>
Classify the following input text within triple quotes according to the following Sustainable Development Goals (SDGs) dictionary of labels:

"SDG-1": "End poverty in all its forms everywhere."
"SDG-2": "End hunger, achieve food security and improved nutrition and promote sustainable agriculture."
"SDG-3": "Ensure healthy lives and promote well-being for all at all ages."
"SDG-4": "Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all."
"SDG-5": "Achieve gender equality and empower all women and girls."
"SDG-6": "Ensure availability and sustainable management of water and sanitation for all."
"SDG-7": "E

In [None]:
# Generate the answer for the example prompt with greedy decoding.
# `do_sample=False` requests greedy decoding explicitly; the previous
# `temperature=0.0` is not a valid sampling temperature and only had no effect
# because sampling was already off. `return_full_text=False` keeps only the
# newly generated text (the prompt is not echoed back).
output = pipe(text_inputs=prompt, max_new_tokens=1024, do_sample=False, return_full_text=False)
output



[{'generated_text': 'SDG-0: "Other."\n\nThe input text does not directly relate to any of the SDGs listed. Therefore, the appropriate label is "Other."'}]

In [None]:
# Restrict the run to the rows from position 785 onward.
# NOTE(review): presumably this resumes an interrupted run (one pickle is written per row) — confirm.
data_sel = data.iloc[785:]

In [None]:
# Classify every selected row: build the zero-shot prompt, query the model,
# echo the outcome to stdout and to the log file, and pickle one result tuple
# (cnt, text_id, sdg_true, output) per row so that the run can be inspected or
# continued later.
results = []

for index, row in data_sel.iterrows():

  # The DataFrame index label identifies this row in the log and in the pickle file name.
  cnt = index

  text_id = row['text_id']
  # Prompt text: newlines removed, curly braces turned into parentheses.
  text = row["text"].replace('\n','').replace('{','(').replace('}',')')
  sdg_true = row["sdg"]

  template = f"""Classify the following input text within triple quotes according to the following Sustainable Development Goals (SDGs) dictionary of labels:

"SDG-1": "End poverty in all its forms everywhere."
"SDG-2": "End hunger, achieve food security and improved nutrition and promote sustainable agriculture."
"SDG-3": "Ensure healthy lives and promote well-being for all at all ages."
"SDG-4": "Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all."
"SDG-5": "Achieve gender equality and empower all women and girls."
"SDG-6": "Ensure availability and sustainable management of water and sanitation for all."
"SDG-7": "Ensure access to affordable, reliable, sustainable and modern energy for all."
"SDG-8": "Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all."
"SDG-9": "Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation."
"SDG-10": "Reduce inequality within and among countries."
"SDG-11": "Make cities and human settlements inclusive, safe, resilient and sustainable."
"SDG-12": "Ensure sustainable consumption and production patterns."
"SDG-13": "Take urgent action to combat climate change and its impacts."
"SDG-14": "Conserve and sustainably use the oceans, seas and marine resources for sustainable development."
"SDG-15": "Protect, restore and promote sustainable use of terrestrial ecosystems, sustainably manage forests, combat desertification, halt and reverse land degradation, and halt biodiversity loss."
"SDG-16": "Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels."
"SDG-0": "Other."

Choose ONLY ONE label for each input text.

DO NOT inlcude the input text in your answer.

The input text is:
'''
{text}
'''
"""

  prompt = generate_query_prompt(query=template)

  # The log is opened in append mode once per row through a context manager,
  # so the handle is closed (and the entry flushed) even if generation fails.
  with open(log_filename, 'a+', encoding='utf-8') as log_file:

    print("\n\n##################################################")
    log_file.write("\n\n##################################################\n")

    # The log shows the original text with newlines removed (braces untouched).
    print("EXPECTED_SDG:", row["sdg"], "TEXT:", row["text"].replace('\n',''),'\n\n')
    log_file.write(f"EXPECTED_SDG: {row['sdg']} TEXT: " + row['text'].replace('\n','') + "\n\n\n")

    # Greedy decoding, requested explicitly with `do_sample=False`; the previous
    # `temperature=0.0` is not a valid sampling temperature and only had no
    # effect because sampling was already off.
    output = pipe(text_inputs=prompt, max_new_tokens=1024, do_sample=False, return_full_text=False)

    result_extended = (cnt, text_id, sdg_true, output)
    # Keep the in-memory list in step with what is written to disk.
    results.append(result_extended)

    print(result_extended)
    log_file.write(str(result_extended)+'\n')

  # One pickle per row, named after the row counter.
  with open(output_data_path+f'/sdg_classification_{cnt}.pkl', 'wb') as f:
      pickle.dump(result_extended, f)




##################################################
EXPECTED_SDG: 13 TEXT: The Conference of the Parties has mandated the Subsidiary Body for Science and Technological Advice (SBSTA) to develop modalities by November 2018 for accounting of financial resources provided and mobilised through public interventions (Decision 1/CP.21, paragraph 58). In the coming years SBSTA and the Ad Hoc Working Group on the Paris Agreement will develop the modalities, procedures and guidelines to enable greater transparency of both climate action and support. This will include tracking the USD 100 billion / year commitment and the additional resources that will be needed to fulfil the ambition of the Paris Agreement. 


(785, 2312, 13, [{'generated_text': 'The input text relates to SDG-13, which is "Take urgent action to combat climate change and its impacts." The text mentions the development of modalities for accounting of financial resources provided and mobilized through public interventions, which i

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


(791, 4001, 13, [{'generated_text': '"SDG-13": "Take urgent action to combat climate change and its impacts."'}])


##################################################
EXPECTED_SDG: 13 TEXT: In general, as economic development levels increase and/or the domestic financial systems mature, finance tends to be delivered through multilateral channels. However, a range of other factors also affect delivery of climate finance, such as geopolitical interests and historical relationship with finance providers. Given the data present a snapshot of 2013 and 2014, a few large-scale projects committed during the period could distort the overall picture. For instance, Armenia (e.g. infrastructure in energy, water and agriculture sectors supported by Germany) and Uzbekistan (e.g. large-scale power plants supported by Japan) receive a large portion of finance through bilateral channels. 


(792, 2228, 13, [{'generated_text': 'The input text relates to SDG-7 (Ensure access to affordable, reliable, sust

In [None]:
# Point the selection back at the full dataset for the evaluation cells that follow.
data_sel = data #.iloc[:3]

In [None]:
import glob

# Collect every result pickle found in the experiment's output directory.
files = glob.glob(output_data_path + "/*.pkl")

# CAUTION: unpickling can execute arbitrary code; only load files written by this notebook.
results = [pd.read_pickle(pickle_path) for pickle_path in files]

# glob returns paths in arbitrary order. Sorting by the stored row counter
# (first tuple element) makes the loaded list, and any later "keep first"
# de-duplication, reproducible across runs.
results.sort(key=lambda res: res[0])

len(results)

1020

In [None]:
# Tabulate the loaded result tuples, one row per tuple, naming the four fields.
result_columns = ['cnt', 'text_id', 'sdg_true', 'result']
results_processed = pd.DataFrame(data=results, columns=result_columns)
results_processed.head(3)

Unnamed: 0,cnt,text_id,sdg_true,result
0,0,6668,0,"[{'generated_text': 'SDG-0: ""Other."" The inpu..."
1,1,6099,0,"[{'generated_text': 'SDG-0: ""Other."" The inp..."
2,2,6471,0,[{'generated_text': 'SDG-16: Promote peaceful ...


In [None]:
# Inspect the raw pipeline output stored for the first result row.
results_processed.iloc[0].result

[{'generated_text': 'SDG-0: "Other."\n\nThe input text does not directly relate to any of the SDGs listed. Therefore, the appropriate label is "Other."'}]

In [None]:
#results_processed.iloc[0].result.split("Answer with label only usign the format:---LABEL---")[1]
# results_processed['result_short'] = results_processed['result'].apply(lambda x: x.split("Answer with label only usign the format:---LABEL---")[1])

In [None]:
# results_processed['result']

In [None]:
# Each `result` is the pipeline output: a list whose first element is a dict
# holding the generated string under 'generated_text'.
generated_texts = results_processed['result'].map(lambda res: res[0]['generated_text'])

# The predicted label is the run of digits in the first "SDG-<n>" mention;
# answers without such a mention yield NaN. One vectorized extraction replaces
# the per-row construction of a temporary Series.
results_processed['label'] = generated_texts.str.extract(r'SDG-(\d+)', expand=False)

results_processed.head(3)

Unnamed: 0,cnt,text_id,sdg_true,result,label
0,0,6668,0,"[{'generated_text': 'SDG-0: ""Other."" The inpu...",0
1,1,6099,0,"[{'generated_text': 'SDG-0: ""Other."" The inp...",0
2,2,6471,0,[{'generated_text': 'SDG-16: Promote peaceful ...,16


In [None]:
# Join the dataset rows with the model results that share the same text_id
# (default inner join, so only texts present on both sides are kept).
final = pd.merge(data_sel, results_processed, on='text_id')
final.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text_id,doi,text,sdg,labels_negative,labels_positive,agreement,text_length,cnt,sdg_true,result,label
0,0,0,6668,,L train resumes service after derailing from t...,0,,,,38,0,0,"[{'generated_text': 'SDG-0: ""Other."" The inpu...",0
1,1,1,6099,,Brooklyn Decker’s SI Swimsuit Cover Photo Shoo...,0,,,,15,1,0,"[{'generated_text': 'SDG-0: ""Other."" The inp...",0
2,2,2,6471,,Mumbai: The Bombay High Court has suspended th...,0,,,,125,2,0,[{'generated_text': 'SDG-16: Promote peaceful ...,16


In [None]:
# text_ids of the rows whose label is missing.
missing_label_mask = final['label'].isna()
text_ids_with_errors = final.loc[missing_label_mask, 'text_id'].to_list()
# Show each affected text_id once.
list(set(text_ids_with_errors))

[2191]

In [None]:
# Keep only the rows that have a label.
final = final[final['label'].notna()]

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Gold labels and predictions as integer arrays (labels are cast with int()).
y_true = np.array(list(map(int, final.sdg)))
y_pred = np.array(list(map(int, final.label)))

# Sanity check: both arrays must have the same number of samples.
print(len(y_true))
print(len(y_pred))
#target_names = ['sdg 0', 'sdg 1', 'sdg 2',  'sdg 3']

# Per-class precision / recall / F1 on the rows that have a label.
report_text = classification_report(y_true, y_pred) #, target_names=target_names)
print(report_text)

1019
1019
              precision    recall  f1-score   support

           0       0.35      0.80      0.49        59
           1       0.62      0.83      0.71        60
           2       0.88      0.60      0.71        60
           3       0.74      0.75      0.74        60
           4       0.67      0.83      0.74        60
           5       0.83      0.90      0.86        60
           6       0.96      0.78      0.86        60
           7       0.81      0.80      0.81        60
           8       0.40      0.55      0.46        60
           9       0.57      0.55      0.56        60
          10       0.57      0.27      0.36        60
          11       0.85      0.48      0.62        60
          12       0.83      0.75      0.79        60
          13       0.67      0.77      0.71        60
          14       0.95      0.67      0.78        60
          15       0.88      0.82      0.84        60
          16       0.68      0.32      0.43        60
          17     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# classification_report_filename = output_data_path+f'/{experiment_name}_classification_report.xlsx'
# classification_report_filename

In [None]:
# report=classification_report(y_true, y_pred, output_dict=True)
# report_df = pd.DataFrame(report).transpose()
# report_df.to_excel(classification_report_filename)

In [None]:
# test = pd.read_excel(classification_report_filename)
# test

In [None]:
# Bind the merged frame to the name used by the correction cell below.
# This is the same DataFrame object as `final`, not a copy.
results_processed_merged = final

In [None]:
# Corrections for the cases in which the model returned duplicates, an answer
# from which no class can be derived, or a class outside the expected set.

import random

# When a text_id occurs more than once, always keep the first occurrence.
results_processed_merged = results_processed_merged.drop_duplicates(subset='text_id', keep='first')

# Valid class labels, as the strings "0" .. "16".
sdg_list = [str(x) for x in range(17)]

# A label that is missing (NaN) or not in `sdg_list` is replaced by class 0.
# (Earlier drafts considered drawing a random class different from the true
# one; the rule actually applied is the fixed fallback to class 0.)
# The replacement is a single vectorized assignment instead of writing into the
# frame while iterating over it, and it uses the string "0" so the column does
# not mix integers with the string labels listed in `sdg_list`.
invalid_label = ~results_processed_merged['label'].isin(sdg_list)
results_processed_merged.loc[invalid_label, 'label'] = "0"

In [None]:
# Preview the frame after the label corrections.
results_processed_merged.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text_id,doi,text,sdg,labels_negative,labels_positive,agreement,text_length,cnt,sdg_true,result,label
0,0,0,6668,,L train resumes service after derailing from t...,0,,,,38,0,0,"[{'generated_text': 'SDG-0: ""Other."" The inpu...",0
1,1,1,6099,,Brooklyn Decker’s SI Swimsuit Cover Photo Shoo...,0,,,,15,1,0,"[{'generated_text': 'SDG-0: ""Other."" The inp...",0
2,2,2,6471,,Mumbai: The Bombay High Court has suspended th...,0,,,,125,2,0,[{'generated_text': 'SDG-16: Promote peaceful ...,16


In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Gold labels and corrected predictions as integer arrays (labels are cast with int()).
y_true = np.array(list(map(int, results_processed_merged.sdg)))
y_pred = np.array(list(map(int, results_processed_merged.label)))

# Sanity check: both arrays must have the same number of samples.
print(len(y_true))
print(len(y_pred))
#target_names = ['sdg 0', 'sdg 1', 'sdg 2',  'sdg 3']

# Per-class precision / recall / F1 after the label corrections.
report_text = classification_report(y_true, y_pred) #, target_names=target_names)
print(report_text)

1019
1019
              precision    recall  f1-score   support

           0       0.35      0.80      0.48        59
           1       0.62      0.83      0.71        60
           2       0.88      0.60      0.71        60
           3       0.74      0.75      0.74        60
           4       0.67      0.83      0.74        60
           5       0.83      0.90      0.86        60
           6       0.96      0.78      0.86        60
           7       0.81      0.80      0.81        60
           8       0.40      0.55      0.46        60
           9       0.57      0.55      0.56        60
          10       0.57      0.27      0.36        60
          11       0.85      0.48      0.62        60
          12       0.83      0.75      0.79        60
          13       0.67      0.77      0.71        60
          14       0.95      0.67      0.78        60
          15       0.88      0.82      0.84        60
          16       0.68      0.32      0.43        60

    accuracy    

In [None]:
# Second name for output_data_path; the report-export cell builds its file name from it.
output_datapath = output_data_path

In [None]:
# Destination of the Excel report inside the experiment's output directory.
classification_report_filename = f'{output_datapath}/{experiment_name}_classification_report.xlsx'

# Metrics as a table: one row per report entry, one column per metric.
report = classification_report(y_true, y_pred, output_dict=True)
report_df = pd.DataFrame(report).T

# Express precision, recall and F1 in percent; `support` is left untouched.
metric_columns = ['precision', 'recall', 'f1-score']
report_df[metric_columns] = report_df[metric_columns] * 100

report_df.to_excel(classification_report_filename)

In [None]:
# Display the report table (metric columns are in percent).
report_df

Unnamed: 0,precision,recall,f1-score,support
0,34.558824,79.661017,48.205128,59.0
1,61.728395,83.333333,70.921986,60.0
2,87.804878,60.0,71.287129,60.0
3,73.770492,75.0,74.380165,60.0
4,66.666667,83.333333,74.074074,60.0
5,83.076923,90.0,86.4,60.0
6,95.918367,78.333333,86.238532,60.0
7,81.355932,80.0,80.672269,60.0
8,39.759036,55.0,46.153846,60.0
9,56.896552,55.0,55.932203,60.0


In [None]:
# Headline figures taken from the percent-scaled report table.
# Accuracy is read from the 'precision' column of the 'accuracy' row.
accuracy = report_df.loc['accuracy', 'precision']

# The remaining figures are the macro averages.
macro_avg = report_df.loc['macro avg']
precision = macro_avg['precision']
recall = macro_avg['recall']
f1_score = macro_avg['f1-score']

In [None]:
# Printed order: macro precision, macro recall, accuracy, macro F1.
print(precision, recall, accuracy, f1_score)

72.0334281465287 67.43104021269525 67.4190382728165 67.57013199478938


In [None]:
# Same figures in the same order, rounded to one decimal place.
print(round(precision, 1), round(recall, 1), round(accuracy, 1), round(f1_score, 1))

72.0 67.4 67.4 67.6
