In [34]:
!pip install -q transformers==4.31.0 xformers einops accelerate==0.21.0 langchain bitsandbytes==0.40.2 sentencepiece peft==0.4.0 trl==0.4.7 codecarbon

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
import huggingface_hub
hf_token = "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"
huggingface_hub.login(token=hf_token, add_to_git_credential=True)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [36]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import warnings
warnings.filterwarnings('ignore')
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [37]:
model = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

In [38]:
pipeline = transformers.pipeline(
    "text-generation", #task
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=400,
    do_sample=True,
    top_k=10,

    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [39]:
llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})

In [40]:
from langchain import PromptTemplate, LLMChain

template = """ <s> [INST] <<SYS>>
You are a Bengali text annotator. <</SYS>>
Annotate {text} into neutral, negative, or positive.
[/INST] </s>
"""
prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)


In [41]:
text = "চট্টগ্রাম নগরের জলাবদ্ধতা নিরসনে চলমান চারটি প্রকল্পের কাজ নির্ধারিত সময়ে শেষ হচ্ছে না। "
print(llm_chain(text))

{'text': 'Based on the given text, I would annotate the sentiment of চট্টগ্রাম নগরের জলাবদ্ধতা নিরসনে চলমান চারটি প্রকল্পের কাজ নির্ধারিত সময়ে শেষ হচ್ছে না as follows:\n* Neutral: না (na) - The sentence does not convey any sentiment, it simply states a fact that the condition of the slum in Chottogram is not improving.\nSo, the sentiment of the given sentence is Neutral.'}


In [42]:
text = "মালয়েশিয়ার রাজধানী কুয়ালালামপুরের উপকণ্ঠে মহাসড়কে একটি ছোট আকারের ব্যক্তিগত বিমান বিধ্বস্ত হয়ে ১০ জন নিহত হয়েছেন। স্থানীয় সময় গতকাল বৃহস্পতিবার বিকেলে এ দুর্ঘটনা ঘটে।|"
print(llm_chain(text))

{'text': 'Based on the information provided in the text, I would annotate the tone of the passage as NEUTRAL.\nThe passage simply reports on a factual event without expressing any emotion or opinion. The use of neutral words such as "there were," "had," "were killed," "happened," and "had died" convey a sense of objectivity and detachment, indicating a neutral tone. Additionally, the lack of any emotive language or sensationalism suggests that the author is simply presenting the facts as they are, without attempting to evoke any particular emotional response from the reader.'}


In [43]:
text = "১৯৩১ সালের ১৮ আগস্ট যুক্তরাষ্ট্রের নিউ জার্সি অঙ্গরাজ্যের হেনরি বোসেনবার্গ গোলাপের নতুন একটি জাত আবিষ্কারের জন্য একটি পেটেন্ট পান। "
print(llm_chain(text))

{'text': 'As a Bengali text annotator, I would annotate the text "১৯৩১ সালের ১৮ আগস্ট যুক্তরাষ্ট্রের নিউ জার্সি অঙ্গরাজ্যের হেনরি বোসেনবার্গ গোলাপের নতুন একটি জাত আবিষ্কারের জন্য একটি পেটেন্�'}


In [44]:
text = "অমায়িক ব্যবহার এই রেস্টুরেন্টের স্টাফদের"
print(llm_chain(text))

{'text': 'As a Bengali text annotator, I have analyzed the sentence "অমায়িক ব্যবহার এই রেস্টুরেন্টের স্টাফডের" and classified it into the following categories:\nNeutral:\n* অমায়িক (Amayik) - This word has a neutral connotation, meaning "ordinary" or "usual".\nNegative:\n* ব্যবহার (Bhvbhahar) - This word has a negative connotation, meaning "unnecessary" or "wasteful".\nPositive:\n* রেস্টুরেন্টের (Resturer) - This word has a positive connotation, meaning "restorer" or "healer".\nSo, the overall sentiment of the sentence is neutral.'}


In [45]:
text = "কুস্টিয়ার কোথায় ? সেটা বলেন"
print(llm_chain(text))

{'text': 'As a Bengali text annotator, I would annotate কুস্টিয়ার কোথায় (Kustiya koyta) as having a negative sentiment. The word কুস্টিয়া (kustiya) means "pain" or "sorrow" in Bengali, and the phrase কোথায় (koyta) means "in a condition of" or "with". Therefore, the phrase কুস্টিয়া কোথায় can be interpreted as "in a state of pain" or "with pain". This sentiment is negative, as it implies a state of suffering or discomfort.'}


In [46]:
text = "আমার টুরিস্ট ভিসা রোগীর সাথে যেতে পারবো তাকে দেখাশুনা করার জন্য"
print(llm_chain(text))

{'text': 'As a Bengali text annotator, I would annotate the given sentence as follows:\nContext: The sentence is a statement of intention or purpose, expressing the speaker\'s desire to be with their loved one despite any obstacles or challenges.\nNeutral: "আমার টুরিস্ট ভিসা রোগীর সাথে যেতে পারবো তাকে দেখাশুনা করার জন্য" can be neutral in tone, indicating that the speaker is simply sharing their intentions or plans without any particular emotion or tone. For example, if someone were to say this in a casual conversation, it could be neutral.\nNegative: "আমার টুরিস্ট ভিসা রোগির সাথে যেতে পারবো তাকে দেখাশুنা করার জন্য" could have a negative tone if the speaker is expressing their reluctance or'}


In [47]:
def llama2_sentnob(text):
    return {'llama2_output' : llm_chain(text)}

In [48]:
from datasets import load_dataset
data_files = {'train': 'Train.csv','test':"Test.csv", 'validation':'Val.csv'}
dataset = load_dataset('khondoker/SentNoB',data_files = data_files)
dataset

Downloading and preparing dataset csv/khondoker--SentNoB to /root/.cache/huggingface/datasets/csv/khondoker--SentNoB-18186d812f709a50/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.65M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/334k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/346k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/khondoker--SentNoB-18186d812f709a50/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Data', 'Label'],
        num_rows: 12575
    })
    test: Dataset({
        features: ['Data', 'Label'],
        num_rows: 1586
    })
    validation: Dataset({
        features: ['Data', 'Label'],
        num_rows: 1567
    })
})

In [49]:
def label_to_sentiment(example):
    if example['Label']==0:
        return {'sentiment':'neutral'}
    elif example ['Label']==1:
        return {'sentiment' :'positive'}
    else:
        return {'sentiment':'negative'}

In [50]:
dataset = dataset.map(label_to_sentiment)
dataset

  0%|          | 0/12575 [00:00<?, ?ex/s]

  0%|          | 0/1586 [00:00<?, ?ex/s]

  0%|          | 0/1567 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['Data', 'Label', 'sentiment'],
        num_rows: 12575
    })
    test: Dataset({
        features: ['Data', 'Label', 'sentiment'],
        num_rows: 1586
    })
    validation: Dataset({
        features: ['Data', 'Label', 'sentiment'],
        num_rows: 1567
    })
})

In [51]:
sample_test_dataset = dataset['test'].select(range(500))
sample_test_dataset

Dataset({
    features: ['Data', 'Label', 'sentiment'],
    num_rows: 500
})

In [52]:
!pip install -q codecarbon
from codecarbon import EmissionsTracker
tracker = EmissionsTracker()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[codecarbon INFO @ 08:37:32] [setup] RAM Tracking...
[codecarbon INFO @ 08:37:32] [setup] GPU Tracking...
[codecarbon INFO @ 08:37:32] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 08:37:32] [setup] CPU Tracking...
[codecarbon INFO @ 08:37:33] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 08:37:33] >>> Tracker's metadata:
[codecarbon INFO @ 08:37:33]   Platform system: Linux-5.15.120+-x86_64-with-glibc2.35
[codecarbon INFO @ 08:37:33]   Python version: 3.10.12
[codecarbon INFO @ 08:37:33]   CodeCarbon version: 2.3.1
[codecarbon INFO @ 08:37:33]   Available RAM : 15.631 GB
[codecarbon INFO @ 08:37:33]   CPU count: 2
[codecarbon INFO @ 08:37:33]   CPU model: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 08:37:33]   GPU count: 2
[codecarbon INFO @ 08:37:33]   GPU model: 2 x Tesla T4


In [53]:
%%time
tracker.start()
sample_test_dataset = sample_test_dataset.map(llama2_sentnob)
sample_test_dataset
co2_emission = tracker.stop()

  0%|          | 0/500 [00:00<?, ?ex/s]

[codecarbon INFO @ 08:37:51] Energy consumed for RAM : 0.000024 kWh. RAM Power : 5.8618011474609375 W
[codecarbon INFO @ 08:37:51] Energy consumed for all GPUs : 0.000509 kWh. Total GPU Power : 122.06345193920093 W
[codecarbon INFO @ 08:37:51] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 08:37:51] 0.000711 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:38:06] Energy consumed for RAM : 0.000049 kWh. RAM Power : 5.8618011474609375 W
[codecarbon INFO @ 08:38:06] Energy consumed for all GPUs : 0.001033 kWh. Total GPU Power : 125.89192160626192 W
[codecarbon INFO @ 08:38:06] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 08:38:06] 0.001436 kWh of electricity used since the beginning.
[codecarbon INFO @ 08:38:21] Energy consumed for RAM : 0.000073 kWh. RAM Power : 5.8618011474609375 W
[codecarbon INFO @ 08:38:21] Energy consumed for all GPUs : 0.001552 kWh. Total GPU Power : 124.80298170

CPU times: user 1h 43min 13s, sys: 11.7 s, total: 1h 43min 25s
Wall time: 1h 43min 37s


In [54]:
sample_test_dataset

Dataset({
    features: ['Data', 'Label', 'sentiment', 'llama2_output'],
    num_rows: 500
})

In [75]:
def check_sentiment(llama2_response):
    keywords = {
        'neutral': int(0),
        'Neutral': int(0),
        'mixed': int(0),
        'Mixed': int(0),
        'positive': int(1),
        'Positive': int(1),
        'negative': int(2),
        'Negative': int(2),
        'offensive': int(2),
        'Offensive': int(2),

    }
    positions = {keyword: str(llama2_response['llama2_output']).find(keyword) for keyword in keywords}
    valid_positions = {key: pos for key, pos in positions.items() if pos != -1}
    if not valid_positions:
        return {'predicted_label': int(4)}
    first_keyword = min(valid_positions, key=valid_positions.get)
    return {'predicted_label': (keywords[first_keyword])}

In [76]:
sample_test_dataset = sample_test_dataset.map(check_sentiment)
sample_test_dataset

  0%|          | 0/500 [00:00<?, ?ex/s]

Dataset({
    features: ['Data', 'Label', 'sentiment', 'llama2_output', 'predicted_label'],
    num_rows: 500
})

In [77]:

sample_test_dataset.to_csv('Llama 2 on 100 Sentnob test data.csv')
#co2_emission.to_csv('CO2 emissions.csv')
sample_test_dataset

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['Data', 'Label', 'sentiment', 'llama2_output', 'predicted_label'],
    num_rows: 500
})

In [78]:
print(predicted_label.shape)
print(type(predicted_label))


(1586,)
<class 'pandas.core.series.Series'>


In [79]:
predictions= sample_test_dataset['predicted_label']
ground_truth_labels= sample_test_dataset['Label']

In [80]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

accuracy = accuracy_score(ground_truth_labels, predictions)
f1_macro = f1_score(ground_truth_labels, predictions, average='macro')
f1_micro = f1_score(ground_truth_labels, predictions, average='micro')

precision = precision_score(ground_truth_labels, predictions, average="weighted")
recall = recall_score(ground_truth_labels, predictions, average="weighted")
class_report = classification_report(ground_truth_labels, predictions)
conf_matrix = confusion_matrix(ground_truth_labels, predictions)

print("Accuracy:", accuracy)
print("F1 Macro:", f1_macro)
print("F1 Micro:", f1_micro)
print('Precision:', precision)
print("Recall:", recall)
print("Classification Report:\n", class_report)
print("Confusion Matrix: \n", conf_matrix)
print('Total CO2 emissions in grams: \n',co2_emission*1000)

Accuracy: 0.362
F1 Macro: 0.27405001604209095
F1 Micro: 0.362
Precision: 0.7011059989567031
Recall: 0.362
Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.85      0.42       116
           1       0.87      0.22      0.35       211
           2       0.78      0.20      0.32       173
           4       0.00      0.00      0.00         0

    accuracy                           0.36       500
   macro avg       0.48      0.32      0.27       500
weighted avg       0.70      0.36      0.36       500

Confusion Matrix: 
 [[ 99   4   6   7]
 [138  47   4  22]
 [118   3  35  17]
 [  0   0   0   0]]
Total CO2 emissions in grams: 
 135.16559827660782
