# ***MicroSoft Phi 3 Evaluation***

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

PATH = "/kaggle/input/reasons/H.E. Llama2_7B (FT using L).xlsx"
if(torch.cuda.is_available()):
    device = 'cuda'

try:
    dataset = pd.read_excel(PATH)  # Adjust the delimiter if needed
except pd.errors.ParserError as e:
    print(f"Error parsing CSV: {e}")

import re

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@[^\n]+", "", text)
    text = re.sub(r"<s>+", " ", text)
    text = re.sub(r"\n+", " ", text)
    return re.sub(r"\^[^ ]+", "", text)

dataset['Generated Reason'] = dataset['Generated Reason'].apply(clean_text)


# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")



# Set random seed for reproducibility


# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

def summarize(text: str):
    inputs = tokenizer(text, return_token_type_ids=False,return_tensors="pt").to("cuda")

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)

#         answer_tokens = outputs[:, inputs.input_ids.shape[1] :]
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print(decoded_output)

    return decoded_output

scale0=scale1=scale2=scale3=scale4=scale5=0
total = len(dataset['Statement'])
rows = []


for i in range(len(dataset['Statement'])):
    text = f"### Question: Evaluate the quality of the following generated reason based on the Statement on a scale of 0-5, where ratings means: 0: Unusable - The response is irrelevant, nonsensical, or doesn't address the statement at all. 1: Poor - The response may contain some relevant information, but it's inaccurate, misleading, or poorly formatted. 2: Below Average - The response partially addresses the statement, but it lacks clarity, coherence, or sufficient detail. 3: Average - The response provides a general answer to the statement, but it could be improved with additional information or better organization. 4: Good - The response clearly and accurately addresses the statement, demonstrating a good understanding of the topic. 5: Excellent - The response is exceptional, going beyond the basic requirements to provide insightful or creative content. Given Statement: {dataset['Statement'][i]} Generated reason :{dataset['Generated Reason'][i]}. ###The rating for generated reason based on the Statement is: "
    pred = []
    processed_output = summarize(text)
    
    output = ""
    for item in processed_output:
        if '###The rating for generated reason based on the Statement is:' in item:
            output = item.split('###The rating for generated reason based on the Statement is:')[1].strip()
            break
    print(output)

    if 'unusable' in output:
      scale0+=1
      result = 'Unusable'
    elif 'poor' in output:
      scale1+=1
      result = 'Poor'
    elif 'Below Average' in output:
      scale1+=1
      result2 = 'Below Average'
    elif 'Average' in output:
      scale3+=1
      result = 'Average'
    elif 'Good' in output:
      scale4+=1
      result = 'Good'
    else: 
      scale5+=1
      result = 'Excellent'
    
    rows.append((dataset['Statement'][i],dataset['Generated Reason'], output, result))


print(f"scale0: {scale0}\nscale1: {scale1}\nscale2: {scale2}\nscale3: {scale3}\nscale4: {scale4}\nscale5: {scale5}\nAverage_scale0:{scale0/total}\nAverage_scale1:{scale1/total}\nAverage_scale2:{scale2/total}\n\nAverage_scale3:{scale3/total}\nAverage_scale4:{scale4/total}\n\nAverage_scale5:{scale5/total}")
df = pd.DataFrame(rows, columns=['Statement', 'Generated Reason', 'Output', 'Classified as'])

df.to_excel("Llama Phi3 Ourdataset L.xlsx", index = False)

# ***GPT 2 Evaluation***

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

PATH = "/kaggle/input/remaining-ones/Overall_ Mistral H.E. OurDataset (LR).xlsx"
if(torch.cuda.is_available()):
    device = 'cuda'

try:
    dataset = pd.read_excel(PATH)  # Adjust the delimiter if needed
except pd.errors.ParserError as e:
    print(f"Error parsing CSV: {e}")

import re

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@[^\n]+", "", text)
    text = re.sub(r"<s>+", " ", text)
    text = re.sub(r"\n+", " ", text)
    return re.sub(r"\^[^ ]+", "", text)

dataset['Generated Reason'] = dataset['Generated Reason'].apply(clean_text)


# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained(
    "gpt2", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True
)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")



# Set random seed for reproducibility



def summarize(text: str):
    inputs = tokenizer(text, return_token_type_ids=False,return_tensors="pt").to("cuda")

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)

#         answer_tokens = outputs[:, inputs.input_ids.shape[1] :]
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print(decoded_output)

    return decoded_output

scale0=scale1=scale2=scale3=scale4=scale5=0
total = len(dataset['Statement'])
rows = []


for i in range(len(dataset['Statement'])):
    text = f"### Question: Evaluate the quality of the following generated reason based on the Statement on a scale of 0-5, where ratings means: 0: Unusable - The response is irrelevant, nonsensical, or doesn't address the statement at all. 1: Poor - The response may contain some relevant information, but it's inaccurate, misleading, or poorly formatted. 2: Below Average - The response partially addresses the statement, but it lacks clarity, coherence, or sufficient detail. 3: Average - The response provides a general answer to the statement, but it could be improved with additional information or better organization. 4: Good - The response clearly and accurately addresses the statement, demonstrating a good understanding of the topic. 5: Excellent - The response is exceptional, going beyond the basic requirements to provide insightful or creative content. Given Statement: {dataset['Statement'][i]} Generated reason :{dataset['Generated Reason'][i]}. ###The rating for generated reason based on the Statement is: "
    pred = []
    processed_output = summarize(text)
    
    output = ""
    for item in processed_output:
        if '###The rating for generated reason based on the Statement is:' in item:
            output = item.split('###The rating for generated reason based on the Statement is:')[1].strip()
            break
    print(output)

    if 'unusable' in output:
      scale0+=1
      result = 'Unusable'
    elif 'poor' in output:
      scale1+=1
      result = 'Poor'
    elif 'Below Average' in output:
      scale1+=1
      result2 = 'Below Average'
    elif 'Average' in output:
      scale3+=1
      result = 'Average'
    elif 'Good' in output:
      scale4+=1
      result = 'Good'
    else: 
      scale5+=1
      result = 'Excellent'
    
    rows.append((dataset['Statement'][i],dataset['Generated Reason'], output, result))


print(f"scale0: {scale0}\nscale1: {scale1}\nscale2: {scale2}\nscale3: {scale3}\nscale4: {scale4}\nscale5: {scale5}\nAverage_scale0:{scale0/total}\nAverage_scale1:{scale1/total}\nAverage_scale2:{scale2/total}\n\nAverage_scale3:{scale3/total}\nAverage_scale4:{scale4/total}\n\nAverage_scale5:{scale5/total}")
df = pd.DataFrame(rows, columns=['Statement', 'Generated Reason', 'Output', 'Classified as'])

df.to_excel("Mistral GPT 2 Ourdataset L+R.xlsx", index = False)

# ***Gemma Evaluation***

In [None]:
!pip install huggingface-hub>=0.17.1
!huggingface-cli login --token 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

In [None]:
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

PATH = "/kaggle/input/llama-reasons/Overall Untrained Llama2 OurDataset.xlsx"
if(torch.cuda.is_available()):
    device = 'cuda'

try:
    dataset = pd.read_excel(PATH)  # Adjust the delimiter if needed
except pd.errors.ParserError as e:
    print(f"Error parsing CSV: {e}")

import re

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@[^\n]+", "", text)
    text = re.sub(r"<s>+", " ", text)
    text = re.sub(r"\n+", " ", text)
    return re.sub(r"\^[^ ]+", "", text)

dataset['Generated Reason'] = dataset['Generated Reason'].apply(clean_text)


# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b", 
    device_map="cuda", 
    torch_dtype=torch.bfloat16, 
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")




def summarize(text: str):
    inputs = tokenizer(text, return_token_type_ids=False,return_tensors="pt").to("cuda")

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)

#         answer_tokens = outputs[:, inputs.input_ids.shape[1] :]
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print(decoded_output)

    return decoded_output

scale0=scale1=scale2=scale3=scale4=scale5=0
total = len(dataset['Statement'])
rows = []


for i in range(len(dataset['Statement'])):
    text = f"### Question: Evaluate the quality of the following Generated Reason based on the Statement on a scale of 0-5, where ratings means: 0: Unusable - The response is irrelevant, nonsensical, or doesn't address the statement at all. 1: Poor - The response may contain some relevant information, but it's inaccurate, misleading, or poorly formatted. 2: Below Average - The response partially addresses the statement, but it lacks clarity, coherence, or sufficient detail. 3: Average - The response provides a general answer to the statement, but it could be improved with additional information or better organization. 4: Good - The response clearly and accurately addresses the statement, demonstrating a good understanding of the topic. 5: Excellent - The response is exceptional, going beyond the basic requirements to provide insightful or creative content. Given Statement: {dataset['Statement'][i]} Generated reason :{dataset['Generated Reason'][i]}. ###The rating for generated reason based on the Statement is: "
    pred = []
    processed_output = summarize(text)
    
    output = ""
    for item in processed_output:
        if '###The rating for Generated Reason based on the Statement is:' in item:
            output = item.split('###The rating for Generated Reason based on the Statement is:')[1].strip()
            break
    print(output)

    if 'unusable' in output:
      scale0+=1
      result = 'Unusable'
    elif 'poor' in output:
      scale1+=1
      result = 'Poor'
    elif 'Below Average' in output:
      scale1+=1
      result2 = 'Below Average'
    elif 'Average' in output:
      scale3+=1
      result = 'Average'
    elif 'Good' in output:
      scale4+=1
      result = 'Good'
    else: 
      scale5+=1
      result = 'Excellent'
    
    rows.append((dataset['Statement'][i],dataset['Generated Reason'], output, result))


print(f"scale0: {scale0}\nscale1: {scale1}\nscale2: {scale2}\nscale3: {scale3}\nscale4: {scale4}\nscale5: {scale5}\nAverage_scale0:{scale0/total}\nAverage_scale1:{scale1/total}\nAverage_scale2:{scale2/total}\n\nAverage_scale3:{scale3/total}\nAverage_scale4:{scale4/total}\n\nAverage_scale5:{scale5/total}")
df = pd.DataFrame(rows, columns=['Statement', 'Generated Reason', 'Output', 'Classified as'])

df.to_excel("Llama Gemma-2b Ourdataset Untrained.xlsx", index = False)

# ***Zephyr-3b***

In [1]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

PATH = "/kaggle/input/llama-reasons/Overall Untrained Llama2 OurDataset.xlsx"
if(torch.cuda.is_available()):
    device = 'cuda'

try:
    dataset = pd.read_excel(PATH)  # Adjust the delimiter if needed
except pd.errors.ParserError as e:
    print(f"Error parsing CSV: {e}")

import re

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@[^\n]+", "", text)
    text = re.sub(r"<s>+", " ", text)
    text = re.sub(r"\n+", " ", text)
    return re.sub(r"\^[^ ]+", "", text)

dataset['Generated Reason'] = dataset['Generated Reason'].apply(clean_text)


# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-zephyr-3b", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-zephyr-3b")



def summarize(text: str):
    inputs = tokenizer(text, return_token_type_ids=False,return_tensors="pt").to("cuda")

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)

#         answer_tokens = outputs[:, inputs.input_ids.shape[1] :]
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        print(decoded_output)

    return decoded_output

scale0=scale1=scale2=scale3=scale4=scale5=0
total = len(dataset['Statement'])
rows = []


for i in range(len(dataset['Statement'])):
    text = f"### Question: Evaluate the quality of the following generated reason based on the Statement on a scale of 0-5, where ratings means: 0: Unusable - The response is irrelevant, nonsensical, or doesn't address the statement at all. 1: Poor - The response may contain some relevant information, but it's inaccurate, misleading, or poorly formatted. 2: Below Average - The response partially addresses the statement, but it lacks clarity, coherence, or sufficient detail. 3: Average - The response provides a general answer to the statement, but it could be improved with additional information or better organization. 4: Good - The response clearly and accurately addresses the statement, demonstrating a good understanding of the topic. 5: Excellent - The response is exceptional, going beyond the basic requirements to provide insightful or creative content. Given Statement: {dataset['Statement'][i]} Generated reason :{dataset['Generated Reason'][i]}. ###The rating for generated reason based on the Statement is: "
    pred = []
    processed_output = summarize(text)
    
    output = ""
    for item in processed_output:
        if '###The rating for generated reason based on the Statement is:' in item:
            output = item.split('###The rating for generated reason based on the Statement is:')[1].strip()
            break
    print(output)

    if 'unusable' in output:
      scale0+=1
      result = 'Unusable'
    elif 'poor' in output:
      scale1+=1
      result = 'Poor'
    elif 'Below Average' in output:
      scale1+=1
      result2 = 'Below Average'
    elif 'Average' in output:
      scale3+=1
      result = 'Average'
    elif 'Good' in output:
      scale4+=1
      result = 'Good'
    else: 
      scale5+=1
      result = 'Excellent'
    
    rows.append((dataset['Statement'][i],dataset['Generated Reason'], output, result))


print(f"scale0: {scale0}\nscale1: {scale1}\nscale2: {scale2}\nscale3: {scale3}\nscale4: {scale4}\nscale5: {scale5}\nAverage_scale0:{scale0/total}\nAverage_scale1:{scale1/total}\nAverage_scale2:{scale2/total}\n\nAverage_scale3:{scale3/total}\nAverage_scale4:{scale4/total}\n\nAverage_scale5:{scale5/total}")
df = pd.DataFrame(rows, columns=['Statement', 'Generated Reason', 'Output', 'Classified as'])

df.to_excel("Llama Zephyr-3b Untrained.xlsx", index = False)

2024-05-11 11:25:58.097584: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-11 11:25:58.097722: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-11 11:25:58.236150: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.59G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


RuntimeError: cutlassF: no kernel found to launch!