In [None]:
!pip install optimum auto-gptq

In [None]:
# Log the Hugging Face CLI in with an access token passed on the command line.
# The token argument is an empty string in this notebook.
# NOTE(review): provide the token at run time and never commit a real one here.
!huggingface-cli login --token ''  

In [None]:
# Move the notebook's working directory up one level. The relative paths used
# in later cells ('iunput.csv', 'output.xlsx', "Meta") resolve against it.
%cd ..

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import time
from auto_gptq import exllama_set_max_input_length

# Prompts to run: read once into a DataFrame; the inference loop further down
# consumes its 'summ_prmpt' column.
# NOTE(review): 'iunput.csv' may be a misspelling of 'input.csv' — confirm the file name.
df = pd.read_csv('iunput.csv')

# Checkpoint under evaluation and its tokenizer, both resolved from the same id.
model_id = "wasifis/Llama-3-8B-4bits"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", low_cpu_mem_usage=True, torch_dtype=torch.float16
)

# Rebind `model` to whatever auto_gptq returns after setting the exllama
# maximum input length to 2088.
model = exllama_set_max_input_length(model, max_input_length=2088)

# Function to run inference on a prompt
def generate_response(prompt):
    """Generate one chat completion for ``prompt`` and count its tokens.

    The prompt is wrapped in a two-message chat (a fixed system message plus
    the user message), rendered through the tokenizer's chat template, moved
    to "cuda", and passed to ``model.generate`` with sampling enabled and at
    most 1000 new tokens.

    Only the token ids that follow the prompt in the generated sequence are
    decoded. Splitting the fully decoded text on the word "assistant" is not
    reliable: the system message itself ends with that word, so the first
    match falls inside the prompt and the "response" would still contain the
    user prompt (and its tokens would be counted). It would also raise
    ``IndexError`` whenever the keyword is absent.

    Relies on the module-level ``tokenizer`` and ``model``, and assumes
    ``model.generate`` returns the prompt ids followed by the new ids (the
    behaviour of decoder-only models in transformers).

    Args:
        prompt: Text of the user message.

    Returns:
        tuple[str, int]: The whitespace-stripped completion and the number of
        token ids generated after the prompt. Special tokens emitted by the
        model (for example an end-of-turn marker) are part of that count even
        though they are skipped when decoding.
    """
    messages = [
        {"role": "system", "content": "You are a helpful medical assistant"},
        {"role": "user", "content": prompt},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cuda")

    outputs = model.generate(**inputs, do_sample=True, max_new_tokens=1000)

    # Everything before this offset is the rendered prompt, not model output.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_length:]

    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    num_tokens = len(generated_ids)

    print(response)
    print(num_tokens)
    return response, num_tokens

# Wall-clock start of the whole batch
total_start_time = time.time()

# Running sum of the per-prompt token counts
total_tokens_generated = 0

# One generation per entry of the 'summ_prmpt' column; each result is written
# back to the 'response' cell of the same row.
for index, prompt in df['summ_prmpt'].items():
    response, num_tokens = generate_response(prompt)
    df.at[index, 'response'] = response
    total_tokens_generated = total_tokens_generated + num_tokens

# Wall-clock end; the elapsed time covers every generate_response call,
# including the printing done inside it.
total_end_time = time.time()
total_inference_time = total_end_time - total_start_time

# Persist the results a single time, after the loop has finished
df.to_excel('output.xlsx', index=False)

# Summary of the run: elapsed seconds, then the accumulated token count
print(
    f"Total Inference Time: {total_inference_time:.2f} seconds",
    f"Total Tokens Generated: {total_tokens_generated}",
    sep="\n",
)

In [None]:
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import torch

# Checkpoint to quantize
model_id = "wasifis/Llama-3-8B-v17"

# NOTE(review): neither value is read anywhere in the visible notebook.
num_samples, max_seq_len = 756, 4064

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calibration data passed to model.quantize below: one tokenized sentence,
# returned as PyTorch tensors.
examples = [
    tokenizer(
        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm.",
        return_tensors="pt",
    )
]

# Quantization settings: 8 bits, group size 128, desc_act enabled,
# damp_percent 0.1, and "model" as the base name of the saved file.
quantize_config = BaseQuantizeConfig(
    bits=8,
    group_size=128,
    desc_act=True,
    damp_percent=0.1,
    model_file_base_name="model",
)

# CUDA when it is available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the checkpoint together with the quantization config, then move it to the device
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, device_map="auto")
model = model.to(device)

# Put every tensor of every calibration example on that same device
for example in examples:
    for key in list(example.keys()):
        example[key] = example[key].to(device)

# Quantize with the calibration examples, then write the weights (safetensors)
# and the tokenizer files into the "Meta" directory.
model.quantize(examples)
model.save_quantized("Meta", use_safetensors=True)
tokenizer.save_pretrained("Meta")

In [None]:
%cd Meta
!huggingface-cli upload wasifis/Llama-3-8B-v17-8bits .

In [None]:
import pandas as pd

# Load the Excel workbook of generated responses ('output.xlsx' is the file
# written by the inference cell's df.to_excel call)
df = pd.read_excel('output.xlsx')

# Function to extract text after the word "assistant"
def extract_response(text):
    """Return the part of ``text`` that follows the first "assistant".

    Args:
        text: A cell value from the 'response' column. Values that are not
            strings (for example the NaN pandas produces for an empty Excel
            cell, or ``None``) are returned unchanged instead of raising
            ``TypeError`` on the substring check.

    Returns:
        The whitespace-stripped text after the first occurrence of
        "assistant"; the original value when the keyword is absent or the
        value is not a string.
    """
    if not isinstance(text, str):
        return text

    # partition() splits on the first occurrence only, like split(keyword, 1).
    _, keyword, remainder = text.partition("assistant")
    return remainder.strip() if keyword else text

# Apply the function to the 'response' column
df['response'] = df['response'].apply(extract_response)

# Save the modified DataFrame to a new Excel file
output_path = 'llamav174bitsoutput.xlsx'
df.to_excel(output_path, index=False)

# Report the file that was actually written; the message previously named
# 'output.xlsx', which is the input workbook, not the file saved here.
print(f"Processing complete. The modified Excel file has been saved as '{output_path}'.")