# 0. Install required libraries and define constants

In [None]:
pip install wheel packaging ninja pandas

In [None]:
pip install flash-attn

In [None]:
# Set environment variables for the kernel process: two cache locations and
# the TOKENIZERS_PARALLELISM flag.
# NOTE(review): both paths point into one specific user's cluster directory —
# adjust them to your own account before running.
%env HF_HOME=/cluster/user/nimeseth/.cache
%env HF_DATASETS_CACHE=/cluster/user/nimeseth/datasets
%env TOKENIZERS_PARALLELISM=False

In [None]:
# Run nvidia-smi in a shell and show its report; presumably a sanity check
# that a GPU is visible before the model is loaded onto "cuda" below.
!nvidia-smi

# 1. Download and install the LLM (Gemma-2B / Gemma-7B / Phi-3) and set up the model
Change the variable `model_id` to change the model to download and run in this notebook.

In [None]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Exactly one model_id should be active; the commented lines are alternatives.
#model_id = "microsoft/Phi-3-mini-128k-instruct"
model_id = "microsoft/Phi-3-medium-128k-instruct"
#model_id = "google/gemma-7b-it"
#model_id = "google/gemma-2b-it"
#model_id = "LeoLM/leo-hessianai-13b-chat-bilingual"

# Prefer the HF_TOKEN environment variable so the secret never has to be
# saved inside the notebook; the placeholder remains as a fallback for
# anyone who still wants to paste the token here.
access_token = os.environ.get("HF_TOKEN") or "<REPLACE_WITH_ACCESS_TOKEN>" # Provided by instructor
dtype = torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)

# Load the weights directly onto the GPU in bfloat16 with FlashAttention-2
# (requires the flash-attn package installed above).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=dtype,
    token = access_token,
    trust_remote_code=True,
    attn_implementation="flash_attention_2"
)

# Text-generation pipeline reused by every inference cell below.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# 2. Run inference on the model for testing purposes
Play around with the arguments for the generation process. Setting `temperature` to a higher value will increase randomness.

In [None]:
# Sampling settings for the smoke test: return only the newly generated text.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.5,
    do_sample=True,
)

# A short chat history: one worked example followed by the actual question.
chat_turns = [
    ("user", "What is the capital of Germany?"),
    ("assistant", "The capital of Germany is Berlin. Berlin has around 3 Million inhabitants. The most important attraction is the 'Brandenburger Tor'"),
    ("user", "What is the capital of Italy?"),
]
messages = [{"role": role, "content": content} for role, content in chat_turns]

# The pipeline returns a list of candidates; print the text of the first one.
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

# 3. Load transcripts from folder and perform the same action for each

Set the generation parameters for the following transcripts.

In [None]:
# Generation settings shared by all transcript runs below
# (overrides the values used for the test in section 2).
generation_args = dict(
    max_new_tokens=1000,
    return_full_text=False,
    temperature=0.2,
    do_sample=True,
)

Define a folder to save the LLM's output for each transcript to:

In [None]:
LLM_OUTPUT_FOLDER = "llm_output"

import os

# Make sure the output folder is present and report which case applied.
if os.path.exists(LLM_OUTPUT_FOLDER):
    print(f"Folder '{LLM_OUTPUT_FOLDER}' already exists.")
else:
    os.makedirs(LLM_OUTPUT_FOLDER)
    print(f"Folder '{LLM_OUTPUT_FOLDER}' created.")

Create a function that takes a transcript and creates a prompt object for the LLM:

In [None]:
def create_prompt_messages(prompt, transcript):
    """Build a single-turn chat for the LLM from an instruction and a transcript.

    Args:
        prompt: Instruction text placed at the start of the user message.
        transcript: Transcript text; leading and trailing whitespace is
            stripped before it is embedded in single quotes.

    Returns:
        A list containing one ``{"role": "user", "content": ...}`` dict.
    """
    user_turn = {
        "role": "user",
        "content": f"{prompt}\n\nHere is the transcript: '{transcript.strip()}'",
    }
    return [user_turn]

Define a function to write the LLM results to a file:

In [None]:
import csv

def write_result_to_file(llm_results, output_file_name):
    """Write LLM results as a fully quoted CSV file inside LLM_OUTPUT_FOLDER.

    Args:
        llm_results: Iterable of dicts with the keys "yt_id", "transcript"
            and "llm_result".
        output_file_name: File name relative to LLM_OUTPUT_FOLDER; the folder
            prefix is added here, so callers pass the bare name.
    """
    csv_file = f"{LLM_OUTPUT_FOLDER}/{output_file_name}"
    column_names = ["yt_id", "transcript", "llm_result"]

    # newline='' leaves line-ending handling to the csv module
    with open(csv_file, mode='w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(
            handle,
            fieldnames=column_names,
            quoting=csv.QUOTE_ALL,
            escapechar='\\',
        )
        # Header first, then every result row in the order given
        writer.writeheader()
        writer.writerows(llm_results)

    print(f"Data successfully written to {csv_file}")

Iterate through all files in the transcript folder and process with LLM:

In [None]:
import os
import glob

# Define the folder path
folder_path = "transcripts"

# Upper bound on the number of transcripts processed per run; raise it for a full run.
MAX_TRANSCRIPT_FILES = 5

# Get a list of all text files in the folder. glob returns files in an
# unspecified order, so sort to process the same files on every run.
transcript_files = sorted(glob.glob(os.path.join(folder_path, '*.txt')))

rows = []

# Iterate through the (limited) list of text files
for counter, file_path in enumerate(transcript_files[:MAX_TRANSCRIPT_FILES], start=1):

    # Read the transcript and close the file before the slow LLM call
    with open(file_path, 'r', encoding='utf-8') as file:
        transcript = file.read()
    print(f"{counter} File: {file_path} with length {len(transcript)}")

    # Create the messages object using the previously defined function
    messages = create_prompt_messages("List five words that capture the content of the video! Return only the five words! All words must be in GERMAN! Do not number the words!", transcript)

    # Best effort: a failure on one transcript is reported and skipped so the
    # remaining transcripts are still processed.
    try:
        output = pipe(messages, **generation_args)
        llm_result = output[0]['generated_text']
    except Exception as e:
        print(f"Error: {e}")
        continue

    # The id is the file name without folder and extension,
    # e.g. "transcripts/abc.txt" -> "abc".
    yt_id = os.path.splitext(os.path.basename(file_path))[0]
    rows.append({ "yt_id" : yt_id, "transcript": transcript, "llm_result": llm_result })

write_result_to_file(rows, "llm_results.csv")

## Estimating the tokens in a given text
Helpful tool to check how many tokens a text contains: [OpenAI Tokenizer](https://platform.openai.com/tokenizer)

A rule of thumb is to divide the length in characters by 4.

# 4. Load transcripts from CSV file and run prompt against each

In [None]:
import csv
from itertools import islice

# Path to the CSV file
file_path = 'transcripts/transcripts.csv'

# Upper bound on the number of CSV rows processed per run; raise it for a full run.
MAX_TRANSCRIPT_ROWS = 20

llm_results = []

# Open the CSV file and read it. newline='' is what the csv module expects so
# that line breaks inside quoted transcript fields are parsed correctly.
with open(file_path, mode='r', encoding='utf-8', newline='') as file:
    csv_reader = csv.DictReader(file)
    for counter, source_row in enumerate(islice(csv_reader, MAX_TRANSCRIPT_ROWS), start=1):

        yt_id = source_row['yt_id']
        transcript = source_row['transcript']

        print(f"{counter} yt_id: {yt_id} with transcript length {len(transcript)}")

        # Create the messages object using the previously defined function
        messages = create_prompt_messages("List five words that capture the content of the video! Return only the five words! All words must be in GERMAN! Do not number the words!", transcript)

        # Best effort: a failure on one transcript is reported and skipped so
        # the remaining rows are still processed.
        try:
            output = pipe(messages, **generation_args)
            llm_result = output[0]['generated_text']
        except Exception as e:
            print(f"Error: {e}")
            continue

        llm_results.append({ "yt_id" : yt_id, "transcript": transcript, "llm_result": llm_result })

# write_result_to_file already prepends LLM_OUTPUT_FOLDER, so only the bare
# file name is passed (passing the folder again would point at a nested,
# non-existent "llm_output/llm_output" directory). The name is distinct from
# "llm_results_combined.csv", which the merge step in section 5 writes.
write_result_to_file(llm_results, "llm_results_from_csv.csv")

# 5. Merge results from multiple CSV files into a single file

In [None]:
import os
import glob

import pandas as pd

input_folder = f"{LLM_OUTPUT_FOLDER}"
output_file_name = "llm_results_combined.csv"
output_path = os.path.join(input_folder, output_file_name)

# Get a list of all CSV files in the output directory. The combined file lives
# in the same folder, so it is excluded here; otherwise a re-run would read
# its own previous output and duplicate every row. Sorting makes the row
# order of the merged file reproducible.
all_files = sorted(
    path
    for path in glob.glob(os.path.join(input_folder, "*.csv"))
    if os.path.basename(path) != output_file_name
)

# Read each file into a DataFrame
dfs = []
for filename in all_files:
    print(filename)
    dfs.append(pd.read_csv(filename))

if dfs:
    # Concatenate all DataFrames into a single one and write it out
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(output_path, index=False)
else:
    # pd.concat raises on an empty list; report the situation instead
    print(f"No CSV files found in '{input_folder}'; nothing to merge.")