In [None]:
!pip install vllm pandas

In [2]:
from vllm import LLM, SamplingParams

In [None]:
# Load the AWQ-quantized Mistral-7B-Instruct checkpoint through vLLM.
llm = LLM(
    model="TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
    quantization="awq",
    dtype="auto",
)

In [12]:
# Sampling settings passed to llm.generate by process_data.
sampling_params = SamplingParams(
    temperature=0.9,
    top_k=50,
    max_tokens=200,
)

In [58]:
# Instruction text for whole songs: asks the model for exactly four themes,
# returned as a fenced YAML block with the keys theme_1 .. theme_4.
# NOTE(review): the example bodies below mix tab and space indentation; YAML
# does not allow tabs for indentation, so confirm whether this is intentional.
lyrics_yaml_prompt = """Analyze song lyrics and the info about them which may provide additional context about the lyrics but it also could be irrelevant and output just the four main themes in a yaml format with the four keys being named theme_1, theme_2, theme_3 and theme_4. Only output the yaml format I have given in the examples, do not add anything else to it and do not output things like specifics, descriptions, notes, etc. Here is an example of what your output should be:
        ```yaml
		theme_1: poverty
		theme_2: hard work
		theme_3: success
		theme_4: wealth
        ```  
  Here is another example:
        ```yaml
        theme_1: pride
		theme_2: envy
		theme_3: lust
		theme_4: hatred
        ``` 
 """

# Instruction text for annotation rows: asks the model for exactly two themes,
# returned as a fenced YAML block with the keys theme_1 and theme_2.
# NOTE(review): the wording still says "song lyrics", while generate_prompt
# fills this prompt with the `referent` and `annotation` columns -- confirm the
# phrasing is what you want for annotation inputs.
annotations_yaml_prompt = """Analyze song lyrics and the info about them which may provide additional context about the lyrics but it also could be irrelevant and output just the two main themes in a yaml format with the two keys being named theme_1, and theme_2. Only output the yaml format I have given in the examples, do not add anything else to it and do not output things like specifics, descriptions, notes, etc. Here is an example of what your output should be:
        ```yaml
		theme_1: hard work
		theme_2: success
        ```  
  Here is another example:
        ```yaml
        theme_1: pride
		theme_2: envy
        ``` 
 """

# Generic preamble that create_prompt places first inside the [INST] block,
# ahead of the task-specific instructions.
system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."


In [36]:
# Layout of the task section of a prompt: the instruction text, then one value
# under a "[lyrics]" header and one under an "[info]" header.
# create_prompt fills the three slots positionally as (prompt, lyrics, info).
input_template = """
{}
[lyrics]
{}
[info]
{}
"""

# [INST] ... [/INST] wrapper around the whole request; create_prompt fills the
# two slots positionally as (system_message, formatted input_template).
# NOTE(review): the template starts with a newline followed by a literal "<s>".
# If the tokenizer also prepends its own BOS token the prompt would carry it
# twice -- confirm against the model's expected chat format.
instruct_prompt = '''
<s>[INST]
{}
{} 
[/INST]
'''

In [72]:
import pandas as pd
# Read both CSV datasets in full.
lyrics_full_df = pd.read_csv('lyrics_dataset_clean.csv')
annotations_full_df = pd.read_csv('annotations_dataset.csv')

# Keep the first 100 rows of each as smaller working frames.
lyrics_df = lyrics_full_df.iloc[:100]
annotations_df = annotations_full_df.iloc[:100]

In [None]:
def create_prompt(prompt, lyrics, info):
    """Assemble the full text sent to the model for one item.

    ``prompt``, ``lyrics`` and ``info`` are slotted into ``input_template``;
    the result is then wrapped by ``instruct_prompt`` with ``system_message``
    placed first.

    Returns:
        The formatted prompt string.
    """
    task_section = input_template.format(prompt, lyrics, info)
    return instruct_prompt.format(system_message, task_section)

def generate_prompt(row, type):
    """Build the model prompt for a single dataset row.

    Args:
        row: Mapping-like row; ``'lyrics'`` reads ``song_lyrics`` and
            ``song_description``, ``'annotations'`` reads ``referent`` and
            ``annotation``.
        type: Either ``'lyrics'`` or ``'annotations'``.

    Returns:
        The prompt string, or ``None`` for any other ``type``.
    """
    if type == 'lyrics':
        return create_prompt(lyrics_yaml_prompt, row['song_lyrics'], row['song_description'])
    if type == 'annotations':
        return create_prompt(annotations_yaml_prompt, row['referent'], row['annotation'])
    # Unknown prompt kinds yield no prompt at all.
    return None

themes_num = 4  # default number of themes expected in a model response

def format_themes(themes, num_themes=None):
    """Extract the theme values from a model response and join them with ", ".

    The response is split on the literal ``theme_``; each following piece is
    expected to look like ``<n>: <value>``. Code-fence markers are removed
    from every value and surrounding whitespace is trimmed.

    Args:
        themes: Raw text generated by the model.
        num_themes: How many themes to extract. When ``None`` the module-level
            ``themes_num`` is read at call time, so reassigning that global
            keeps working.

    Returns:
        The first ``num_themes`` values joined by ``", "``, or the string
        ``"No topics available"`` when the response holds fewer entries or an
        entry has no ``:`` separator.
    """
    expected = themes_num if num_themes is None else num_themes

    parts = themes.split("theme_")
    # parts[0] is whatever precedes the first key, hence the +1.
    if len(parts) < expected + 1:
        return "No topics available"

    values = []
    for part in parts[1:expected + 1]:
        _, separator, value = part.partition(":")
        if not separator:
            return "No topics available"
        # Drop the fence first and trim afterwards: trimming first leaves the
        # newline that sat between the last value and the closing fence.
        values.append(value.replace('```', '').strip())

    return ", ".join(values)

In [None]:
import pandas as pd
import math

def create_batches(df, batch_size):
    """Split ``df`` into consecutive slices of at most ``batch_size`` rows.

    Args:
        df: Any sliceable, sized object (a DataFrame in this notebook).
        batch_size: Positive number of rows per batch.

    Returns:
        A list of slices in original order; the last one may be shorter.
        An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive. Previously zero failed
            with a ZeroDivisionError and negative sizes silently produced no
            batches.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    return [df[start:start + batch_size] for start in range(0, len(df), batch_size)]

def process_data(df, batch_size, llm, prompt_type='annotations', params=None):
    """Generate themes for every row of ``df`` and return them in a new frame.

    Rows are processed in batches: each batch is turned into prompts with
    ``generate_prompt``, sent to ``llm.generate``, and the first completion of
    every output is reduced to a theme string with ``format_themes``.

    Args:
        df: Input DataFrame; it is not modified.
        batch_size: Number of rows sent to the model per call.
        llm: Object exposing ``generate(prompts, sampling_params)``.
        prompt_type: Prompt kind forwarded to ``generate_prompt``
            (``'annotations'`` by default, matching the previous behavior).
        params: Sampling parameters for generation; when ``None`` the
            module-level ``sampling_params`` is used.

    Returns:
        A DataFrame with the input rows, a fresh 0..n-1 index and an added
        ``themes`` column. An empty input yields an empty DataFrame.
    """
    generation_params = sampling_params if params is None else params
    batches = create_batches(df, batch_size)
    num_batches = len(batches)

    processed = []
    for i, batch_df in enumerate(batches):
        prompts = [generate_prompt(row, prompt_type) for _, row in batch_df.iterrows()]

        outputs = llm.generate(prompts, generation_params)

        generated_themes = [format_themes(output.outputs[0].text) for output in outputs]

        # assign() builds a new frame, so nothing is written onto a slice of df.
        processed.append(batch_df.assign(themes=generated_themes))

        if i % 100 == 0:
            print(f'batch: {i}/{num_batches}')

    if not processed:
        # pd.concat rejects an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # One concat at the end instead of re-copying the result on every batch.
    return pd.concat(processed, ignore_index=True)




In [None]:
# The annotations prompt asks for two themes, so format_themes must expect two.
themes_num = 2
batch_size = 8
result_df = process_data(df=annotations_full_df, batch_size=batch_size, llm=llm)

In [None]:
# Save the frame returned by process_data; no `annotations_final_df` is defined
# in this notebook, so the original name fails on a fresh kernel.
result_df.to_csv('final_annotations_dataset.csv', index=False)