In [10]:
import pandas as pd
import random
from pys.data import combined_df, filtered_labels_at_least_5_list
from pys.llama import LLAMA31_8B_LLM

In [11]:
def rephrase(description, author):
    """
    Ask the LLM to rewrite a description in the voice of a given author persona.

    Args:
        description (str): The text to be rephrased.
        author (dict): Persona to emulate. Must provide the keys
            'author_title' and 'expertise'; both are interpolated into
            the prompt.

    Returns:
        The ``content`` attribute of the object returned by
        ``LLAMA31_8B_LLM.invoke`` (presumably the rephrased text as a str).
    """
    # The prompt asks for the rewritten text only, without any framing
    # sentence around it.
    prompt = f"Rewrite the following description as if it were written by {author['author_title']} who is an expert in {author['expertise']}, without introducing or describing the rewritten text. Provide only the rephrased version: {description}"
    return LLAMA31_8B_LLM.invoke(prompt).content


In [12]:
# Author personas as (title, expertise) pairs; expanded below into the
# dict records stored in `authors`.
_PERSONAS = (
    ("Senior Developer",
     "Unix commands, software development tools, and debugging utilities."),
    ("Security Researcher",
     "Network protocols, interprocess communication, and hardware drivers."),
    ("Database Architect",
     "Designing, scaling, and managing distributed and graph databases."),
    ("Software Engineer",
     "Developing cross-platform applications, robotics software, and open-source projects."),
    ("Technical Writer",
     "Documenting technical tools, APIs, and usage guides for software and hardware."),
    ("Robotics Developer",
     "Developing software for robotics, complex systems, and distributed component architectures."),
)

authors = [
    {"author_title": title, "expertise": expertise}
    for title, expertise in _PERSONAS
]

In [13]:
# Split combined_df by its 'Artifact Id' column and keep one sub-frame per
# entry of filtered_labels_at_least_5_list.
df = combined_df
num_of_labels = len(filtered_labels_at_least_5_list)
grouped_df = df.groupby('Artifact Id')

# Iterating a GroupBy yields (key, sub-frame) pairs, which dict() consumes directly.
df_dict = dict(iter(grouped_df))

# A label with no matching group maps to an empty DataFrame.
label_dfs = {
    label: df_dict.get(label, pd.DataFrame())
    for label in filtered_labels_at_least_5_list
}

In [14]:
# Augment the dataset: for every example row of every label frame, request one
# rephrasing per author persona and record it as a new row. Each new row is a
# copy of the source row with only the description and source columns replaced.
new_rows = []

for label, label_df in label_dfs.items():
    print(f"Processing DataFrame for label: {label}")
    for _, row in label_df.iterrows():
        description = row['Example Description']

        # A missing or blank description would be interpolated into the prompt
        # as the literal text "nan" (or as nothing at all), and the model's
        # reply would then be stored as if it were a genuine example. Skip
        # such rows rather than spending LLM calls on fabricated data.
        if pd.isna(description) or not str(description).strip():
            continue

        for author in authors:
            # Copy so the source row (and its other columns) stays untouched.
            updated_row = row.copy()
            updated_row['Example Description'] = rephrase(description, author)
            updated_row['Example Source'] = "LLAMA3"
            new_rows.append(updated_row)

Processing DataFrame for label: d3f:Command


Processing DataFrame for label: d3f:Database
Processing DataFrame for label: d3f:Software
Processing DataFrame for label: d3f:HardwareDriver
Processing DataFrame for label: d3f:DisplayServer
Processing DataFrame for label: d3f:OperatingSystem
Processing DataFrame for label: d3f:FileSystem
Processing DataFrame for label: d3f:BootLoader
Processing DataFrame for label: d3f:InterprocessCommunication


In [15]:
# Assemble the collected rows into a single frame and write it to disk;
# the frame's index is not written to the CSV.
updated_df = pd.DataFrame(data=new_rows)
updated_df.to_csv(path_or_buf='../csv/dataset_aug_less.csv', index=False)

In [16]:
# Stack the original rows and the generated rows under a fresh 0..n-1 index,
# then write the result to disk without the index column.
merged_df = pd.concat(objs=[combined_df, updated_df], ignore_index=True)
merged_df.to_csv(path_or_buf='../csv/merged_aug_less.csv', index=False)

In [17]:
# Number of rows in the merged frame.
print(merged_df.shape[0])

198


In [18]:
# Sizes, in order: original frame rows, number of author personas,
# generated frame rows.
for sized in (combined_df, authors, updated_df):
    print(len(sized))

99
6
99


In [19]:
import re

In [20]:
# Quick check of the LLM wrapper: send a minimal prompt and print the reply
# with one sentence per line.
message = LLAMA31_8B_LLM.invoke("What")
message_content = message.content

# Split after '.', '!' or '?' when followed by one or more spaces; the
# lookbehind keeps the punctuation attached to its sentence.
sentence_boundary = re.compile(r'(?<=[.!?]) +')
sentences = sentence_boundary.split(message_content)

print("\n".join(sentences))

It seems like your question is cut off.
Could you please provide more context or clarify what you would like to know?
I'll do my best to assist you.
