In [1]:
import replicate
from groq import Groq
import os
from dotenv import load_dotenv

# Pull the variables declared in .env into the process environment.
load_dotenv()

# Groq credential looked up from the environment; None when GROQ_API_KEY is unset.
api_key = os.environ.get("GROQ_API_KEY")

In [2]:
import json

def load_json(file_path):
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return json.load(handle)

In [3]:
def run_replicate(prompt, max_tokens):
    """Run ``prompt`` through the ``meta/meta-llama-3-8b`` model via ``replicate.run``.

    Args:
        prompt: Text forwarded as the model's ``prompt`` input.
        max_tokens: Value forwarded as the model's ``max_tokens`` input.

    Returns:
        Whatever ``replicate.run`` returns for this request.
    """
    # Fixed sampling settings; only the prompt and the token budget vary per call.
    model_inputs = dict(
        top_p=0.9,
        prompt=prompt,
        temperature=0.6,
        presence_penalty=1.15,
        max_tokens=max_tokens,
    )
    return replicate.run("meta/meta-llama-3-8b", input=model_inputs)

In [11]:
def run_groq(client, prompt, max_tokens, system_prompt, response_format=False):
    """Send one system + user message pair to ``llama-3.3-70b-versatile`` and return the reply.

    Args:
        client: Object exposing ``chat.completions.create`` (a Groq client in this notebook).
        prompt: Content of the user message.
        max_tokens: Forwarded as ``max_completion_tokens``.
        system_prompt: Content of the system message.
        response_format: When truthy, ``{"type": "json_object"}`` is requested as the
            response format; otherwise ``None`` is passed.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # JSON mode is opt-in; callers that parse the reply with json.loads enable it.
    if response_format:
        output_spec = {"type": "json_object"}
    else:
        output_spec = None

    reply = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=chat_messages,
        temperature=1,
        max_completion_tokens=max_tokens,
        top_p=1,
        stream=False,
        stop=None,
        response_format=output_spec,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content
    

In [22]:
def write_file(name: str, bias_splits: dict, original_json: dict):
    """Merge ``bias_splits`` into ``original_json`` and write the result to ``name`` as JSON.

    Args:
        name: Path of the JSON file to (over)write.
        bias_splits: Key/value pairs to add to, or overwrite in, ``original_json``.
        original_json: Document to extend. It is mutated in place.

    Raises:
        TypeError: If the merged document contains a value ``json`` cannot serialise.
            The file at ``name`` is left untouched in that case.
    """
    for k, v in bias_splits.items():
        original_json[k] = v

    # Serialise before opening the file: opening with "w" truncates it, so a
    # serialisation failure after that point would destroy the existing data.
    serialized = json.dumps(original_json, indent=4)

    # Same encoding as load_json uses when reading these files back.
    with open(name, "w", encoding="utf-8") as f:
        f.write(serialized)

def generate_text_groq(idx, path, groq_client):
    """Generate a continuation of a document's ``original_text`` and store it in the same file.

    The JSON file at ``path`` is loaded, its ``original_text`` is sent to
    ``run_groq`` with a token budget of twice its word count, and a non-empty
    reply is written back to ``path`` under the ``generated_text`` key.

    Args:
        idx: Identifier used only in the progress message.
        path: Path of the JSON document to read and update.
        groq_client: Client object forwarded to ``run_groq``.

    Returns:
        None. The result is persisted to ``path``.
    """
    json_data = load_json(path)
    word_count = len(json_data["original_text"].split())

    print("generating for id = ", idx, "word count = ", word_count)

    system_prompt = "Continue the following text, keeping the same tone"

    output = run_groq(groq_client, json_data["original_text"], word_count * 2, system_prompt)
    # join() keeps working if the reply arrives as a sequence of string chunks;
    # a missing (None) reply is treated as empty.
    generated_text = "".join(output or "")
    print("generated sequence of length = ", len(generated_text.split()))
    if generated_text:
        # write_file expects a mapping of new fields; passing the bare string
        # made it fail on .items(), so the text was never saved.
        write_file(path, {"generated_text": generated_text}, json_data)

    

In [6]:
import glob

# Names of the JSON files directly inside generated_text/, relative to that folder.
json_names = glob.glob("*.json", root_dir="generated_text")

# Keep the second underscore-separated piece of every file name.
seen_files = [json_name.split("_")[1] for json_name in json_names]
print(len(seen_files))

3537


In [10]:
import time

# Groq client built from the key read from the environment at the top of the notebook.
client = Groq(api_key=api_key)

# Generate a continuation for every JSON document under generated_text/.
files = glob.glob("generated_text/*.json")
for position, json_path in enumerate(files):
    print("investigating file ", json_path)
    try:
        generate_text_groq(position, json_path, client)
    except Exception as err:
        # Report the failure and carry on with the remaining files.
        print("error! on file = ", json_path)
        print(err)
    else:
        # Pause between successful requests.
        time.sleep(5)

investigating file  generated_text/generated_Ojbbi0XTvX4wv0Cw.json
generating for id =  0 word count =  601
generated sequence of length =  482
investigating file  generated_text/generated_GDYjPxH9jfKDM5pV.json
generating for id =  1 word count =  1127
generated sequence of length =  410
investigating file  generated_text/generated_0NIUHGaXx1WeDpqh.json
generating for id =  2 word count =  1543
generated sequence of length =  495
investigating file  generated_text/generated_LgMWgfrCKX1uHpmF.json
generating for id =  3 word count =  2011
generated sequence of length =  577
investigating file  generated_text/generated_GieU63gDdvuYqWgM.json
generating for id =  4 word count =  988
generated sequence of length =  462
investigating file  generated_text/generated_JMISI7xF4lvuwCuu.json
generating for id =  5 word count =  798
generated sequence of length =  466
investigating file  generated_text/generated_JUqTHLoBAcrUTWqB.json
generating for id =  6 word count =  1112
generated sequence of le

In [18]:
# NOTE: this redefines the write_file declared in an earlier cell; being later
# in the notebook, it is the definition in effect after a top-to-bottom run.
def write_file(name: str, bias_splits: dict, original_json: dict):
    """Merge ``bias_splits`` into ``original_json`` and write the result to ``name`` as JSON.

    Args:
        name: Path of the JSON file to (over)write.
        bias_splits: Key/value pairs to add to, or overwrite in, ``original_json``.
        original_json: Document to extend. It is mutated in place.

    Raises:
        TypeError: If the merged document contains a value ``json`` cannot serialise.
            The file at ``name`` is left untouched in that case.
    """
    for k, v in bias_splits.items():
        original_json[k] = v

    # Serialise before opening the file: opening with "w" truncates it, so a
    # serialisation failure after that point would destroy the existing data.
    serialized = json.dumps(original_json, indent=4)

    # Same encoding as load_json uses when reading these files back.
    with open(name, "w", encoding="utf-8") as f:
        f.write(serialized)

In [13]:
def generate_bias_groq(prompt, client):
    """Ask the model to label ``prompt`` as left, center or right.

    Args:
        prompt: Text to classify; sent as the user message.
        client: Client object forwarded to ``run_groq``.

    Returns:
        The value stored under ``'bias'`` in the model's JSON reply.
    """
    instructions = "classify the following text with one of the following biases: left, center, right. Do this in the following json format {'bias' : 'text'}"
    # JSON mode is requested so the reply can be parsed directly.
    raw_reply = run_groq(client, prompt, 1024, instructions, True)
    parsed_reply = json.loads(raw_reply)
    return parsed_reply['bias']


In [14]:
# Classify the bias of both the original and the generated text of every document
# and store the two labels back into the document's JSON file.
for idx, file in enumerate(files):
    print(f"investigating id={idx} file={file}")
    try:
        json_data = load_json(file)
        # The original text is classified first, then the generated text.
        bias_labels = {
            'llama_original_bias': generate_bias_groq(json_data['original_text'], client),
            'llama_generated_bias': generate_bias_groq(json_data['generated_text'], client),
        }
        write_file(file, bias_labels, json_data)
    except Exception as e:
        # Report the failure and carry on with the remaining files.
        print("error! on file = ", file)
        print(e)
    else:
        # Pause between successfully processed files.
        time.sleep(5)

investigating id=0 file=generated_text/generated_Ojbbi0XTvX4wv0Cw.json
investigating id=1 file=generated_text/generated_GDYjPxH9jfKDM5pV.json
investigating id=2 file=generated_text/generated_0NIUHGaXx1WeDpqh.json
investigating id=3 file=generated_text/generated_LgMWgfrCKX1uHpmF.json
investigating id=4 file=generated_text/generated_GieU63gDdvuYqWgM.json
investigating id=5 file=generated_text/generated_JMISI7xF4lvuwCuu.json
investigating id=6 file=generated_text/generated_JUqTHLoBAcrUTWqB.json
investigating id=7 file=generated_text/generated_DVZ0BaagJH912RIl.json
investigating id=8 file=generated_text/generated_2WCsVSwrGB6fs9ly.json
investigating id=9 file=generated_text/generated_VfW5qRiCbOmJQvkn.json
investigating id=10 file=generated_text/generated_SwBNgGJB7Kj5mUPa.json
investigating id=11 file=generated_text/generated_TZbz8FXvoIBcnoza.json
investigating id=12 file=generated_text/generated_OjRltkz99X9RFtk8.json
investigating id=13 file=generated_text/generated_7kzIZs7iteLzkBV5.json
in

In [15]:
def generate_topic_groq(client, loaded_json):
    """Ask the model whether a document's generated and original texts share a topic.

    Args:
        client: Client object forwarded to ``run_groq``.
        loaded_json: Mapping providing ``'generated_text'`` and ``'original_text'``.

    Returns:
        The value stored under ``'same_topic'`` in the model's JSON reply.
    """
    instructions = "Are the following two texts on roughly the same topic? Use the following JSON format: {'same_topic': boolean}"
    # Both texts travel in a single JSON-encoded user message.
    text_pair = {
        'text_1' : loaded_json['generated_text'],
        'text_2' : loaded_json['original_text'],
    }
    raw_reply = run_groq(client, json.dumps(text_pair), 1024, instructions, True)
    parsed_reply = json.loads(raw_reply)
    return parsed_reply['same_topic']

In [16]:
# Record, for every document, whether the model judges the generated text to be
# on roughly the same topic as the original text.
for idx, file in enumerate(files):
    print(f"investigating id={idx} file={file}")
    try:
        json_data = load_json(file)
        topic_fields = {'llama_same_topic': generate_topic_groq(client, json_data)}
        write_file(file, topic_fields, json_data)
    except Exception as e:
        # Report the failure and carry on with the remaining files.
        print("error! on file = ", file)
        print(e)
    else:
        # Pause between successfully processed files.
        time.sleep(5)

investigating id=0 file=generated_text/generated_Ojbbi0XTvX4wv0Cw.json
investigating id=1 file=generated_text/generated_GDYjPxH9jfKDM5pV.json
investigating id=2 file=generated_text/generated_0NIUHGaXx1WeDpqh.json
investigating id=3 file=generated_text/generated_LgMWgfrCKX1uHpmF.json
investigating id=4 file=generated_text/generated_GieU63gDdvuYqWgM.json
investigating id=5 file=generated_text/generated_JMISI7xF4lvuwCuu.json
investigating id=6 file=generated_text/generated_JUqTHLoBAcrUTWqB.json
investigating id=7 file=generated_text/generated_DVZ0BaagJH912RIl.json
investigating id=8 file=generated_text/generated_2WCsVSwrGB6fs9ly.json
investigating id=9 file=generated_text/generated_VfW5qRiCbOmJQvkn.json
investigating id=10 file=generated_text/generated_SwBNgGJB7Kj5mUPa.json
investigating id=11 file=generated_text/generated_TZbz8FXvoIBcnoza.json
investigating id=12 file=generated_text/generated_OjRltkz99X9RFtk8.json
investigating id=13 file=generated_text/generated_7kzIZs7iteLzkBV5.json
in

In [25]:
# Backfill pass: for every document, compute only the fields that are still
# missing (generated text, the two bias labels, the same-topic flag).
for idx, file in enumerate(files):
    try:
        # Loading inside the try keeps one unreadable file from aborting the
        # whole pass, matching the other processing loops.
        loaded = load_json(file)
        if 'generated_text' not in loaded:
            print(f"generating text {file}")
            # generate_text_groq writes its result to the file itself, so the
            # document is reloaded to pick up the new field.
            generate_text_groq(idx, file, client)
            loaded = load_json(file)
        if 'llama_original_bias' not in loaded:
            print(f"generating original bias {file}")
            res = generate_bias_groq(loaded['original_text'], client)
            write_file(file, {'llama_original_bias': res}, loaded)
            loaded = load_json(file)
        if 'llama_generated_bias' not in loaded:
            print(f"generating generated bias {file}")
            res = generate_bias_groq(loaded['generated_text'], client)
            write_file(file, {'llama_generated_bias': res}, loaded)
            loaded = load_json(file)
        if 'llama_same_topic' not in loaded:
            res = generate_topic_groq(client, loaded)
            write_file(file, {'llama_same_topic': res}, loaded)
    except Exception as e:
        # Name the file so a failure can be traced; this loop prints no
        # per-file progress line before it.
        print("error! on file = ", file)
        print(e)

