In [None]:
import pandas as pd
# Read the tab-separated test split and preview its first rows.
test_split_path = '/Users/akhil/Downloads/final_test.tsv'
df = pd.read_csv(test_split_path, sep="\t")
df.head()

In [27]:
import vertexai
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part
import json
import time

# Vertex AI settings. Every value that vertexai.init() / GenerativeModel() receives
# is defined here so the region in use is visible in exactly one place.
PROJECT_ID = "gen-lang-client-0539303742"
LOCATION = "us-central1"  # Region passed to vertexai.init() below; change to your region
MODEL_ID = "gemini-1.5-pro-001"  # Replace with the correct model ID for Gemini
OUTPUT_FILE = "modified_data.json"

# System instruction sent with every request.
# NOTE(review): `""buzz""` is implicit concatenation of adjacent string literals, so the
# model receives the word buzz without quotation marks -- confirm that is intended.
# NOTE(review): this text describes content/comment/category fields, whereas the batch
# prompt built later in the notebook sends clean_title/6_way_label -- confirm the
# instruction matches the data actually being sent.
instr = "I will provide news articles data in JSON format, structured across multiple fields. Your task is to generate a subtly modified version that retains the misleading nature of the original article while employing different wording, structure, or details that could effectively evade detection by various common fake news detection models, including a pre-trained RoBERTa model. The modified text must remain coherent, plausible, and natural. Change the language to be moderately sensational and keep the details of the article vague. Ensure the tone is neutral, modifying the content as if it is coming from a popular and reliable source, while still incorporating phrases like ""buzz"" and its synonyms to enhance the sensational aspect. Additionally, ensure that the comments also make the article more difficult to detect by the pre-trained model. Each article will have the columns: content, comment, and category. Every modified article should not have any of these empty: content, comment and category. Do not change the category of the row. Please provide the modified version as a single, cohesive text for each row. If there are any names of people mentioned in the news, keep them as they are. The response should be in valid json. Do not give any intro or outro, or explanation etc. All the three are strings like regular text, they should not contain weird start and stop characters."

# Use the LOCATION constant rather than a second hard-coded region string.
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel(model_name=MODEL_ID, system_instruction=instr)
# response_mime_type asks the model for JSON; the batch loop still validates what comes back.
config = GenerationConfig(temperature=2, max_output_tokens=8192, top_p=0.95, response_mime_type="application/json")

In [None]:
pip install pyarrow

In [29]:
pip install json_repair

Collecting json_repair
  Downloading json_repair-0.30.2-py3-none-any.whl.metadata (11 kB)
Downloading json_repair-0.30.2-py3-none-any.whl (18 kB)
Installing collected packages: json_repair
Successfully installed json_repair-0.30.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import json
from json_repair import repair_json  # Ensure this library is installed
import time

# Read the tab-separated input into a DataFrame
file_path = "/Users/akhil/Downloads/final_test.tsv"  # Replace with your file path
data = pd.read_csv(file_path, sep="\t")

# Both columns are used by the batch loop, so fail fast if either one is absent
required_columns = ("clean_title", "6_way_label")
if not all(column in data.columns for column in required_columns):
    raise ValueError("The input file must contain 'clean_title' and '6_way_label' columns.")

# Run settings
batch_size = 84  # Rows sent per request; adjust as needed
OUTPUT_FILE = "modified_data.json"
FAILED_FILE = "failed_batches.json"

# Accumulators filled by the batch loop: rewritten rows and batches that could not be parsed
modified_data, failed_batches = [], []

def _sanitize_batch(original_rows, model_rows):
    """Pair each original row with the model's rewritten row and keep only the trusted fields.

    Args:
        original_rows: list of dicts for the rows sent in this batch
            (each has "clean_title" and "6_way_label").
        model_rows: the parsed JSON returned by the model for the same batch.

    Returns:
        A list with one dict per original row containing the model's "clean_title"
        (or the original title when the model omitted the key) and the ORIGINAL
        "6_way_label".

    Raises:
        ValueError: if the model output is not a list of objects with exactly one
            entry per input row. Pairing by position is only meaningful when the
            counts match; a plain zip() would silently drop or misalign rows.
    """
    if not isinstance(model_rows, list):
        raise ValueError(f"Expected a JSON array, got {type(model_rows).__name__}")
    if len(model_rows) != len(original_rows):
        raise ValueError(
            f"Expected {len(original_rows)} objects in the response, got {len(model_rows)}"
        )

    sanitized_objects = []
    for original, modified in zip(original_rows, model_rows):
        if not isinstance(modified, dict):
            raise ValueError(f"Expected a JSON object per row, got {type(modified).__name__}")
        sanitized_objects.append({
            "clean_title": modified.get("clean_title", original["clean_title"]),
            "6_way_label": original["6_way_label"],  # Preserve the original label
        })
    return sanitized_objects


# Batch processing loop
total_batches = (len(data) + batch_size - 1) // batch_size

for batch_idx, start in enumerate(range(0, len(data), batch_size)):
    batch_no = batch_idx + 1
    batch_df = data.iloc[start : start + batch_size]  # Extract batch
    original_rows = batch_df.to_dict(orient="records")
    print(f"Processing batch {batch_no}/{total_batches} (Rows {start} to {min(start + batch_size - 1, len(data) - 1)})...")

    # Convert the batch DataFrame to JSON
    batch_json = batch_df.to_json(orient="records")

    # Create prompt for the batch
    prompt = f"""
    Modify the title (clean_title) of the following articles while keeping the 6_way_label unchanged.
    Articles (JSON):
    {batch_json}
    """

    # Reset per batch so a failed request is never logged with the previous batch's response.
    modified_text = None

    try:
        # Generate content using the model
        response = model.generate_content([prompt], generation_config=config)
        modified_text = response.text

        # Ensure response is not empty
        if not modified_text.strip():
            raise ValueError(f"Empty response for batch {batch_no}")

        # Parse and validate the whole batch before extending, so a bad batch adds nothing.
        modified_data.extend(_sanitize_batch(original_rows, json.loads(modified_text)))
        print(f"Batch {batch_no} processed successfully!")

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in batch {batch_no}: {e}")

        # Attempt to repair the malformed JSON. The repair call sits inside the try so
        # that an error in the repair step fails this batch only, not the whole run.
        repaired_text = None
        try:
            repaired_text = repair_json(modified_text)
            if not repaired_text:
                raise ValueError("repair_json returned no usable output")
            modified_data.extend(_sanitize_batch(original_rows, json.loads(repaired_text)))
            print(f"Successfully repaired JSON for batch {batch_no}.")
        except Exception as validation_error:
            print(f"Validation failed for repaired JSON in batch {batch_no}: {validation_error}")
            failed_batches.append({"batch": batch_no, "response": repaired_text or modified_text, "prompt": prompt})
    except Exception as e:
        print(f"Unexpected error in batch {batch_no}: {e}")
        failed_batches.append({"batch": batch_no, "response": modified_text, "prompt": prompt})

    # Save progress periodically (after batches 1, 11, 21, ...)
    if batch_idx % 10 == 0:
        with open(OUTPUT_FILE, "w") as json_file:
            json.dump(modified_data, json_file, indent=4)
        print(f"Intermediate data saved to {OUTPUT_FILE}")

    # time.sleep(0.75)  # Respect API rate limits
# Save the final modified data to a JSON file
with open(OUTPUT_FILE, "w") as json_file:
    json.dump(modified_data, json_file, indent=4)
print(f"Modified data saved to {OUTPUT_FILE}")

# Save any failed batches for debugging (file is only written when there is something to report)
if failed_batches:
    with open(FAILED_FILE, "w") as failed_file:
        json.dump(failed_batches, failed_file, indent=4)
    print(f"Failed batches saved to {FAILED_FILE}")


Processing batch 1/427 (Rows 0 to 83)...
Batch 1 processed successfully!
Intermediate data saved to modified_data.json
Processing batch 2/427 (Rows 84 to 167)...
Batch 2 processed successfully!
Processing batch 3/427 (Rows 168 to 251)...
Batch 3 processed successfully!
Processing batch 4/427 (Rows 252 to 335)...
Batch 4 processed successfully!
Processing batch 5/427 (Rows 336 to 419)...
Batch 5 processed successfully!
Processing batch 6/427 (Rows 420 to 503)...
Batch 6 processed successfully!
Processing batch 7/427 (Rows 504 to 587)...
Batch 7 processed successfully!
Processing batch 8/427 (Rows 588 to 671)...
Batch 8 processed successfully!
Processing batch 9/427 (Rows 672 to 755)...
Batch 9 processed successfully!
Processing batch 10/427 (Rows 756 to 839)...
Batch 10 processed successfully!
Processing batch 11/427 (Rows 840 to 923)...
Batch 11 processed successfully!
Intermediate data saved to modified_data.json
Processing batch 12/427 (Rows 924 to 1007)...
Batch 12 processed success