# Creates the fake_news_llama_3_8b_instruct_processed.csv
###### The generated "fake_news_llama_3_8b_instruct.csv" file has the title, subtitle, and the actual fake news all under the same column, "fake_news". The goal of this script is to split that column into separate columns. For the purpose of this project, models will be trained using only the columns "news" and "class".

In [3]:
import pandas as pd
import re

# Read the raw generated dataset, decoding the CSV as UTF-8
file_path = './fake_news_llama_3_8b_instruct.csv'
df = pd.read_csv(file_path, encoding='utf-8')


def _lowercase_text_column(column):
    """Return the column lowercased when its dtype is object, else unchanged."""
    if column.dtype == "object":
        return column.str.lower()
    return column


# Normalise casing: every object-dtype column is lowercased, others pass through
df = df.apply(_lowercase_text_column)

# Extract one labelled section (title, subtitle or content) from a generated text
def extract_section(text, section, section_en):
    """Return the text of one labelled section of a generated news item.

    The section is looked up, case-insensitively, in this order:

    1. bold Portuguese label, e.g. ``**título:** ...``
    2. bold English label, e.g. ``**title:** ...``
    3. plain Portuguese label, e.g. ``título: ...``
    4. plain English label, e.g. ``title: ...``

    A bold section runs until the next ``**`` or the end of the text. A plain
    section runs until the next ``**``, the next known ``label:`` or the end
    of the text. The first pattern that matches wins.

    When the content section is requested but carries no label of its own,
    everything after the line holding the last title/subtitle label is
    returned instead.

    Args:
        text: The raw generated text. Non-string values (such as the NaN that
            pandas produces for an empty cell) yield an empty string.
        section: Portuguese label of the section, without the colon.
        section_en: English label of the section, without the colon.

    Returns:
        The stripped section text, or ``""`` when nothing could be extracted.
    """
    # Empty CSV cells arrive as NaN (a float); re.search would raise on them.
    if not isinstance(text, str):
        return ""

    # Escape the labels so they are always matched literally.
    label_pt = re.escape(section)
    label_en = re.escape(section_en)

    # Stop markers must be real labels ("word:"), not bare words, otherwise a
    # body that merely mentions e.g. "conteúdo" or "title" would be cut short.
    stop_pt = r"\*\*|\b(?:título|subtítulo|conteúdo):|$"
    stop_en = r"\*\*|\b(?:title|subtitle|content):|$"

    # The leading \b on the plain patterns keeps "título:" / "title:" from
    # matching inside "subtítulo:" / "subtitle:".
    patterns = [
        rf"\*\*{label_pt}:\*\*(.*?)(?=\*\*|$)",
        rf"\*\*{label_en}:\*\*(.*?)(?=\*\*|$)",
        rf"\b{label_pt}:(.*?)(?:{stop_pt})",
        rf"\b{label_en}:(.*?)(?:{stop_en})",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # Only the content section has a fallback for a missing label.
    if section.lower() != "conteúdo" and section_en.lower() != "content":
        return ""

    # Labels that can precede an unlabelled body.
    start_keywords = [
        "**título:**", "**subtítulo:**", "título:", "subtítulo:",
        "**title:**", "**subtitle:**", "title:", "subtitle:",
    ]
    label_ends = [
        found.end()
        for keyword in start_keywords
        for found in re.finditer(re.escape(keyword), text, re.IGNORECASE)
    ]
    if not label_ends:
        return ""

    # The body starts after the line that holds the last title/subtitle label.
    content_start = max(label_ends)
    next_newline = text.find('\n', content_start)
    if next_newline == -1:
        # No further line: whatever follows the last label is the body.
        return text[content_start:].strip()
    return text[next_newline:].strip()

# Split the combined "fake_news" text into one column per section.
# Each entry maps the output column to its (Portuguese, English) labels.
section_labels = {
    'title': ('título', 'title'),
    'subtitle': ('subtítulo', 'subtitle'),
    'news': ('conteúdo', 'content'),
}
for column_name, labels in section_labels.items():
    df[column_name] = df['fake_news'].apply(extract_section, args=labels)

# Every row gets class 0
df['class'] = 0

# Write the processed table to a new CSV, leaving out the index column
output_path = './fake_news_llama_3_8b_instruct_processed.csv'
df.to_csv(output_path, index=False)

print("Processing complete. Processed file saved to:", output_path)

Processing complete. Processed file saved to: ./fake_news_llama_3_8b_instruct_processed.csv
