In [1]:
from sentence_transformers import SentenceTransformer
import torch

import os
import json
import pandas as pd
import numpy as np
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def build_dataset(data_dir):
    """Collect article texts and labels from a nested directory of JSON files.

    The directory layout that is walked is
    ``data_dir/<source>/<real|fake>/<item>/<name>.json``. Every ``.json``
    file found at that depth contributes exactly one row.

    Args:
        data_dir: Root directory containing one sub-folder per source.

    Returns:
        pandas.DataFrame with the columns ``text`` (value of the JSON
        ``"text"`` key, or ``''`` when the key is missing or null),
        ``label`` (1 for files under ``real``, 0 for files under ``fake``)
        and ``source`` (name of the first-level folder). The three columns
        are present even when no JSON file is found, so callers can always
        index ``df['text']``.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a file cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    columns = ['text', 'label', 'source']
    records = []

    # os.listdir returns entries in arbitrary order. Sorting at every level
    # keeps the row order stable across machines and runs, which matters
    # because the seeded sampling later in the notebook depends on row order.
    for source_folder in sorted(os.listdir(data_dir)):
        source_folder_path = os.path.join(data_dir, source_folder)

        # Only the two label folders are considered; anything else is ignored.
        for label_folder in ['real', 'fake']:
            label_folder_path = os.path.join(source_folder_path, label_folder)

            # Missing label folder (or a plain file with that name): nothing to read.
            if not os.path.isdir(label_folder_path):
                continue

            # 1 for real, 0 for fake
            label = 1 if label_folder == 'real' else 0

            for some_folder in sorted(os.listdir(label_folder_path)):
                some_folder_path = os.path.join(label_folder_path, some_folder)

                # Stray files next to the item folders are skipped.
                if not os.path.isdir(some_folder_path):
                    continue

                for json_file_name in sorted(os.listdir(some_folder_path)):
                    if not json_file_name.endswith('.json'):
                        continue

                    json_file_path = os.path.join(some_folder_path, json_file_name)
                    with open(json_file_path, 'r', encoding='utf-8') as json_file:
                        data = json.load(json_file)

                    # A missing key and an explicit JSON null both become '',
                    # so the column only ever holds values that were present.
                    text = data.get('text')
                    if text is None:
                        text = ''

                    records.append({'text': text, 'label': label, 'source': source_folder})

    # Passing the column list guarantees the schema for an empty result.
    return pd.DataFrame(records, columns=columns)

In [3]:
# Instantiate the "all-mpnet-base-v2" sentence-transformers model once; the
# resulting `model` is reused further down to embed the article texts.
# NOTE(review): presumably fetches the weights on first use — confirm for offline runs.
model = SentenceTransformer("all-mpnet-base-v2")

In [7]:
# Location of the dataset tree that build_dataset() walks.
data_dir = "/home/ubuntu/FakeNewsNet/code/fakenewsnet_dataset"

# One row per JSON article: text, label and source.
df = build_dataset(data_dir)

# Embed every article text in a single encode() call, asking for a tensor back.
article_texts = df['text'].tolist()
embeddings = model.encode(article_texts, convert_to_tensor=True)

# Attach the embeddings as a new 'embedding' column, one nested-list entry per row.
df = df.assign(embedding=embeddings.tolist())

In [9]:
# Class balance of the full, unfiltered dataset (1 = real, 0 = fake).
df['label'].value_counts()

label
1    11433
0     5239
Name: count, dtype: int64

In [12]:

def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Args:
        text: The string to measure. Any non-string value (for example
            ``None`` or ``NaN`` coming from a missing text field) is treated
            as having zero words instead of raising ``AttributeError``.

    Returns:
        int: Token count; ``0`` for empty, whitespace-only or non-string input.
    """
    if not isinstance(text, str):
        return 0
    return len(text.split())

# Keep only articles whose length lies in [MIN_WORDS, MAX_WORDS] words (inclusive).
MIN_WORDS = 100
MAX_WORDS = 700

# Count the words once and reuse the result for both bounds.
word_counts = df['text'].apply(word_count)
df_filtered = df[word_counts >= MIN_WORDS]
df_filtered2 = df[(word_counts >= MIN_WORDS) & (word_counts <= MAX_WORDS)]

# Separate the dataframe by label
df_label_1 = df_filtered2[df_filtered2['label'] == 1]
df_label_0 = df_filtered2[df_filtered2['label'] == 0]

# Size of the smaller class, whichever label that happens to be. Taking the
# minimum avoids a ValueError from sample() when label 1 is the smaller class.
minority_count = min(len(df_label_0), len(df_label_1))

# Downsample label 1 to the minority size (fixed seed for reproducibility).
df_label_1_downsampled = df_label_1.sample(n=minority_count, random_state=123)

# Label 0 is only sampled when it is the larger class; otherwise it is kept
# untouched, so the result is unchanged whenever label 0 is the minority.
if len(df_label_0) > minority_count:
    df_label_0_downsampled = df_label_0.sample(n=minority_count, random_state=123)
else:
    df_label_0_downsampled = df_label_0

# Combine both classes, now of equal size
df_balanced = pd.concat([df_label_1_downsampled, df_label_0_downsampled])

# Shuffle the resulting dataframe
df_balanced = df_balanced.sample(frac=1, random_state=123).reset_index(drop=True)

# Check the label distribution in the new dataframe
print(df_balanced['label'].value_counts())

label
0    3480
1    3480
Name: count, dtype: int64


In [13]:
# Destination of the balanced dataset.
# NOTE: despite its name, `pkl_path` points to a JSON file, not a pickle; the
# variable name is kept so any code that already refers to it keeps working.
pkl_path = '/home/ubuntu/fedatk_unl_tj/data/fakenewsnet/raw_data/data.json'

# One dict per row of the balanced frame; the file is written as a single
# JSON array of these records (not JSON Lines).
df_dict = df_balanced.to_dict(orient='records')

# Create the target folder if needed so the write below does not fail with
# FileNotFoundError on a fresh machine.
os.makedirs(os.path.dirname(pkl_path), exist_ok=True)

# Explicit encoding keeps the output independent of the platform default.
with open(pkl_path, 'w', encoding='utf-8') as json_file:
    json.dump(df_dict, json_file, indent=4)