# Generate Large Datasets:

In [None]:
from utils import Generator
import os
# Project helper from utils; used below to load each dataset and to down-sample test splits.
generator = Generator()

def clean_columns(df):
    """Return ``df`` without the ``__index_level_0__`` column.

    The frame is handed back untouched when the column is not present;
    otherwise a new frame without it is returned and ``df`` itself is left
    unmodified.
    """
    unwanted = ('__index_level_0__',)
    present = [name for name in unwanted if name in df.columns]
    if not present:
        return df
    return df.drop(columns=present)

In [None]:
# Number of rows requested from weighted_random_sampling when a test split is down-sampled.
n_samples = 1000

In [None]:
# Export (a sample of) the test split of every listed dataset to
# data/llm_green_nlp_tasks. Commented-out names are skipped by this loop.
for dataset_name in ['netzero_reduction',
 'climate_specificity',
 'climate_sentiment',
 'climate_commitments_actions',
 'climate_detection',
 'climate_tcfd_recommendations',
 'climatext',
 'environmental_claims',
 'ClimaTOPIC',
 'climateFEVER_claim',
 'climateBUG_data',
#  'lobbymap_pages',
 'sustainable_signals_review',
 'esgbert_e',
 'esgbert_s',
 'esgbert_g',
 'esgbert_action500',
 'esgbert_category_water',
 'esgbert_category_forest',
 'esgbert_category_biodiversity',
 'esgbert_category_nature',
 'sciDCC',
 'green_claims',
 'green_claims_3',
 'contrarian_claims',
 'climateStance',
 'climateEng',
#  'ClimaINS',
 'ClimaINS_ours',
 'gw_stance_detection',
#  'lobbymap_stance',
 'climateFEVER_evidence',
#  'climateFEVER_evidence_climabench',
 'climaQA',
#  'lobbymap_query',
 'logicClimate']:
    print(dataset_name)
    # Only the test split is exported; the other returned values are unused here.
    train, test, dev, _ = generator.load_dataset(dataset_name=dataset_name)

    if "clean_text" in test.columns:
        # Keep the cleaned text and expose it under the 'text' column name.
        test = test[['clean_text', 'label']].copy()
        test.rename(columns={'clean_text':'text'}, inplace=True)

    # Down-sample only when the split holds more rows than requested. The
    # threshold follows n_samples so both stay in sync when it is changed.
    if len(test) <= n_samples:
        subtest = test.copy()
    else:
        subtest = generator.dataset_builder.weighted_random_sampling(data=test, label_column=generator.args[dataset_name]['label_columns'], n_samples=n_samples)

    subtest = clean_columns(subtest)

    # NOTE: the file is written in Parquet format despite its ".pkl" extension.
    subtest.to_parquet(os.path.join(os.getcwd(), "data", "llm_green_nlp_tasks", f"{dataset_name}.pkl"))

In [None]:
import json

def read_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON value per line.

    Returns:
        A list with one decoded object per valid line, in file order.
        Blank lines are skipped; lines that are not valid JSON are reported
        on stdout (with file and line number) and left out of the result.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # A blank line (e.g. a trailing newline) carries no record.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON ({file_path}, line {line_number}): {e}")
    return data

import pandas as pd
from ast import literal_eval

def format_page_dataset(json_data):
    """Flatten documents into one row per sentence with an evidence label.

    A sentence is labelled 1 when its ``sentence_id`` is listed in the
    ``sentence_ids`` of the first element of any entry of
    ``document['meta']['evidences']``, and 0 otherwise.

    Args:
        json_data: Iterable of document dicts providing ``document_id``,
            ``meta['evidences']`` and ``sentences`` (dicts with ``text``,
            ``sentence_id`` and ``page_idx``).

    Returns:
        pandas.DataFrame with the columns ``sentences``, ``label``,
        ``sentence_id``, ``document_id`` and ``page_id``, one row per
        sentence in input order.
    """
    sentences = []
    label = []
    sentences_id = []
    document_id = []
    page_id = []

    for document in json_data:
        # A set keeps the per-sentence membership test constant-time.
        evidence_sentence_ids = set()
        for evidence in document['meta']['evidences']:
            # Only the first element of each evidence entry is read.
            evidence_sentence_ids.update(evidence[0]['sentence_ids'])

        for sentence in document["sentences"]:
            sentences.append(sentence['text'])
            label.append(int(sentence['sentence_id'] in evidence_sentence_ids))
            sentences_id.append(sentence['sentence_id'])
            document_id.append(document['document_id'])
            page_id.append(sentence['page_idx'])

    return pd.DataFrame({
        'sentences': sentences,
        'label': label,
        'sentence_id': sentences_id,
        'document_id': document_id,
        'page_id': page_id
    })
        

import os

# Folder holding the LobbyMap JSONL splits. Built with os.path.join so the
# path resolves on every platform, not only with Windows separators.
folder_path = os.path.join("data", "lobbymap", "lobbymap_dataset")

# train
file_path = os.path.join(folder_path, "train.jsonl")
jsonl_train = read_jsonl(file_path)
df_train = format_page_dataset(jsonl_train)

# test
file_path = os.path.join(folder_path, "test.jsonl")
jsonl_test = read_jsonl(file_path)
df_test = format_page_dataset(jsonl_test)

# dev (read from valid.jsonl)
file_path = os.path.join(folder_path, "valid.jsonl")
jsonl_dev = read_jsonl(file_path)
df_dev = format_page_dataset(jsonl_dev)


def format_query_dataset(json_data):
    """Build one row per (document, page, query, stance) from the evidences.

    Every entry of ``document['evidences']`` contributes its ``query`` and
    ``stance`` once for each page listed in its ``page_indices``.

    Args:
        json_data: Iterable of document dicts providing ``document_id`` and
            ``evidences`` (dicts with ``page_indices``, ``query`` and
            ``stance``).

    Returns:
        pandas.DataFrame with the columns ``page_id``, ``query``, ``stance``
        and ``document_id``. The index is the one left by ``explode`` for
        each document and is therefore not unique. For empty input an empty
        frame with these four columns is returned.
    """
    columns = ['page_id', 'query', 'stance', 'document_id']
    frames = []

    for document in json_data:
        evidences = document['evidences']
        frame = pd.DataFrame({
            'page_id': [evidence['page_indices'] for evidence in evidences],
            "query": [evidence['query'] for evidence in evidences],
            "stance": [evidence['stance'] for evidence in evidences],
        })
        frame['document_id'] = document['document_id']
        # One row per page referenced by the evidence.
        frames.append(frame.explode('page_id'))

    if not frames:
        # pd.concat rejects an empty list; keep the expected columns so a
        # later merge on document_id/page_id still works.
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)


def _pages_with_queries(sentence_df, query_df):
    """Join sentences into page texts and attach the page's queries.

    ``sentence_df`` is sorted in place by ``sentence_id``; its sentences are
    then concatenated (space-separated) per (document_id, page_id) and the
    result is left-merged with ``query_df`` on those two keys.
    """
    sentence_df.sort_values(by=['sentence_id'], inplace=True)
    pages = (
        sentence_df
        .groupby(['document_id', 'page_id'])['sentences']
        .apply(' '.join)
        .reset_index()
    )
    return pages.merge(query_df, how="left", on=["document_id", "page_id"])


# Query/stance rows for each split.
df_query_train = format_query_dataset(jsonl_train)
df_query_test = format_query_dataset(jsonl_test)
df_query_dev = format_query_dataset(jsonl_dev)

# Page-level frames: one row per page and attached query (pages without a
# query keep missing values in the query columns because of the left merge).
page_train = _pages_with_queries(df_train, df_query_train)
page_test = _pages_with_queries(df_test, df_query_test)
page_dev = _pages_with_queries(df_dev, df_query_dev)

In [None]:
from src.builder import clean_text

# Derive the cleaned 'text' column from the joined page sentences of every split.
for _pages in (page_test, page_train, page_dev):
    _pages['text'] = _pages['sentences'].apply(clean_text)

In [None]:
# NOTE: this repeats, for train and dev only, the same 'text' computation
# as the cell that imports clean_text.
for _split in (page_train, page_dev):
    _split['text'] = _split['sentences'].apply(clean_text)

In [None]:
import os

# Persist the page-level splits. Paths are built with os.path.join so they
# resolve on every platform, not only with Windows separators.
# NOTE: the files are Parquet despite their ".pkl" extension.
_lobbymap_dir = os.path.join("data", "green_nlp_tasks", "lobbymap")
page_test.to_parquet(os.path.join(_lobbymap_dir, "test.pkl"))
page_train.to_parquet(os.path.join(_lobbymap_dir, "train.pkl"))
page_dev.to_parquet(os.path.join(_lobbymap_dir, "dev.pkl"))

In [None]:
import os
import pandas as pd

# Reload the page-level test split (a Parquet file despite the ".pkl" suffix).
# os.path.join keeps the path valid on every platform.
page_test = pd.read_parquet(os.path.join("data", "green_nlp_tasks", "lobbymap", "test.pkl"))

In [None]:
# Display the pages that have no query attached.
page_test.loc[page_test['query'].isna()]

In [None]:
# Pick at most n_document_id documents per distinct query value (pages
# without a query form their own group) and collect their ids.
n_document_id = 10
subset_document_id = []
for q in page_test['query'].unique():
    # pd.isna covers both None and NaN; a plain truthiness test misses NaN
    # (which is truthy) and would wrongly catch an empty-string query.
    if pd.isna(q):
        _rows = page_test[page_test['query'].isna()]
    else:
        _rows = page_test[page_test['query'] == q]

    _doc_id = _rows['document_id'].drop_duplicates(keep="first")
    # Compare the number of distinct documents (not rows) with the target:
    # sampling without replacement raises when fewer documents are available.
    if len(_doc_id) > n_document_id:
        _doc_id = _doc_id.sample(n_document_id, replace=False, random_state=42)
    subset_document_id.extend(_doc_id.unique())

In [None]:
import os

# Export the LobbyMap tasks restricted to the sampled documents.
# Paths are built with os.path.join so they resolve on every platform.
# NOTE: the files are Parquet despite their ".pkl" extension.
_llm_dir = os.path.join("data", "llm_green_nlp_tasks")

p_test = page_test[page_test['document_id'].isin(subset_document_id)].copy()
s_test = p_test[['document_id', 'page_id', 'text', "query", "stance"]].copy()
p_test = p_test[['document_id', 'page_id', 'text', "query"]].copy()
q_test = p_test[['document_id', 'page_id', 'text', "query"]].copy()

# Page task: a page is positive when a query is attached to it.
# NOTE(review): a page with several queries appears once per query here — confirm this is intended.
p_test['label'] = p_test['query'].notna()

# The *_origin frames keep the rows without a query; the others drop them.
q_test_origin = q_test.copy()
q_test = q_test[q_test['query'].notna()]

s_test_origin = s_test.copy()
s_test = s_test[s_test['query'].notna()]

p_test[['document_id', 'page_id', 'text', 'label']].to_parquet(os.path.join(_llm_dir, "lobbymap_pages.pkl"))

# Query task: gather all queries of a page into one list.
q_test = q_test.groupby(['document_id', 'page_id', 'text'])['query'].apply(list).reset_index()
q_test[['document_id', 'page_id', 'text', 'query']].to_parquet(os.path.join(_llm_dir, "lobbymap_query.pkl"))
s_test[['document_id', 'page_id', 'text', 'query', 'stance']].to_parquet(os.path.join(_llm_dir, "lobbymap_stance.pkl"))

q_test_origin = q_test_origin.groupby(['document_id', 'page_id', 'text'])['query'].apply(list).reset_index()
q_test_origin[['document_id', 'page_id', 'text', 'query']].to_parquet(os.path.join(_llm_dir, "lobbymap_query_origin.pkl"))
s_test_origin[['document_id', 'page_id', 'text', 'query', 'stance']].to_parquet(os.path.join(_llm_dir, "lobbymap_stance_origin.pkl"))

In [None]:
import os

# Export the LobbyMap tasks built from every row of page_test.
# Paths are built with os.path.join so they resolve on every platform.
# NOTE: the files are Parquet despite their ".pkl" extension.
_tasks_dir = os.path.join("data", "green_nlp_tasks")
_lobbymap_dir = os.path.join(_tasks_dir, "lobbymap")

p_test = page_test.copy()
s_test = p_test[['document_id', 'page_id', 'text', "query", "stance"]].copy()
p_test = p_test[['document_id', 'page_id', 'text', "query"]].copy()
q_test = p_test[['document_id', 'page_id', 'text', "query"]].copy()

# Page task: a page is positive when a query is attached to it.
p_test['label'] = p_test['query'].notna()

# The *_origin frames keep the rows without a query; the others drop them.
q_test_origin = q_test.copy()
q_test = q_test[q_test['query'].notna()]

s_test_origin = s_test.copy()
s_test = s_test[s_test['query'].notna()]

# NOTE(review): this file goes to data/green_nlp_tasks while the other
# outputs of this cell go to its lobbymap sub-folder — confirm the location.
p_test[['document_id', 'page_id', 'text', 'label']].to_parquet(os.path.join(_tasks_dir, "lobbymap_pages.pkl"))

# Query task: gather all queries of a page into one list.
q_test = q_test.groupby(['document_id', 'page_id', 'text'])['query'].apply(list).reset_index()
q_test[['document_id', 'page_id', 'text', 'query']].to_parquet(os.path.join(_lobbymap_dir, "lobbymap_query.pkl"))
s_test[['document_id', 'page_id', 'text', 'query', 'stance']].to_parquet(os.path.join(_lobbymap_dir, "lobbymap_stance.pkl"))

q_test_origin = q_test_origin.groupby(['document_id', 'page_id', 'text'])['query'].apply(list).reset_index()
q_test_origin[['document_id', 'page_id', 'text', 'query']].to_parquet(os.path.join(_lobbymap_dir, "lobbymap_query_origin.pkl"))
s_test_origin[['document_id', 'page_id', 'text', 'query', 'stance']].to_parquet(os.path.join(_lobbymap_dir, "lobbymap_stance_origin.pkl"))