# Verify that the Hugging Face dataset for ClimateQA is also not clean

In [None]:
import pandas as pd
import os
from datasets import load_dataset

In [None]:
# Full 2018 CDP climate-change export, resolved against the current working directory.
path = os.path.join(
    os.getcwd(),
    "data/non-dataset/climabench/all_data/CDP/Corporations/Corporations Responses/Climate Change/2018_Full_Climate_Change_Dataset.csv",
)
df = pd.read_csv(path)

# Create a new dataset

In [None]:
import pandas as pd
# Published QA splits, read in the order test, train, val and stacked into one frame
# (row indices of the individual files are kept as-is).
qa_split_paths = [
    os.path.join(os.getcwd(), "data/non-dataset/climabench/all_data/CDP/Corporations/Corporations Responses/Climate Change/test_qa.csv"),
    os.path.join(os.getcwd(), "data/non-dataset/climabench/all_data/CDP/Corporations/Corporations Responses/Climate Change/train_qa.csv"),
    os.path.join(os.getcwd(), "data/non-dataset/climabench/all_data/CDP/Corporations/Corporations Responses/Climate Change/val_qa.csv"),
]
completed_df = pd.concat([pd.read_csv(split_path) for split_path in qa_split_paths])

In [None]:
# Keep only the raw responses whose text also occurs as an answer in the published QA splits.
in_published_qa = df['response_value'].isin(completed_df['answer'])
filtered_df = df[in_published_qa]
# Collapse repeated (question, response) pairs, keeping the first occurrence of each.
filtered_df = filtered_df.drop_duplicates(subset=['question_unique_reference', 'response_value'], keep="first")

In [None]:
# Train, test, dev split

from sklearn.model_selection import train_test_split

seed = 42
selftrain_test_split = 0.2  # share of rows held out of the training split
selftest_dev_split = 0.5    # share of the held-out rows that goes to `test`

# First cut: train vs. held-out; second cut: held-out rows into dev and test.
split_options = {"random_state": seed, "shuffle": True}
train, temp = train_test_split(filtered_df, test_size=selftrain_test_split, **split_options)
dev, test = train_test_split(temp, test_size=selftest_dev_split, **split_options)

In [None]:
import random

# Function to generate false answers
def generate_false_answers(df, random_state=None, max_attempts=100):
    """Attach a randomly drawn wrong answer to every question/answer row.

    For each row another row of ``df`` is drawn uniformly at random and its
    ``response_value`` becomes the negative example, provided that value
    never occurs in ``df`` as an answer to the row's own question.

    Unlike a plain rejection loop, this cannot spin forever: after
    ``max_attempts`` unsuccessful draws every row is scanned for valid
    candidates, and a ``ValueError`` is raised if there are none.

    Args:
        df: DataFrame with ``question_unique_reference`` and
            ``response_value`` columns. It is modified in place: a
            ``false_response_value`` column is added.
            NOTE(review): values are used as dict keys / set members, so
            they are assumed to be hashable (e.g. strings) - confirm.
        random_state: Seed or ``numpy.random.Generator`` passed to
            ``numpy.random.default_rng``. ``None`` (the default) keeps the
            draw non-deterministic.
        max_attempts: Random draws tried per row before falling back to an
            exhaustive scan of all rows.

    Returns:
        The same ``df`` object, now carrying ``false_response_value``.

    Raises:
        ValueError: If a row has no valid wrong answer anywhere in ``df``.
    """
    # Local import so the function stays self-contained within the notebook.
    import numpy as np

    rng = np.random.default_rng(random_state)
    questions = df['question_unique_reference'].tolist()
    answers = df['response_value'].tolist()
    n_rows = len(answers)

    # answer -> every question it is a true answer to. A candidate that equals
    # the row's own answer, or that comes from a row with the same question,
    # necessarily has the row's question in this set, so a single membership
    # test covers all rejection conditions.
    questions_by_answer = {}
    for question, answer in zip(questions, answers):
        questions_by_answer.setdefault(answer, set()).add(question)

    false_answers = []
    for question in questions:
        chosen = None

        # Cheap path: rejection sampling over row positions.
        for _ in range(max_attempts):
            position = rng.integers(n_rows)
            if question not in questions_by_answer[answers[position]]:
                chosen = position
                break

        # Fallback: enumerate the valid rows and pick uniformly among them.
        if chosen is None:
            candidates = [
                position
                for position, candidate in enumerate(answers)
                if question not in questions_by_answer[candidate]
            ]
            if not candidates:
                raise ValueError(
                    f"No valid false answer exists for question {question!r}"
                )
            chosen = candidates[rng.integers(len(candidates))]

        false_answers.append(answers[chosen])

    df['false_response_value'] = false_answers
    return df

# Add the `false_response_value` column to each split, processed in the order train, test, dev.
train, test, dev = (generate_false_answers(split) for split in (train, test, dev))


In [None]:
# Persist (question, true answer, false answer) triplets per split.
# NOTE: the files are produced by `to_parquet`, so they hold Parquet data despite the `.pkl` suffix.
qa_columns = ['question_unique_reference', 'response_value', 'false_response_value']
for out_path, split in (
    ("data/Climabench/ClimaQA/CustomDataset/train.pkl", train),
    ("data/Climabench/ClimaQA/CustomDataset/test.pkl", test),
    ("data/Climabench/ClimaQA/CustomDataset/dev.pkl", dev),
):
    split[qa_columns].to_parquet(out_path)

In [None]:
def explode_df(df):
    """Turn question/true-answer/false-answer rows into labelled (query, text) pairs.

    Every input row yields two output rows: the true answer with ``label`` 1
    and the false answer with ``label`` 0. All positive rows come first,
    followed by all negative rows; the original index values are kept, so
    each index value appears twice in the result. ``df`` is not modified.
    """
    def _labelled_pairs(text_column, label):
        pairs = df[['question_unique_reference', text_column]].copy()
        pairs.columns = ["query", "text"]
        pairs['label'] = label
        return pairs

    positives = _labelled_pairs('response_value', 1)
    negatives = _labelled_pairs('false_response_value', 0)
    return pd.concat([positives, negatives])

# Write the labelled-pair version of each split.
# NOTE: these are the same paths the triplet files were written to in the previous cell,
# so those files are overwritten; the content is Parquet despite the `.pkl` suffix.
for out_path, split in (
    ("data/Climabench/ClimaQA/CustomDataset/train.pkl", train),
    ("data/Climabench/ClimaQA/CustomDataset/test.pkl", test),
    ("data/Climabench/ClimaQA/CustomDataset/dev.pkl", dev),
):
    explode_df(split).to_parquet(out_path)

# Create a new dataset for ClimaINS


In [None]:
from experiment import load_dataset
import pandas as pd

In [None]:
# Fetch the existing ClimaINS splits through the project's `experiment.load_dataset` helper
# and unpack them as train, test, dev.
climains_splits = load_dataset("ClimaINS")
train, test, dev = climains_splits

In [None]:
# Pool the three splits into one frame with a fresh 0..n-1 index so it can be cleaned and re-split.
splits_to_pool = [train, test, dev]
full_dataset = pd.concat(splits_to_pool, ignore_index=True)

In [None]:
# Row count of the pooled dataset before any cleaning.
full_dataset.shape[0]

In [None]:
# Remove rows that are identical in every column, keeping the first occurrence.
is_repeat = full_dataset.duplicated()
full_dataset = full_dataset[~is_repeat]
full_dataset.shape[0]

In [None]:
# Number of rows whose `token_counts` is below 5.
int((full_dataset['token_counts'] < 5).sum())

In [None]:
# Keep only rows with at least 5 tokens; copy so later edits do not touch a view.
has_enough_tokens = full_dataset['token_counts'] >= 5
full_dataset = full_dataset.loc[has_enough_tokens].copy()
full_dataset.shape[0]

In [None]:
# Same text with the same label: keep only the first occurrence.
same_text_and_label = full_dataset.duplicated(subset=['clean_text', 'label'])
full_dataset = full_dataset[~same_text_and_label]
full_dataset.shape[0]

In [None]:
# Any text still occurring more than once is dropped entirely (keep=False removes every copy).
# After the previous step such repeats can only differ in label.
text_occurs_repeatedly = full_dataset.duplicated(subset=['clean_text'], keep=False)
full_dataset = full_dataset[~text_occurs_repeatedly]
full_dataset.shape[0]

In [None]:
from sklearn.model_selection import train_test_split
train_test_split_frac = 0.8
test_dev_split_frac = 0.5
seed = 42

# The first call sizes its second output at 80% and binds it to `train`;
# the remaining 20% (`temp`) is then split into dev and test.
split_options = {"random_state": seed, "shuffle": True}
temp, train = train_test_split(full_dataset, test_size=train_test_split_frac, **split_options)
dev, test = train_test_split(temp, test_size=test_dev_split_frac, **split_options)

In [None]:
# Persist the cleaned ClimaINS splits (Parquet content, `.pkl` suffix).
for out_path, split in (
    ('data/green_nlp_tasks/ClimaINS/train.pkl', train),
    ('data/green_nlp_tasks/ClimaINS/test.pkl', test),
    ('data/green_nlp_tasks/ClimaINS/dev.pkl', dev),
):
    split.to_parquet(out_path)

## New dataset created from scratch

In [None]:
import pandas as pd
import os
from src.builder import clean_text

# Build one long table of survey answers from the raw ClimateInsurance CSV files.
# One frame is collected per file and everything is concatenated once at the end,
# instead of growing `raw_data` with pd.concat on every iteration.
raw_dir = "data/non-dataset/climabench/all_data/ClimateInsurance/raw/"
answer_frames = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for file in sorted(os.listdir(raw_dir)):

    df = pd.read_csv(f"{raw_dir}{file}", encoding="latin-1")

    # Use the first "Question <i>" column present (i = 0..8); its number becomes the label.
    question_number = next((i for i in range(0, 9) if f"Question {i}" in df.columns), None)
    if question_number is None:
        # Without this guard `answers` would still hold the previous file's rows and be
        # appended a second time (or raise NameError if this were the first file).
        print(f"Skipping {file}: no 'Question <i>' column found")
        continue

    answers = df[[f"Question {question_number}", "Year", "Company Name"]].copy()
    answers.rename(columns={f"Question {question_number}": "text"}, inplace=True)
    answers['label'] = question_number
    answer_frames.append(answers)

if answer_frames:
    raw_data = pd.concat(answer_frames, ignore_index=True)
else:
    # No usable files: keep the expected columns so the steps below still run.
    raw_data = pd.DataFrame(columns=["text", "Year", "Company Name", "label"])

# Most recent years first, then normalise the answer text with the project's cleaner.
raw_data.sort_values(by="Year", ascending=False, inplace=True)
raw_data['text'] = raw_data['text'].apply(clean_text)

In [None]:
# Rows repeating an earlier row's text, company and label (so at most the year differs) ...
repeated_across_years = raw_data.duplicated(subset=['text', 'Company Name', 'label'], keep="first")
# ... and rows repeating an earlier row's text, label and year (so at most the company differs).
repeated_across_companies = raw_data.duplicated(subset=['text', 'label', 'Year'], keep="first")

print("raw dataset", raw_data.shape[0])
print("duplicates by year", int(repeated_across_years.sum()))
print("duplicates by company name", int(repeated_across_companies.sum()))

In [None]:
# Discard every answer text that occurs more than once (all copies are removed, not just the extras).
text_is_repeated = raw_data.duplicated(subset=["text"], keep=False)
raw_data_1 = raw_data[~text_is_repeated].copy()

In [None]:
import pandas as pd
import re

import pandas as pd
import os
from src.builder import clean_text

# Build one long table of survey answers from the raw ClimateInsurance CSV files.
# One frame is collected per file and everything is concatenated once at the end,
# instead of growing `raw_data` with pd.concat on every iteration.
raw_dir = "data/non-dataset/climabench/all_data/ClimateInsurance/raw/"
answer_frames = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for file in sorted(os.listdir(raw_dir)):

    df = pd.read_csv(f"{raw_dir}{file}", encoding="latin-1")

    # Use the first "Question <i>" column present (i = 0..8); its number becomes the label.
    question_number = next((i for i in range(0, 9) if f"Question {i}" in df.columns), None)
    if question_number is None:
        # Without this guard `answers` would still hold the previous file's rows and be
        # appended a second time (or raise NameError if this were the first file).
        print(f"Skipping {file}: no 'Question <i>' column found")
        continue

    answers = df[[f"Question {question_number}", "Year", "Company Name"]].copy()
    answers.rename(columns={f"Question {question_number}": "text"}, inplace=True)
    answers['label'] = question_number
    answer_frames.append(answers)

if answer_frames:
    raw_data = pd.concat(answer_frames, ignore_index=True)
else:
    # No usable files: keep the expected columns so the steps below still run.
    raw_data = pd.DataFrame(columns=["text", "Year", "Company Name", "label"])

# Most recent years first, then normalise the answer text with the project's cleaner.
raw_data.sort_values(by="Year", ascending=False, inplace=True)
raw_data['text'] = raw_data['text'].apply(clean_text)

# Discard every answer text that occurs more than once (keep=False removes all copies).
raw_data_1 = raw_data.drop_duplicates(subset=["text"], keep=False).copy()

def remove_company_name(row):
    """Return ``row['text']`` with every occurrence of ``row['Company Name']`` removed.

    The name is matched literally (regex metacharacters are escaped) and
    case-insensitively. Leading and trailing whitespace is stripped from the
    result; whitespace left inside the text by a removal is not collapsed.
    """
    literal_name = re.escape(row['Company Name'])
    without_name = re.sub(literal_name, '', row['text'], flags=re.IGNORECASE)
    return without_name.strip()

# Apply the function row by row (axis=1) to create a new column holding the text
# with the row's own company name stripped out
raw_data_1['cleaned_text'] = raw_data_1.apply(remove_company_name, axis=1)

import string

def normalize_text(text):
    """Return ``text`` lower-cased, without ASCII punctuation, with whitespace collapsed.

    Every character in ``string.punctuation`` is deleted, the result is
    lower-cased, and any run of whitespace becomes a single space (leading
    and trailing whitespace disappears).
    """
    # Mapping each punctuation character to None makes str.translate delete it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.translate(drop_punctuation).lower()
    words = lowered.split()
    return ' '.join(words)

# Normalise the company-stripped text, then collect every row whose normalised text
# is shared with at least one other row (keep=False flags all members of a group).
raw_data_1['cleaned_text'] = raw_data_1['cleaned_text'].apply(normalize_text)
shares_cleaned_text = raw_data_1.duplicated(subset='cleaned_text', keep=False)
duplicates = raw_data_1[shares_cleaned_text]

# Show the columns needed to compare the matching rows side by side
duplicates[['cleaned_text', 'text', 'Year', 'Company Name']]

In [None]:
from Levenshtein import distance
from itertools import combinations
import pandas as pd

# One record per pair of answers from the same company, with the edit distance between them
results = []

for company in raw_data_1['Company Name'].unique():
    # Rows belonging to this company only
    company_data = raw_data_1[raw_data_1['Company Name'] == company]

    # (index, text, label, year) for every row of the company
    records = zip(
        company_data.index.tolist(),
        company_data['text'].tolist(),
        company_data['label'].tolist(),
        company_data['Year'].tolist(),
    )

    # Every unordered pair of the company's rows is compared exactly once
    for first, second in combinations(records, 2):
        idx1, text1, label1, year1 = first
        idx2, text2, label2, year2 = second
        results.append({
            'Company Name': company,
            'Index1': idx1,
            'Index2': idx2,
            'Label1': label1,
            'Label2': label2,
            'Year1': year1,
            'Year2': year2,
            'Text1': text1,
            'Text2': text2,
            # Levenshtein (edit) distance between the two texts
            'Distance': distance(text1, text2),
        })

# Tabulate the pairs with the smallest distances (closest matches) first
results_df = pd.DataFrame(results).sort_values(by='Distance')

# Show the closest matches
print(results_df.head())


# Create a dataset for ClimaTOPIC


In [None]:
from experiment import load_dataset
import pandas as pd
# Fetch the existing ClimaTOPIC splits through the project's `experiment.load_dataset` helper
# and unpack them as train, test, dev.
climatopic_splits = load_dataset("ClimaTOPIC")
train, test, dev = climatopic_splits


In [None]:
# Rows whose `clean_text` occurs in both train and test; other columns get _train/_test suffixes.
pd.merge(train, test, how="inner", on="clean_text", suffixes=("_train", "_test"))

In [None]:
# Pool the three ClimaTOPIC splits into one frame with a fresh 0..n-1 index.
splits_to_pool = [train, test, dev]
full_dataset = pd.concat(splits_to_pool, ignore_index=True)

# Create Multilabel dataset

In [None]:
from src.builder import DatasetBuilder

# NOTE(review): `generator` is not defined or imported anywhere in the visible notebook -
# presumably it comes from an earlier kernel session; confirm before running top to bottom.
logger, args = generator.logger, generator.args

# Project helper used below to build the query-style dataset splits.
builder = DatasetBuilder()

In [None]:
# Build the lobbymap query splits with the project builder and unpack them as train, test, dev.
lobbymap_splits = builder.lobbymap_query()
train, test, dev = lobbymap_splits

In [None]:
# Persist the lobbymap query splits (Parquet content, `.pkl` suffix).
for out_path, split in (
    ("data/lobbymap/lobbymap_dataset/train_query.pkl", train),
    ("data/lobbymap/lobbymap_dataset/test_query.pkl", test),
    ("data/lobbymap/lobbymap_dataset/dev_query.pkl", dev),
):
    split.to_parquet(out_path)

In [None]:
# Load the three logicclimate CSV splits in the order train, test, dev.
train, test, dev = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/logicclimate/climate_train.csv",
        "data/logicclimate/climate_test.csv",
        "data/logicclimate/climate_dev.csv",
    )
)


In [None]:
# Rows without a `source_article` cannot be grouped below, so drop them from every split.
train, test, dev = (
    split.dropna(subset=['source_article'])
    for split in (train, test, dev)
)

In [None]:
def _fallacies_per_article(split):
    """Return one row per `source_article` with its `logical_fallacies` gathered into a list."""
    per_article = split.groupby("source_article")['logical_fallacies']
    return per_article.agg(lambda labels: list(labels)).reset_index()


# Multi-label view of each split: every article carries the list of all its fallacy labels.
train, test, dev = (_fallacies_per_article(split) for split in (train, test, dev))

In [None]:
# Save the multi-label splits as CSV (the row index is written as the first, unnamed column).
for out_path, split in (
    ("data/logicclimate/multi_train.csv", train),
    ("data/logicclimate/multi_test.csv", test),
    ("data/logicclimate/multi_dev.csv", dev),
):
    split.to_csv(out_path)