In [1]:
import pandas as pd
import csv

# Directory holding the four translated input CSVs.
SOURCE_DIR = '/nas.dbms/fathan/test/multilang-hate-models/binary-hatespeech/badword-on-hatespeech-en/id_translated'

# Base names (no extension) of the input files; each output is "<name>_formatted.csv".
DATASET_NAMES = [
    'negative_hatespeech_with_sexual_words_id_translated_downscalled',
    'negative_hatespeech_without_sexual_words_id_translated_downscalled',
    'positive_hatespeech_with_sexual_words_id_translated_downscalled',
    'positive_hatespeech_without_sexual_words_id_translated_downscalled',
]

# Rows whose text begins with this prefix are dropped.
# NOTE(review): presumably leftover boilerplate from the translation step -- confirm.
UNWANTED_PREFIX = "Bahasa Indonesia"


def _format_translated(frame):
    """Return a cleaned two-column (`text`, `hs_class`) copy of one translated dataset.

    Steps, in order:
      1. keep only `text_translated` and `hs_class`;
      2. rename `text_translated` to `text`;
      3. strip every double-quote character from `text`;
      4. drop rows whose `text` starts with ``UNWANTED_PREFIX``.

    Rows with a missing `text` are kept: ``na=False`` makes ``startswith`` report
    ``False`` for them. Without it the mask can contain missing values on
    object-dtype columns, and negating it with ``~`` raises ``TypeError``.

    Raises:
        KeyError: if `frame` lacks `text_translated` or `hs_class`.
    """
    formatted = (
        frame[['text_translated', 'hs_class']]
        .rename(columns={'text_translated': 'text'})
        .assign(text=lambda d: d['text'].str.replace('"', '', regex=False))
    )
    has_prefix = formatted['text'].str.startswith(UNWANTED_PREFIX, na=False)
    return formatted[~has_prefix]


# Load and clean all four datasets; the df1..df4 names are kept for any later cell
# that may refer to them.
df1, df2, df3, df4 = (
    _format_translated(pd.read_csv(f'{SOURCE_DIR}/{name}.csv', quoting=csv.QUOTE_ALL))
    for name in DATASET_NAMES
)

# Save each cleaned dataset next to the notebook.
# NOTE(review): these are written relative to the current working directory, while
# the later "limit to 5000 rows" cell reads the *_formatted.csv files from
# .../multilang-cross-val/id_from_en_downscalled/ -- confirm how they get there.
for name, frame in zip(DATASET_NAMES, (df1, df2, df3, df4)):
    frame.to_csv(f'{name}_formatted.csv', index=False)

print("Dataset has been formatted, filtered, and saved.")

Dataset has been formatted, filtered, and saved.


In [3]:
import csv
import os

import pandas as pd

# Directory with the formatted input CSVs, and the sub-directory for the capped copies.
BASE_DIR = '/nas.dbms/fathan/test/multilang-hate-models/binary-hatespeech/badword-on-hatespeech-en/multilang-cross-val/id_from_en_downscalled'
OUTPUT_DIR = f'{BASE_DIR}/limited5000'

# Upper bound on rows per output file, and the seed that makes the sample repeatable.
MAX_ROWS = 5000
RANDOM_STATE = 42

# (input file name, output file name) pairs; keeping them side by side prevents the
# two path lists below from drifting out of step.
FILE_NAME_PAIRS = [
    ('negative_hatespeech_with_sexual_words_id_translated_downscalled_formatted.csv',
     'negative_hatespeech_with_sexual_words_limited.csv'),
    ('negative_hatespeech_without_sexual_words_id_translated_downscalled_formatted.csv',
     'negative_hatespeech_without_sexual_words_limited.csv'),
    ('positive_hatespeech_with_sexual_words_id_translated_downscalled_formatted.csv',
     'positive_hatespeech_with_sexual_words_limited.csv'),
    ('positive_hatespeech_without_sexual_words_id_translated_downscalled_formatted.csv',
     'positive_hatespeech_without_sexual_words_limited.csv'),
    ('enhanced_positive_hatespeech.csv',
     'enhanced_positive_hatespeech_limited.csv'),
    ('softened_negative_hatespeech.csv',
     'softened_negative_hatespeech_limited.csv'),
]

# Same names and same path strings as before, now derived from the table above.
file_paths = [f'{BASE_DIR}/{source_name}' for source_name, _ in FILE_NAME_PAIRS]
output_paths = [f'{OUTPUT_DIR}/{target_name}' for _, target_name in FILE_NAME_PAIRS]

# to_csv does not create missing directories, so make sure the target exists;
# otherwise a fresh run fails with OSError on the first save.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Process each file individually
for file_path, output_path in zip(file_paths, output_paths):
    # Load the data
    df = pd.read_csv(file_path, quoting=csv.QUOTE_ALL)

    # Files above the cap are randomly down-sampled (not head-truncated) to exactly
    # MAX_ROWS rows; smaller files pass through unchanged.
    if len(df) > MAX_ROWS:
        df = df.sample(n=MAX_ROWS, random_state=RANDOM_STATE)

    # Save the capped copy with every field quoted.
    df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)
    print(f"Saved truncated CSV as '{output_path}' with a maximum of {MAX_ROWS} rows.")


Saved truncated CSV as '/nas.dbms/fathan/test/multilang-hate-models/binary-hatespeech/badword-on-hatespeech-en/multilang-cross-val/id_from_en_downscalled/limited5000/negative_hatespeech_with_sexual_words_limited.csv' with a maximum of 5000 rows.
Saved truncated CSV as '/nas.dbms/fathan/test/multilang-hate-models/binary-hatespeech/badword-on-hatespeech-en/multilang-cross-val/id_from_en_downscalled/limited5000/negative_hatespeech_without_sexual_words_limited.csv' with a maximum of 5000 rows.
Saved truncated CSV as '/nas.dbms/fathan/test/multilang-hate-models/binary-hatespeech/badword-on-hatespeech-en/multilang-cross-val/id_from_en_downscalled/limited5000/positive_hatespeech_with_sexual_words_limited.csv' with a maximum of 5000 rows.
Saved truncated CSV as '/nas.dbms/fathan/test/multilang-hate-models/binary-hatespeech/badword-on-hatespeech-en/multilang-cross-val/id_from_en_downscalled/limited5000/positive_hatespeech_without_sexual_words_limited.csv' with a maximum of 5000 rows.
Saved trun