In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample
from imblearn.under_sampling import RandomUnderSampler

In [6]:
edf = pd.read_csv('/nas.dbms/fathan/test/multilang-hate-models/binary-hatespeech/final_preprocessed_data_yidong_devansh.csv', header = 0, names=['text', 'hs_class'])
# Ensure 'text' column is string type
edf['text'] = edf['text'].astype(str)

# Convert the 'hs_class' column to numeric, forcing errors to NaN
edf['hs_class'] = pd.to_numeric(edf['hs_class'], errors='coerce')

# Display rows with non-numeric 'hs_class' values
print("Rows with non-numeric 'hs_class' values:")
print(len(edf[edf['hs_class'].isna()]))

# Drop rows with NaN 'hs_class' values
edf.dropna(subset=['hs_class'], inplace=True)

# Convert 'hs_class' column to integer type
edf['hs_class'] = edf['hs_class'].astype(int)

# Map the 'hs_class' values: 1 -> 'positive', 0 -> 'negative'
edf['hs_class'] = edf['hs_class'].map({1: 'positive', 0: 'negative'})

# Create the undersampling object
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# Separate the features (X) and the target (y)
X = edf['text'].values.reshape(-1, 1)  # Reshape required because RandomUnderSampler expects a 2D array
y = edf['hs_class']

# Perform the undersampling
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Create a new dataframe with the resampled data
edf = pd.DataFrame({'text': X_resampled.flatten(), 'hs_class': y_resampled})

edf.dropna(subset=['text'], inplace=True)
edf.info()
edf.sample(5)
edf['hs_class'].value_counts()

Rows with non-numeric 'hs_class' values:
7
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160498 entries, 0 to 160497
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      160498 non-null  object
 1   hs_class  160498 non-null  object
dtypes: object(2)
memory usage: 2.4+ MB


hs_class
negative    80249
positive    80249
Name: count, dtype: int64

In [7]:
idf = pd.read_csv('/nas.dbms/fathan/test/multilang-hate-models/data_preprocessed.csv', header = 0)
idf.dropna(subset=['text'], inplace=True)
idf.info()
idf.sample(5)

<class 'pandas.core.frame.DataFrame'>
Index: 11068 entries, 0 to 11102
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   hs_class  11068 non-null  object
 1   text      11068 non-null  object
dtypes: object(2)
memory usage: 259.4+ KB


Unnamed: 0,hs_class,text
10948,negative,antek antek nya wkwk malu ngehype wkwk
8566,negative,ya angkut kota goblok
4096,positive,mampus iya marah hyung 13 gue wkwk
5830,negative,laku homo jaman now ngentot live blued gay vid...
10117,negative,babi meeting jam sakit pinggang


In [8]:
# Ensure the order of the columns is consistent between the two dataframes
# Reorder columns if necessary
edf = edf[['text', 'hs_class']]
idf = idf[['text', 'hs_class']]

# Determine which dataframe is larger and which is smaller
edf_size = len(edf)
idf_size = len(idf)

# If df1 is larger, we will undersample it to match df2's size
if edf_size > idf_size:
    edf_downsampled = resample(edf, 
                               replace=False,  # Do not sample with replacement
                               n_samples=idf_size,  # Match df2's size
                               random_state=42)  # For reproducibility
    # Now we can concatenate the balanced dataframes
    df = pd.concat([edf_downsampled, idf], ignore_index=True)
else:
    idf_downsampled = resample(idf, 
                               replace=False,  
                               n_samples=edf_size,  
                               random_state=42)  
    # Now concatenate
    df = pd.concat([edf, idf_downsampled], ignore_index=True)

df.info()
df.sample(5)
df['hs_class'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22136 entries, 0 to 22135
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      22136 non-null  object
 1   hs_class  22136 non-null  object
dtypes: object(2)
memory usage: 346.0+ KB


hs_class
positive    11087
negative    11049
Name: count, dtype: int64