### Loading the Data

In [19]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Read the source CSV into a DataFrame.
df = pd.read_csv("/content/train.csv")

# Two-stage hold-out split, both stages seeded with 42 so the partitions are
# reproducible: first set 20% of all rows aside as `test`, then set 20% of
# the remainder aside as `validation`; what is left is `train`.
train_and_validation, test = train_test_split(df, random_state=42, test_size=0.2)
train, validation = train_test_split(
    train_and_validation, random_state=42, test_size=0.2
)

In [20]:
# Row counts of the three splits, in the order (train, validation, test).
tuple(len(split) for split in (train, validation, test))

(4872, 1218, 1523)

In [21]:
# Show the column labels of the held-out test split.
test.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [22]:
# Sum of `target` over the row count of the test split; this is the share of
# disaster rows provided `target` is coded 0/1 (assumption — confirm).
test_row_count = len(test)
disaster_proportion = test["target"].sum() / test_row_count
print(f"proportion => disaster is {disaster_proportion}")

proportion => disaster is 0.4261326329612607


In [23]:
from imblearn.under_sampling import RandomUnderSampler

# Random undersampler with a fixed seed so the same rows are kept on each run.
undersample = RandomUnderSampler(random_state=42)

# Split the test frame into features (every column except `target`) and labels.
X = test.drop(columns='target')
y = test['target']

# Resample features and labels together.
# NOTE(review): despite the `_over` suffix these hold the *under*-sampled
# data; the names are kept because the next cell reads them.
X_over, y_over = undersample.fit_resample(X, y)

In [24]:
# Re-attach the resampled labels to the resampled features, side by side
# (column-wise concat aligns the two pieces on their index).
test_undersampled = pd.concat([X_over, y_over], axis="columns")

# Sanity check: (rows, columns) of the rebuilt frame.
print(test_undersampled.shape)

(1298, 5)


In [25]:
# Number of rows remaining after undersampling.
test_undersampled.shape[0]

1298

In [27]:
# Same ratio as computed for the raw test split, now on the undersampled
# frame: sum of `target` divided by the number of rows.
undersampled_row_count = len(test_undersampled)
disaster_proportion = test_undersampled["target"].sum() / undersampled_row_count
print(f"proportion => disaster is {disaster_proportion}")

proportion => disaster is 0.5


In [30]:
# Persist every split to its own CSV in the current working directory.
# to_csv is called with its defaults, so each file also carries the
# DataFrame index as an unnamed first column (read back with index_col=0
# to recover it).
splits_by_filename = {
    'train_df.csv': train,
    "validation_df.csv": validation,
    "test_df.csv": test,
    "test_undersampled.csv": test_undersampled,
}
for out_name, split_df in splits_by_filename.items():
    split_df.to_csv(out_name)