In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler


In [2]:
path_label_dir = Path.cwd().parent.parent / "data/processed/labels/labels_complete"
print(path_label_dir)

/home/tim/Documents/arxiv-code-search/data/processed/labels/labels_complete


In [53]:
df = pd.read_csv(path_label_dir / "labels.csv", dtype={"id": str})
df.head()

Unnamed: 0,id,pattern,token_count,update_date,label,para
0,1710.02907,"data, dataset",280,2022-04-21,0,"Experiment 2: In this set of experiments, we e..."
1,1811.11012,data,195,2022-04-21,0,This section of the technical report is focuse...
2,1811.11012,"data, dataset",70,2022-04-21,0,volunteers’ vehicles were mounted with BSM-bro...
3,1912.09582,dataset,13,2022-04-21,0,for small datasets–a case with Dutch book revi...
4,1912.09582,dataset,15,2022-04-21,1,Table 4: Sentiment Analysis accuracy scores on...


In [7]:
# make new column y, where value is 1 if label > 0, 0 otherwise
df["y"] = df["label"].apply(lambda x: 1 if x > 0 else 0)
df["y"].unique()

array([0, 1])

In [8]:
# save df to csv
df.to_csv("labels_complete_test.csv", index=False)

In [22]:
df_train, df_val = train_test_split(df, test_size=0.4, random_state=12, stratify=df['y']) # TO-DO: add stratification, and select by date
a, b =df_train["y"].value_counts()
print(a, b)

# get percentages of each class in each split
print(df_train["y"].value_counts() / len(df_train) * 100)

df_val["y"].value_counts() / len(df_val) * 100

a, b =df_train["y"].value_counts()
print(a, b)

700 50
0    93.333333
1     6.666667
Name: y, dtype: float64
700 50


In [18]:
ros = RandomOverSampler(sampling_strategy=0.5, random_state=0)
df_train, _ = ros.fit_resample(df_train, df_train['y'])
print(df_train["y"].value_counts() / len(df_train) * 100)
print(df_train["y"].value_counts())

0    66.666667
1    33.333333
Name: y, dtype: float64
0    700
1    350
Name: y, dtype: int64


In [23]:
def under_over_sampler(x, y, method=None, ratio=0.5):
    """
    Returns an undersampled or oversampled data set. Implemented using imbalanced-learn package.
    ['random_over','random_under','random_under_bootstrap','smote', 'adasyn']
    """

    if method == None:
        return x, y

    # oversample methods: https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html
    elif method == "random_over":
        # print('before:',sorted(Counter(y).items()))
        ros = RandomOverSampler(sampling_strategy=ratio, random_state=0)
        x_resampled, y_resampled = ros.fit_resample(x, y)
        # print('after:',sorted(Counter(y_resampled).items()))
        return x_resampled, y_resampled

    elif method == "random_under":
        rus = RandomUnderSampler(sampling_strategy=ratio, random_state=0)
        x_resampled, y_resampled = rus.fit_resample(x, y)
        return x_resampled, y_resampled

    elif method == "random_under_bootstrap":
        rus = RandomUnderSampler(
            sampling_strategy=ratio, random_state=0, replacement=True
        )
        x_resampled, y_resampled = rus.fit_resample(x, y)
        return x_resampled, y_resampled

    elif method == "smote":
        x_resampled, y_resampled = SMOTE(
            sampling_strategy=ratio, random_state=0
        ).fit_resample(x, y)
        return x_resampled, y_resampled

    elif method == "adasyn":
        x_resampled, y_resampled = ADASYN(
            sampling_strategy=ratio, random_state=0
        ).fit_resample(x, y)
        return x_resampled, y_resampled

    else:
        return x, y

In [37]:
df_train, df_val = train_test_split(df, test_size=0.4, random_state=12, stratify=df['y']) # TO-DO: add stratification, and select by date
counts = df_train["y"].value_counts()
print(f"df_train counts: {counts[0]}, {counts[1]}")
counts = df_val["y"].value_counts()
print(f"df_val counts: {counts[0]}, {counts[1]}")

df_train, _ = under_over_sampler(df_train, df_train['y'], method="random_over", ratio=0.3)
print(df_train["y"].value_counts())

df_train, _ = under_over_sampler(df_train, df_train['y'], method="random_under", ratio=0.6)
print(df_train["y"].value_counts())

df_train counts: 700, 50
df_val counts: 467, 33
0    700
1    210
Name: y, dtype: int64
0    350
1    210
Name: y, dtype: int64


## Split by ID

In [3]:
df = pd.read_csv(path_label_dir / "labels.csv", dtype={"id": str})
df["y"] = df["label"].apply(lambda x: 1 if x > 0 else 0)
ids = df["id"].unique()

In [9]:
train_ids, val_ids = train_test_split(ids, test_size=0.4, random_state=13,)

df_train = df[df['id'].isin(train_ids)]
counts = df_train["y"].value_counts()
print(f"df_train counts: {counts[0]}, {counts[1]}")

df_val = df[df['id'].isin(val_ids)]
counts = df_val["y"].value_counts()
print(f"df_val counts: {counts[0]}, {counts[1]}")

df_train counts: 920, 60
df_val counts: 247, 23
