<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/data_partition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


In [0]:
import numpy as np
import pandas as pd 
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from pathlib import Path
import os

In [3]:
# Mount Google Drive so the dataset files are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# All paths hang off a single dataset folder. They are relative paths, so they
# resolve against the current working directory.
# NOTE(review): presumably the working directory is /content (where the drive
# is mounted) — confirm, or switch to absolute paths.
base_dir = "gdrive/My Drive/fastai-v3/sdgs/dataset/"
labelled_dataset = f"{base_dir}cleanup_labelled.csv"
output_dir = f"{base_dir}cross_validation/"

Mounted at /content/gdrive


In [129]:
# Load the labelled dataset; the `labels` column holds pipe-separated integer
# label ids, which are parsed into a list of ints per row.
data_df = pd.read_csv(labelled_dataset)
data_df.labels = data_df.labels.str.split('|').apply(
    lambda parts: [int(label) for label in parts]
)

# shuffle=True is required for random_state to take effect: without it the
# seed is unused, and recent scikit-learn versions reject the combination.
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)
mlb = MultiLabelBinarizer()

x = data_df[['text']].values  # text, as an (n_rows, 1) array
y = mlb.fit_transform(data_df.labels)  # binary indicator matrix of the labels

# Persist the row indices of every fold under <output_dir>/fold_<n>/.
for count, (train_index, test_index) in enumerate(mskf.split(x, y), start=1):
    print(f"Fold no. {count}")
    fold_dir = Path(f"{output_dir}fold_{count}/")
    # parents=True so the first run also creates the output directory itself.
    fold_dir.mkdir(parents=True, exist_ok=True)
    # np.save appends the ".npy" extension to these names.
    np.save(fold_dir/"train", train_index)
    np.save(fold_dir/"test", test_index)

Fold no. 1
Fold no. 2
Fold no. 3
Fold no. 4
Fold no. 5


In [0]:
def get_indices(original_arr, new_arr):
    """Map each item of ``new_arr`` back to its position in ``original_arr``.

    Args:
        original_arr: Array-like to search in (for example the ``(n, 1)`` text
            array built from the dataframe).
        new_arr: Iterable of items (rows) taken from ``original_arr``.

    Returns:
        A 1-D integer ``np.ndarray`` with, for every item of ``new_arr``, the
        index along the first axis of its first match in ``original_arr``.
        If ``original_arr`` contains equal entries, all of them resolve to the
        index of the first occurrence.

    Raises:
        IndexError: If an item of ``new_arr`` does not occur in
            ``original_arr``.
    """
    # Search the array that was passed in, not a notebook-level global.
    original_arr = np.asarray(original_arr)
    results = []
    for item in new_arr:
        # Elementwise comparison; [0] keeps the indices along the first axis.
        matches = np.where(original_arr == item)[0]
        if matches.size == 0:
            raise IndexError(f"{item!r} not found in original_arr")
        results.append(matches[0])
    # Explicit integer dtype so an empty result is still usable as an index.
    return np.array(results, dtype=np.intp)

In [184]:
# Load the labelled dataset; the `labels` column holds pipe-separated integer
# label ids, which are parsed into a list of ints per row.
data_df = pd.read_csv(labelled_dataset)
data_df.labels = data_df.labels.str.split('|').apply(
    lambda parts: [int(label) for label in parts]
)

# Five independent, label-stratified train/test splits with 10% held out.
mskf = MultilabelStratifiedShuffleSplit(n_splits=5, random_state=0, test_size=0.1)
mlb = MultiLabelBinarizer()

x = data_df[['text']].values  # text, as an (n_rows, 1) array
y = mlb.fit_transform(data_df.labels)  # binary indicator matrix of the labels

# NOTE(review): this writes train/test files into the same fold_<n> folders as
# the K-fold cell earlier in the notebook, replacing them — confirm intended.
for count, (original_train_index, test_index) in enumerate(mskf.split(x, y), start=1):
    print(f"Fold no. {count}")
    fold_dir = Path(f"{output_dir}fold_{count}/")
    # parents=True so the first run also creates the output directory itself.
    fold_dir.mkdir(parents=True, exist_ok=True)

    # Split the row indices directly instead of splitting the texts and
    # looking the indices up again: this cannot confuse duplicate texts and
    # avoids a full scan per row. 0.11 of the remaining ~90% is roughly 10% of
    # the whole dataset. The fixed seed makes the split reproducible.
    train_index, val_index = train_test_split(
        original_train_index, test_size=0.11, random_state=0
    )

    # np.save appends the ".npy" extension to these names.
    np.save(fold_dir/"train", train_index)
    np.save(fold_dir/"val", val_index)
    np.save(fold_dir/"test", test_index)

    # Sanity report: the three partitions must be pairwise disjoint and
    # together cover every row.
    train_val = len(set(train_index) & set(val_index))
    train_test = len(set(train_index) & set(test_index))
    val_test = len(set(val_index) & set(test_index))
    print(f"Overlapping train & val: {train_val != 0}")
    print(f"Overlapping train & test: {train_test != 0}")
    print(f"Overlapping val & test: {val_test != 0}")
    print()
    print(f"Train size: {len(train_index)}")
    print(f"Val size: {len(val_index)}")
    print(f"Test size: {len(test_index)}")
    print(f"Total: {len(train_index)+len(val_index)+len(test_index)}")
    print("______________")

Fold no. 1
Overlapping train & val: False
Overlapping train & test: False
Overlapping val & test: False

Train size: 4173
Val size: 516
Test size: 493
Total: 5182
______________
Fold no. 2
Overlapping train & val: False
Overlapping train & test: False
Overlapping val & test: False

Train size: 4151
Val size: 514
Test size: 517
Total: 5182
______________
Fold no. 3
Overlapping train & val: False
Overlapping train & test: False
Overlapping val & test: False

Train size: 4142
Val size: 513
Test size: 527
Total: 5182
______________
Fold no. 4
Overlapping train & val: False
Overlapping train & test: False
Overlapping val & test: False

Train size: 4140
Val size: 512
Test size: 530
Total: 5182
______________
Fold no. 5
Overlapping train & val: False
Overlapping train & test: False
Overlapping val & test: False

Train size: 4155
Val size: 514
Test size: 513
Total: 5182
______________


In [0]:
# Read back the saved training indices; `fold_dir` is whatever folder the most
# recently executed partition loop left behind (its last fold).
train_npy_path = fold_dir / "train.npy"
my_data = np.load(train_npy_path)