Train/test/validation split

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

from utils.preprocessing_text import Preprocess


def split_and_save_data_for_regression_train_test_val_(df, path_to_save, test_size=0.2, val_size=0.1, random_state=42):
    """Split ``df`` into train/test/validation sets and write each one to CSV.

    The split is done in two passes with ``train_test_split``: ``test_size``
    is applied to ``df`` to hold out the test set, then ``val_size`` is
    applied to the *remaining* rows to hold out the validation set. With the
    defaults (0.2 / 0.1) the validation set is therefore taken from the 80%
    left after the test split, not from the full frame.

    Args:
        df: DataFrame to split.
        path_to_save: Directory that receives ``train.csv``, ``test.csv`` and
            ``val.csv``. A trailing path separator is optional, and the
            directory is created when it does not exist yet.
        test_size: ``test_size`` argument of the first ``train_test_split``.
        val_size: ``test_size`` argument of the second ``train_test_split``,
            which runs on the rows left after the test split.
        random_state: Seed passed to both splits so the result is reproducible.

    Returns:
        None. The three CSV files are written (without the index column) and
        a one-row summary of the split sizes is printed.
    """
    train_val, test = train_test_split(df, test_size=test_size, random_state=random_state)
    train, val = train_test_split(train_val, test_size=val_size, random_state=random_state)

    # An empty path means "current directory"; os.makedirs("") would raise.
    if path_to_save:
        os.makedirs(path_to_save, exist_ok=True)

    # os.path.join inserts the separator only when it is missing, so both
    # "dir/" and "dir" end up writing "dir/train.csv" instead of "dirtrain.csv".
    for split_name, split_df in (("train", train), ("test", test), ("val", val)):
        split_df.to_csv(os.path.join(path_to_save, f"{split_name}.csv"), index=False)

    print(f"\nSuccessfully save in: '{path_to_save}'")
    print(f"{'n':>8s} {'train':>8s} {'test':>8s} {'val':>8s}")
    print(f"{len(df):>8d} {len(train):>8d} {len(test):>8d} {len(val):>8d}")

def preapare_and_load_dataset(csv_file) -> pd.DataFrame:
    """Read a CSV and return its regression columns plus a preprocessed text column.

    Each value of the ``post_rendered`` column is passed through the ``fit``
    method of a single ``Preprocess`` instance and the result is stored in a
    ``post_canonical`` column.

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of the CSV to
            load. It must provide ``post_rendered``, ``score`` and ``level``.

    Returns:
        A DataFrame with exactly the columns ``post_canonical``, ``score``
        and ``level``, in that order.
    """
    kept_columns = ['post_canonical', 'score', 'level']

    frame = pd.read_csv(csv_file)
    # One Preprocess instance is shared by every row.
    # NOTE(review): assumes Preprocess.fit is safe to call repeatedly on the
    # same instance — confirm in utils.preprocessing_text.
    text_cleaner = Preprocess()
    frame['post_canonical'] = frame['post_rendered'].apply(text_cleaner.fit)

    return frame[kept_columns]

# Load the per-competition frames in a fixed order (apmo, benelux, egmo, imo);
# map() is consumed by the tuple unpacking, so the files are read one by one
# in exactly this order.
apmo_combined, benelux_combined, egmo_combined, imo_combined = map(
    preapare_and_load_dataset,
    (
        "../data/regression/apmo/apmo_combined.csv",
        "../data/regression/benelux/benelux_combined.csv",
        "../data/regression/egmo/egmo_combined.csv",
        "../data/regression/imo/imo_combined.csv",
    ),
)

# The combined file is loaded from its own CSV rather than built from the frames above.
combined_all = preapare_and_load_dataset("../data/regression/combined_all.csv")



In [2]:
# Every dataset is split with the same settings and saved into its own folder;
# the order matches the order of the summaries printed below.
for dataset, target_dir in (
    (apmo_combined, "../data/regression/apmo/"),
    (benelux_combined, "../data/regression/benelux/"),
    (egmo_combined, "../data/regression/egmo/"),
    (imo_combined, "../data/regression/imo/"),
    (combined_all, "../data/regression/"),
):
    split_and_save_data_for_regression_train_test_val_(
        dataset,
        target_dir,
        test_size=0.2,
        val_size=0.1,
        random_state=42,
    )


Successfully save in: '../data/regression/apmo/'
       n    train     test      val
      70       50       14        6

Successfully save in: '../data/regression/benelux/'
       n    train     test      val
      60       43       12        5

Successfully save in: '../data/regression/egmo/'
       n    train     test      val
      74       53       15        6

Successfully save in: '../data/regression/imo/'
       n    train     test      val
     384      276       77       31

Successfully save in: '../data/regression/'
       n    train     test      val
     588      423      118       47
