Split train-test

In [1]:
import os
import pandas as pd
from utils.preprocessing_text import Preprocess
from sklearn.model_selection import train_test_split


def create_folder(path):
    try:
        os.makedirs(path)
        print(f"Folder --> '{path}' created")
    except FileExistsError:
        print(f"Already exist --> '{path}'")
        
def string_to_txt(path, file_name, string):
    if not file_name.endswith('.txt'):
        file_name += '.txt'
        
    file_path = f"{path}/{file_name}"

    with open(file_path, 'w') as file:
        file.write(string)

def split_by_label(df, reduce_algebra=None, reduce_combin=None, reduce_geomet=None, reduce_nt=None, is_truncate=False):
    algebra = df[df['label'] == 'Algebra'][['id_key', 'post_canonical', 'label']]
    combin = df[df['label'] == 'Combinatorics'][['id_key', 'post_canonical', 'label']]
    geomet = df[df['label'] == 'Geometry'][['id_key', 'post_canonical', 'label']]
    nt = df[df['label'] == 'Number Theory'][['id_key', 'post_canonical', 'label']]
    
    # Kalau mau disamain semua
    if is_truncate:
        m = min([algebra.shape[0], combin.shape[0], geomet.shape[0], nt.shape[0]])
        algebra = algebra.sample(m, random_state=42)
        combin = combin.sample(m, random_state=42)
        geomet = geomet.sample(m, random_state=42)
        nt = nt.sample(m, random_state=42)

        return algebra, combin, geomet, nt
    
    if reduce_algebra is not None: algebra=algebra.sample(algebra.shape[0]-reduce_algebra, random_state=42)
    if reduce_combin is not None: combin=combin.sample(combin.shape[0]-reduce_combin, random_state=42)
    if reduce_geomet is not None: geomet=geomet.sample(geomet.shape[0]-reduce_geomet, random_state=42)
    if reduce_nt is not None: nt=nt.sample(nt.shape[0]-reduce_nt, random_state=42)
    
    return algebra, combin, geomet, nt

df = pd.read_csv("../data/classification/imo.csv")
df = df[df['label'].notna()]

df['post_canonical'] = df['post_rendered'].apply(Preprocess().fit)
print(f"\nShape: {df.shape}")





Shape: (3619, 10)


In [2]:
test_size=0.2
algebra, combin, geomet, nt = split_by_label(df, reduce_geomet=300)

temp1=[algebra.shape[0], combin.shape[0], geomet.shape[0], nt.shape[0]]
temp2=[round(temp1[0]*(1-test_size)), round(temp1[1]*(1-test_size)), round(temp1[2]*(1-test_size)), round(temp1[3]*(1-test_size))]
temp3=[round(temp1[0]*test_size), round(temp1[1]*test_size), round(temp1[2]*test_size), round(temp1[3]*test_size)]
print(f"               {'n':>7s} {'train':>7s} {'test':>7s}")
print(f"Algebra       :{str(temp1[0]):>7s} {str(temp2[0]):>7s} {str(temp3[0]):>7s}")
print(f"Combinatorics :{str(temp1[1]):>7s} {str(temp2[1]):>7s} {str(temp3[1]):>7s}")
print(f"Geometry      :{str(temp1[2]):>7s} {str(temp2[2]):>7s} {str(temp3[2]):>7s}")
print(f"NT            :{str(temp1[3]):>7s} {str(temp2[3]):>7s} {str(temp3[3]):>7s}")
print("_________________________________________+")
print(f"Total         :{str(sum(temp1)):>7s} {str(sum(temp2)):>7s} {str(sum(temp3)):>7s}")


                     n   train    test
Algebra       :    815     652     163
Combinatorics :    737     590     147
Geometry      :    972     778     194
NT            :    795     636     159
_________________________________________+
Total         :   3319    2656     663


In [3]:
algebra_combin_geomet_nt = [algebra, combin, geomet, nt]

train_path = "../data/classification/train"
test_path = "../data/classification/test"
create_folder(train_path)
create_folder(test_path)

for each in algebra_combin_geomet_nt:
    train, test = train_test_split(each, test_size=test_size, random_state=42)
    
    current_label = str(list(each['label'])[0])
    current_path_train, current_path_test = f"{train_path}/{current_label}", f"{test_path}/{current_label}"
    create_folder(current_path_train)
    create_folder(current_path_test)
    
    train = train.to_dict('records')
    for i in range(len(train)):
        id_key = str(train[i]['id_key'])
        string = train[i]['post_canonical']
        string_to_txt(
            path=current_path_train, 
            file_name=id_key, 
            string=string
        )

    test = test.to_dict('records')
    for j in range(len(test)):
        id_key = str(test[j]['id_key'])
        string = test[j]['post_canonical']
        string_to_txt(
            path=current_path_test, 
            file_name=id_key, 
            string=string
        )


Folder --> '../data/classification/train' created
Folder --> '../data/classification/test' created
Folder --> '../data/classification/train/Algebra' created
Folder --> '../data/classification/test/Algebra' created
Folder --> '../data/classification/train/Combinatorics' created
Folder --> '../data/classification/test/Combinatorics' created
Folder --> '../data/classification/train/Geometry' created
Folder --> '../data/classification/test/Geometry' created
Folder --> '../data/classification/train/Number Theory' created
Folder --> '../data/classification/test/Number Theory' created
