# Libraries

In [1]:
import os

import pandas as pd
import numpy as np

from distutils.dir_util import copy_tree
from sklearn.datasets import load_svmlight_file
from itertools import product

# Formatting Datasets (DONE)

In [2]:
# One-off migration step, kept for reference only (the section heading marks it
# as DONE, so it is presumably disabled because it has already been run).
# For each dataset it copies the `raw` and `preprocessed` directories found
# under the archived project's `input/<dataset>/` tree into
# `dir_datasets_raw/<dataset>` and `dir_datasets_preprocessed/<dataset>`.
# NOTE(review): `copy_tree` comes from `distutils`, which was removed in
# Python 3.12; if this cell is ever re-enabled on a newer interpreter,
# `shutil.copytree(src, dst, dirs_exist_ok=True)` is the stdlib alternative.

# dir_datasets_raw = '../data/datasets/raw'
# dir_datasets_preprocessed = '../data/datasets/preprocessed'
# datasets = ['20ng', 'acm', 'agnews', 'imdb_reviews', 'reut', 'sogou', 'webkb', 'yahoo', 'yelp_2015']

# dir_input_old = '/home/christian/arquivado/projeto_stacking/input'

# for dset in datasets:
#     print(f'Copying dataset {dset}')
#     dir_raw_old = f'{dir_input_old}/{dset}/raw'
#     dir_raw_new = f'{dir_datasets_raw}/{dset}'
#     copy_tree(dir_raw_old, dir_raw_new)
    
#     dir_pre_old = f'{dir_input_old}/{dset}/preprocessed'
#     dir_pre_new = f'{dir_datasets_preprocessed}/{dset}'
#     copy_tree(dir_pre_old, dir_pre_new)

# Formatting Classification Input

In [3]:
# Convert every (dataset, representation, fold) from the archived svmlight
# layout (`train.gz` / `test.gz`) into compressed NumPy archives
# (`train.npz` / `test.npz`) under the new classification-input layout.
# The cell is resumable: folds whose two output archives already exist are
# skipped, and archives are written atomically so an interrupted run can never
# leave a truncated file that a later run would mistake for a finished one.

# --- Configuration ----------------------------------------------------------
# Root of the archived (old) project inputs and of the new output layout.
# Kept in one place so the notebook can be pointed at another machine.
DIR_OLD_INPUT_ROOT = '/home/christian/arquivado/projeto_stacking/input'
DIR_CLS_INPUT_ROOT = '/home/christian/stacking_text_classification/data/classification_input'

datasets = ['20ng', 'acm', 'agnews', 'imdb_reviews', 'reut', 'sogou', 'webkb', 'yahoo', 'yelp_2015']

# Maps a representation's sub-path in the old layout to the short id that is
# used as its directory name in the new layout.
repr_ids = {
    "fast_text_1/meta_features_1/knn_cos": "fmk",
    "fast_text_1/raw_folds": "fr",
    "pte_1/meta_features_1/knn_cos": "pmk",
    "pte_1/raw_folds": "pr",
    "tf_idf_1/meta_features_1/knn_cos": "tmk",
    "tf_idf_1/fs": "tr"
}
representations = repr_ids.keys()

# Datasets split into 10 folds; every other dataset uses 5 folds.
TEN_FOLD_DATASETS = ('20ng', 'acm', 'reut', 'webkb')

# Representation ids that are not converted for the datasets listed below.
# NOTE(review): presumably these inputs were never generated for those
# datasets in the old project -- confirm.
SKIPPED_REPR_IDS = ('fmk', 'pmk')
SKIPPED_REPR_DATASETS = ('agnews', 'imdb_reviews', 'sogou', 'yahoo', 'yelp_2015')


def _save_npz_atomic(path_stem, **arrays):
    """Write `arrays` to `<path_stem>.npz` without ever exposing a partial file.

    The archive is first written to `<path_stem>.tmp.npz` and then moved onto
    its final name with `os.replace`, so `<path_stem>.npz` only exists once
    the archive is complete.

    Parameters
    ----------
    path_stem : str
        Destination path without the `.npz` extension.
    **arrays
        Named arrays forwarded unchanged to `np.savez_compressed`.
    """
    tmp_path = f'{path_stem}.tmp.npz'
    np.savez_compressed(tmp_path, **arrays)
    os.replace(tmp_path, f'{path_stem}.npz')


# --- Conversion -------------------------------------------------------------
for dset, text_repr in product(datasets, representations):
    text_repr_id = repr_ids[text_repr]
    n_folds = 10 if dset in TEN_FOLD_DATASETS else 5

    if (text_repr_id in SKIPPED_REPR_IDS) and (dset in SKIPPED_REPR_DATASETS):
        continue

    print(dset, text_repr, end=' ')
    for fold_id in range(n_folds):
        print(fold_id, end=' ')

        # New X,y train/test
        dir_cls_input = f'{DIR_CLS_INPUT_ROOT}/{dset}/{n_folds}_folds/{text_repr_id}/{fold_id}'
        train_new = f'{dir_cls_input}/train'
        test_new = f'{dir_cls_input}/test'

        # Resume support: nothing to do when both archives are already there.
        if os.path.exists(f'{train_new}.npz') and os.path.exists(f'{test_new}.npz'):
            continue

        # Read old X/y train/test
        dir_input_old = (
            f'{DIR_OLD_INPUT_ROOT}/{dset}/representations'
            f'/{n_folds}_folds/{text_repr}/fold_{fold_id}')
        test_old = f'{dir_input_old}/test.gz'
        train_old = f'{dir_input_old}/train.gz'

        X_train, y_train = load_svmlight_file(f=train_old, dtype=np.float64, zero_based=False)
        X_test, y_test = load_svmlight_file(f=test_old, dtype=np.float64, zero_based=False)

        # Each file's width is inferred independently, so the two matrices can
        # disagree on the number of columns. Re-read the narrower one with the
        # wider one's width so both end up in the same feature space.
        if X_train.shape[1] > X_test.shape[1]:
            X_test, y_test = load_svmlight_file(f=test_old, dtype=np.float64, zero_based=False,
                                                n_features=X_train.shape[1])
        elif X_train.shape[1] < X_test.shape[1]:
            X_train, y_train = load_svmlight_file(f=train_old, dtype=np.float64, zero_based=False,
                                                  n_features=X_test.shape[1])

        assert X_train.shape[1] == X_test.shape[1], (
            f'feature width mismatch for {dset}/{text_repr}/fold_{fold_id}: '
            f'{X_train.shape[1]} != {X_test.shape[1]}')

        # Save new X/y train/test
        # The output directory is created only once the inputs were read
        # successfully, so a missing source fold does not leave an empty
        # directory behind.
        # NOTE(review): the X_* matrices are expected to be scipy sparse
        # matrices, which NumPy stores as pickled object arrays; readers would
        # then need np.load(..., allow_pickle=True) -- confirm downstream.
        os.makedirs(dir_cls_input, exist_ok=True)
        _save_npz_atomic(train_new, X_train=X_train, y_train=y_train)
        _save_npz_atomic(test_new, X_test=X_test, y_test=y_test)

    print()

20ng fast_text_1/meta_features_1/knn_cos 0 1 2 3 4 5 6 7 8 9 
20ng fast_text_1/raw_folds 0 1 2 3 4 5 6 7 8 9 
20ng pte_1/meta_features_1/knn_cos 0 1 2 3 4 5 6 7 8 9 
20ng pte_1/raw_folds 0 1 2 3 4 5 6 7 8 9 
20ng tf_idf_1/meta_features_1/knn_cos 0 1 2 3 4 5 6 7 8 9 
20ng tf_idf_1/fs 0 1 2 3 4 5 6 7 8 9 
acm fast_text_1/meta_features_1/knn_cos 0 1 2 3 4 5 6 7 8 9 
acm fast_text_1/raw_folds 0 1 2 3 4 5 6 7 8 9 
acm pte_1/meta_features_1/knn_cos 0 1 2 3 4 5 6 7 8 9 
acm pte_1/raw_folds 0 1 2 3 4 5 6 7 8 9 
acm tf_idf_1/meta_features_1/knn_cos 0 1 2 3 4 5 6 7 8 9 
acm tf_idf_1/fs 0 1 2 3 4 5 6 7 8 9 
agnews fast_text_1/raw_folds 0 1 2 3 4 
agnews pte_1/raw_folds 0 1 2 3 4 
agnews tf_idf_1/meta_features_1/knn_cos 0 1 2 3 4 
agnews tf_idf_1/fs 0 1 2 3 4 
imdb_reviews fast_text_1/raw_folds 0 1 2 3 4 
imdb_reviews pte_1/raw_folds 0 1 2 3 4 
imdb_reviews tf_idf_1/meta_features_1/knn_cos 0 1 2 3 4 
imdb_reviews tf_idf_1/fs 0 1 2 3 4 
reut fast_text_1/meta_features_1/knn_cos 0 1 2 3 4 5 6 7 8 9 


Formatting splits (save the split files and the representation config files):

# Formatting Output