In [1]:
import os
import glob
from pathlib import Path
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.ion() # interactive mode
from torch.utils.data import Dataset, DataLoader

In [26]:
import numpy as np
def gen_random_data(feature_name, num_rows, num_cols, categiries = None, out_dir = 'data/numeric'):
    """Write a random numeric matrix to ``<out_dir>/<feature_name>.csv``.

    Values are drawn uniformly from [0, 10) using numpy's global random state
    and rounded to 3 decimals before being saved as comma-separated text.

    Args:
        feature_name (str): file name (without extension) of the csv to write.
        num_rows (int): number of rows in the generated matrix.
        num_cols (int): number of columns in the generated matrix.
        categiries: unused; kept (including its spelling) so existing callers
            that pass it keep working.
        out_dir (str): directory the csv is written to. It is created if it
            does not exist yet.
    """
    # np.savetxt does not create missing parent directories, so on a fresh
    # checkout the original call failed with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)

    mat = np.around(np.random.rand(num_rows, num_cols) * 10, decimals=3)
    target = os.path.join(out_dir, '{}.csv'.format(feature_name))
    np.savetxt(target, mat, delimiter=',')
# Generate a 10x10 random matrix and save it as data/numeric/n4.csv.
gen_random_data('n4', 10, 10)

In [3]:
def num_classes(values):
    """Return the largest element of ``values``.

    Args:
        values: non-empty iterable of comparable items (class labels).

    Returns:
        The maximum label found in ``values``.

    NOTE(review): this equals the number of classes only if labels are
    1-based and contiguous; for 0-based labels the count would be max + 1.
    Confirm against callers.
    """
    return max(values)

In [8]:
from sklearn.preprocessing import OneHotEncoder

def to_one_hot(data):
    """One-hot encode a 1-D sequence of numeric category codes.

    Missing values (NaN) are treated as a category of their own: they are
    replaced by ``max(non-NaN values) + 1`` before encoding, so they always
    land in the last column. If every value is NaN they all map to a single
    column.

    Args:
        data: 1-D array-like of numeric codes; may contain NaN. The input is
            copied and never modified.

    Returns:
        numpy.ndarray of floats with shape ``(len(data), n_distinct)``.
        Column ``j`` corresponds to the j-th smallest distinct value.
    """
    codes = np.array(data).ravel()

    # replace NaN values with 'max(valid data) + 1'
    missing = np.isnan(codes)
    if missing.any():
        # The builtin max() is order-dependent in the presence of NaN (every
        # comparison with NaN is False), so it could return NaN itself and the
        # replacement silently did nothing. nanmax ignores the NaN entries.
        fill = 0.0 if missing.all() else np.nanmax(codes) + 1
        codes[missing] = fill

    # np.unique returns the sorted distinct values plus, for every element,
    # the index of its value in that sorted list -- i.e. its one-hot column.
    categories, col_idx = np.unique(codes, return_inverse=True)
    one_hot = np.zeros((codes.size, categories.size))
    one_hot[np.arange(codes.size), col_idx] = 1.0
    return one_hot

# Sanity check: three distinct codes (0, 1, 2) should give three one-hot columns.
to_one_hot([0, 1, 1, 2, 1])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [27]:
class InputDataset(Dataset):
    """Tabular dataset assembled from per-feature csv files.

    ``data_dir`` must contain two sub-directories:

    * ``numeric``     -- every ``*.csv`` becomes one column named after the file.
    * ``categorical`` -- every ``*.csv`` is one-hot encoded into columns named
      ``<file>_<i>``.

    Each csv is read without a header and flattened row by row into a single
    vector, so one csv cell corresponds to one row of ``self.df``. Files are
    processed in sorted path order, numeric features before categorical ones.
    """

    def __init__(self, data_dir = 'data'):
        """
        Args:
            data_dir (str or os.PathLike): data dir containing numeric and
                categorical csvs.

        Raises:
            FileNotFoundError: if ``data_dir`` or one of its expected
                sub-directories does not exist.
        """
        # check data_dir exists
        if not Path(data_dir).exists():
            raise FileNotFoundError('directory not found: "{}"'.format(data_dir))

        # this is the main dataframe which will hold all the features
        self.df = pd.DataFrame()

        self._load_features(data_dir, 'numeric', self.create_numeric_feature)
        self._load_features(data_dir, 'categorical', self.create_categorical_feature)

    def __len__(self):
        """Number of rows (samples) in the feature dataframe."""
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return row ``idx`` of the feature dataframe (positional lookup).

        NOTE(review): this yields a pandas object, which torch's default
        DataLoader collate function may not accept -- confirm how batches
        are built before feeding this dataset to a DataLoader.
        """
        return self.df.iloc[idx]

    def _load_features(self, data_dir, kind, add_feature):
        """Read every csv under ``data_dir/kind`` and hand it to ``add_feature``."""
        # Path() joins work for both str and PathLike input; plain '+'
        # concatenation raised TypeError for pathlib.Path arguments.
        subdir = Path(data_dir) / kind
        if not subdir.exists():
            raise FileNotFoundError("expected '{}' direcroty not found under: '{}'".format(kind, data_dir))
        # escape the directory part so characters such as '[' in the path are
        # not interpreted as glob wildcards
        pattern = os.path.join(glob.escape(str(subdir)), '*.csv')
        for f in sorted(glob.glob(pattern)):
            add_feature(f, pd.read_csv(f, header=None))

    @staticmethod
    def _feature_name(f):
        """Feature name == file name without directory and extension."""
        return os.path.splitext(os.path.basename(f))[0]

    @staticmethod
    def _flatten(df):
        """Concatenate all rows of ``df`` into one 1-D array (row-major order)."""
        # single vectorised call instead of a per-row Python loop
        return df.values.ravel()

    def create_numeric_feature(self, f, df):
        '''
        Concatenate the rows of ``df`` into a single column vector and append
        it to ``self.df`` as a column named after file ``f``.
        '''
        column = pd.DataFrame({self._feature_name(f): self._flatten(df)})
        self.df = pd.concat([self.df, column], axis=1)

    def create_categorical_feature(self, f, df):
        '''
        Concatenate the rows of ``df`` into a single vector, one-hot encode it
        and append the resulting columns (``<name>_0``, ``<name>_1``, ...) to
        ``self.df``.
        '''
        feature_name = self._feature_name(f)

        # encode to one-hot
        one_hot = to_one_hot(self._flatten(df))

        # build all one-hot columns in one go rather than inserting them
        # one at a time into an empty dataframe
        column_names = ['{}_{}'.format(feature_name, i) for i in range(one_hot.shape[1])]
        encoded = pd.DataFrame(one_hot, columns=column_names)

        # concat one-hot dataframe to our main features dataframe
        self.df = pd.concat([self.df, encoded], axis=1)

# Build the dataset from the default 'data' directory, then show its shape
# and the first rows of the combined feature dataframe.
ds = InputDataset()
print(ds.df.shape)
ds.df.head()

(1300000, 23)


Unnamed: 0,n1,n2,n3,n4,c1_0,c1_1,c1_2,c1_3,c1_4,c1_5,...,c1_9,c1_10,c2_0,c2_1,c2_2,c2_3,c2_4,c2_5,c2_6,c2_7
0,4.0,4.0,7.0,2.445,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.0,1.0,1.0,9.303,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,2.0,3.0,7.288,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,3.0,4.0,2.607,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.0,4.0,5.0,8.564,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
