In [4]:
import pandas as pd
import os
import re
import json
import gc

In [5]:
class AlphaBoosting:
    def __init__(self, root=None, train_csv_url=None, test_csv_url=None, validation_index=None, timestamp=None,
                 label=None, categorical_features=None, numerical_features=None, validation_ratio=0.1, ngram=(1,1),
                 downsampling=1, configuration=None):
        if configuration == None:
            # set configurations
            self.ROOT = root
            self.OUTDIR = root + 'output/'
            self.LOGDIR = root + 'log/'
            self.train_csv_url = train_csv_url
            self.test_csv_url = test_csv_url
            self.timestamp = timestamp
            self.label = label
            self.categorical_features = categorical_features
            self.numerical_features = numerical_features
            self.downsampling = downsampling
            # read data
            self._read_data()
            if validation_index == None:
                self.validation_index = list(range(int(self.train_len*(1-validation_ratio)), self.train_len))
            else:
                self.validation_index = validation_index
        else:
            with open(configuration, 'r') as file:
                self.ROOT = file['root']
                self.OUTDIR = file['root'] + 'output/'
                self.LOGDIR = file['root'] + 'log/'
                self.train_csv_url = file['train_csv_url']
                self.test_csv_url = file['test_csv_url']
                # read data
                self._read_data()
                self.timestamp = file['timestamp']
                self.label = file['label']
                self.categorical_features = file['categorical_features']
                self.numerical_features = file['numerical_features']
                self.validation_index = file['validation_index']
                self.downsampling = file['downsampling']
        
        # biuld relavent directories
        if not os.path.exists(self.OUTDIR): os.makedirs(self.OUTDIR)
        if not os.path.exists(self.LOGDIR): os.makedirs(self.LOGDIR)
            
        # save configuration
        self._save_config(self.LOGDIR + 'config.json')
        
        # generate todo list
        dictionary = self._generate_todo_list()

        # feature engineering
        self._feature_engineering(dictionary)
        
        # down sampling
        self._down_sampling(dictionary)
        
        # get validation
        self._generate_validation(dictionary)
        
        # grid search
        self._grid_search(dictionary)
    
    
    ######### util functions #########
    def _read_data(self):
        self.train = pd.DataFrame()
        self.test = pd.DataFrame()
        if self.train_csv_url != None: self.train = pd.read_csv(self.train_csv_url)
        if self.test_csv_url != None: self.test = pd.read_csv(self.test_csv_url)
        self.df = pd.concat([self.train, self.test], ignore_index=True)
        self.train_len = self.train.shape[0]
        
    def _renew_status(self, dictionary, key, file_url):
        dictionary[key] = True
        with open(file_url, 'w') as file:
            json.dump(dictionary, file)
            
    def _save_config(self, url):
        d = {
            'root':                 self.ROOT,
            'train_csv_url':        self.train_csv_url,
            'test_csv_url':         self.test_csv_url,
            'timestamp':            self.timestamp,
            'label':                self.label,
            'categorical_features': self.categorical_features,
            'numerical_features':   self.numerical_features,
            'validation_index':     self.validation_index,
            'downsampling':         self.downsampling
        }
        with open(url, 'w') as file: json.dump(d, file)
        del d
        gc.collect()
            
            
    
    ######### main functions #########
    def _generate_todo_list(self):
        if os.path.exists(self.LOGDIR + 'todo_list.json'):
            with open(self.LOGDIR + 'todo_list.json', 'r') as file:
                dictionary = json.load(file)
        else:
            dictionary = {'feature_engineering': False, 
                          'down_sampling': (self.downsampling==0),
                          'validation_generation': False,
                          'grid_search': False}
            with open(self.LOGDIR + 'todo_list.json', 'w') as file: json.dump(dictionary, file)
        return dictionary
    
    def _feature_engineering(self, dictionary):
        feature_engineering_file_url = self.LOGDIR + 'feature_engineering.txt'
        if not dictionary['feature_engineering']:
            if not os.path.exists(feature_engineering_file_url):
                self._generate_feature_engineering_file(feature_engineering_file_url)
            with open(feature_engineering_file_url, 'r') as file:
                line = file.readline()
                while line:
                    line = re.sub('\\n', '', line)
                    self._add_column(line)
                    # TODO
                    # feature engineering: create feature line by line in the file
        self._renew_status(dictionary, 'feature_engineering', (self.LOGDIR + 'todo_list.json'))
    
    def _down_sampling(self, dictionary):
        down_sampling_file_url = self.LOGDIR + 'downsampling.json'
        down_sampling_url = self.ROOT + 'output/downsampling/'
        if not os.path.exists(down_sampling_url): os.makedirs(down_sampling_url)
        if not dictionary['down_sampling']:
            if not os.path.exists(down_sampling_file_url):
                self._generate_down_sampling_file(down_sampling_file_url)
            with open(down_sampling_file_url, 'r') as file:
                downsampling_work_log = json.load(file)
            index_list = self._generate_down_sampling_index_file(downsampling_work_log['generate_index_file'])
            self._renew_status(downsampling_work_log, 'generate_index_file', self.LOGDIR + 'downsampling.json')
            
            # split file
            for i in range(self.downsampling):
                if not downsampling_work_log[str(i)]:
                    if not os.path.exists(down_sampling_url+str(i)+'/'): os.makedirs(down_sampling_url + str(i) + '/')
                    self._get_file_split(index_list[i], down_sampling_url + str(i) + '/')
                    self._renew_status(downsampling_work_log, str(i), self.LOGDIR + 'downsampling.json')
                    
            # concatenant file
            for i in range(self.downsampling):
                if not downsampling_work_log['concat' + str(i)]:
                    self._get_file_concat(down_sampling_url + str(i) + '/', down_sampling_url)
                    self._renew_status(downsampling_work_log, str(i), self.LOGDIR + 'downsampling.json')
                    
            self._renew_status(dictionary, 'concat' + str(i), self.LOGDIR + 'todo_list.json')
    
    def _generate_validation(self):
        pass
    
    def _grid_search(self, dictionary):
        pass
    
    
    ######### support functions #########
    # feature engineering
    def _generate_feature_engineering_file(self, feature_engineering_file_url):
        with open(feature_engineering_file_url, 'w') as file:
            pass
    
    def _add_column(self):
        pass
        
    # down sampling
    def _generate_down_sampling_file(self, down_sampling_file_url):
        dictionary = {'generate_index_file': False}
        for i in range(self.downsampling):
            dictionary[str(i)] = False
            dictionary['concat' + str(i)] = False
        with open(down_sampling_file_url, 'w') as file: json.dump(dictionary, file)
    
    def _generate_down_sampling_index_file(self, has_file_built):
        return [[1,2,3]]
    
    def _get_file_split(self, index, split_folder):
        pass
    
    def _get_file_concat(self, split_folder, concat_folder):
        pass
    
    # grid search
    def _generate_grid_search_file(self):
        pass
    
    """ 
    feature_engineering todo list
    file_name: train__<function_name>__<feature_combination_name>__<possible_param>.ftr
                test__<function_name>__<feature_combination_name>__<possible_param>.ftr
    """

    """
    down sampling todo list
    if positive : negative < 1 : 1 in train (validation excluded), do down sampling, sample 5 files in default
    """
#     train_exclude_val = self.train.drop(self.validation_index, axis=0)
#     positive = list(train_exclude_val[train_exclude_val[self.label==1]].index.values)
#     negative = list(train_exclude_val[train_exclude_val[self.label==0]].index.values)
#     ratio = len(positive) / len(negative)
#     if ratio >= 0.5:
#         dictionary['down_sampling'] = True
#     else:
#         down_sampling_dict = {}

In [6]:
a = AlphaBoosting('./')

TypeError: _generate_validation() takes 1 positional argument but 2 were given

In [11]:
class a:
    def __init__(self):
        b = {}
        self.f(b)
        print(b)
    
    def f(self, d):
        d['a'] = 1
        
m = a()

{'a': 1}
