In [1]:
import pandas as pd
import numpy as np
import random
import math

In [24]:
def linear_dep(row, scale=1, offset=0, noise=0):
    return row['target'] * scale + offset + noise * scale * (random.random() - 0.5)


def build_linear_dep(scale=None, noise=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _noise = random.uniform(0.05, 0.1) if noise == None else noise
    return lambda row: linear_dep(row, _scale, _noise)


In [32]:
def non_linear_dep(row, scale=1, noise=0, offset=0, shift=0):
    return scale * math.cos(row['target'] * math.pi / 2 * (1 + shift + noise * (random.random() - 0.5))) + offset


def build_non_linear_dep(scale=None, noise=None,  offset=None, shift=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _noise = random.uniform(0.05, 0.1) if noise == None else noise
    _offset = random.uniform(0, 20) if offset == None else offset
    _shift = random.uniform(0, 1) if shift == None else shift
    return lambda row: non_linear_dep(row, _scale, _noise, _offset, _shift)


In [52]:
def build_category_dep(category_count=10, split_index=5):
    list = random.sample(range(1, category_count + 1), category_count)
    listA = list[:split_index]
    listB = list[split_index:]

    # print('target > 0.5:', listA)
    # print('target < 0.5:', listB)

    def category_lambda(row, noise=0):
        if row['target'] > 0.5:
            if random.random() > noise:
                return random.choice(listA)
            else:
                return random.choice(listB)
        else:
            if random.random() > noise:
                return random.choice(listB)
            else:
                return random.choice(listA)

    return category_lambda


category_dep = build_category_dep()


In [37]:
def linear_noise(scale=1, offset=0):
    return offset + scale * random.random()


def build_linear_noise(scale=None, offset=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _offset = random.uniform(0, 20) if offset == None else offset
    return lambda _: linear_noise(_scale, _offset)


In [44]:
def category_noise(category_count=10):
    return random.randint(0, category_count) + 1


def build_category_noise(category_count=None):
    _category_count = random.randint(
        2, 10) if category_count == None else category_count
    return lambda _: category_noise(_category_count)


In [64]:
def generate_numerical_dataset(dataset_length=10):

    df = pd.DataFrame(np.random.random(
        size=dataset_length), columns=['target'])

    lambda_builders = [build_linear_dep,
                       build_non_linear_dep,
                       build_category_dep,
                       build_linear_noise,
                       build_category_noise]

    lambdas = sum([[builder() for i in range(6)]
                   for builder in lambda_builders], [])

    for i in range(len(lambdas)):
        df['x{}'.format(i+1)] = df.apply(lambdas[i], axis=1)

    return df


In [66]:
num_df = generate_numerical_dataset()
num_df

Unnamed: 0,target,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30
0,0.635333,58.935203,3.774338,37.936338,2.315038,32.674967,29.090916,13.237348,23.145365,11.560255,...,56.561889,21.777896,91.619344,13.042594,4,1,7,4,1,1
1,0.265685,24.700426,1.62604,15.908967,0.999354,13.695383,12.202344,14.991992,36.786969,70.789454,...,18.060219,21.749539,74.264415,27.820441,3,8,6,2,2,1
2,0.576055,53.445149,3.429827,34.403922,2.104049,29.631309,26.382583,13.473799,26.379325,24.691791,...,37.038909,16.557528,57.251325,8.615545,4,2,8,1,4,5
3,0.915674,84.898788,5.403603,54.64185,3.31285,47.069045,41.899174,11.650443,11.404036,-35.783151,...,60.032094,14.318544,63.12848,16.866479,4,9,1,3,3,3
4,0.123817,11.561369,0.801539,7.455024,0.494404,6.411158,5.720633,15.377731,39.577561,83.342101,...,14.277794,15.525324,12.026503,19.993354,3,2,5,6,6,6
5,0.193795,18.04237,1.208234,11.625035,0.743477,10.004192,8.917817,15.223842,38.398011,78.55789,...,31.963078,26.984605,49.912239,11.170537,1,7,5,4,4,3
6,0.256611,23.860033,1.573304,15.368241,0.967057,13.229474,11.787764,15.041712,37.191808,71.631699,...,38.381035,13.17447,83.367119,17.931159,1,3,3,5,4,1
7,0.447625,41.550688,2.683426,26.750778,1.64693,23.037082,20.514852,14.229879,31.454147,45.538895,...,49.526403,24.812628,27.897337,16.841534,3,7,5,2,5,7
8,0.543388,50.419734,3.239976,32.457307,1.987778,27.954035,24.890096,13.716917,27.115243,30.598233,...,19.644438,22.782951,56.259903,20.271393,4,8,6,2,4,4
9,0.387539,35.985901,2.334225,23.170277,1.433069,19.951992,17.769652,14.530697,33.52747,54.983238,...,24.245458,11.850958,11.89095,24.409104,1,7,4,3,5,2
