In [31]:
import pandas as pd
import numpy as np
import random
import math

In [32]:
def linear_dep(row, scale=1, offset=0, noise=0):
    return row['target'] * scale + offset + noise * scale * (random.random() - 0.5)


def build_linear_dep(scale=None, noise=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _noise = random.uniform(0.05, 0.1) if noise == None else noise
    return lambda row: linear_dep(row, _scale, _noise)


In [33]:
def non_linear_dep(row, scale=1, noise=0, offset=0, shift=0):
    return scale * math.cos(row['target'] * math.pi / 2 * (1 + shift + noise * (random.random() - 0.5))) + offset


def build_non_linear_dep(scale=None, noise=None,  offset=None, shift=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _noise = random.uniform(0.05, 0.1) if noise == None else noise
    _offset = random.uniform(0, 20) if offset == None else offset
    _shift = random.uniform(0, 1) if shift == None else shift
    return lambda row: non_linear_dep(row, _scale, _noise, _offset, _shift)


In [34]:
def build_category_dep(category_count=10, split_index=5):
    list = random.sample(range(1, category_count + 1), category_count)
    listA = list[:split_index]
    listB = list[split_index:]

    # print('target > 0.5:', listA)
    # print('target < 0.5:', listB)

    def category_lambda(row, noise=0):
        if row['target'] > 0.5:
            if random.random() > noise:
                return random.choice(listA)
            else:
                return random.choice(listB)
        else:
            if random.random() > noise:
                return random.choice(listB)
            else:
                return random.choice(listA)

    return category_lambda


category_dep = build_category_dep()


In [35]:
def linear_noise(scale=1, offset=0):
    return offset + scale * random.random()


def build_linear_noise(scale=None, offset=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _offset = random.uniform(0, 20) if offset == None else offset
    return lambda _: linear_noise(_scale, _offset)


In [36]:
def category_noise(category_count=10):
    return random.randint(0, category_count) + 1


def build_category_noise(category_count=None):
    _category_count = random.randint(
        2, 10) if category_count == None else category_count
    return lambda _: category_noise(_category_count)


In [37]:
def generate_numerical_dataset(dataset_length=10):

    df = pd.DataFrame(np.random.random(
        size=dataset_length), columns=['target'])

    lambda_builders = [build_linear_dep,
                       build_non_linear_dep,
                       build_category_dep,
                       build_linear_noise,
                       build_category_noise]

    lambdas = sum([[builder() for i in range(6)]
                   for builder in lambda_builders], [])

    for i in range(len(lambdas)):
        df['x{}'.format(i+1)] = df.apply(lambdas[i], axis=1)

    return df


In [38]:
num_df = generate_numerical_dataset()
num_df

Unnamed: 0,target,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30
0,0.094422,9.506132,6.682251,8.74018,8.557205,4.044812,3.530198,109.171061,59.418549,40.770592,...,33.5807,16.216202,17.844832,50.004556,4,11,3,6,3,3
1,0.483448,48.337357,33.879164,44.528842,43.475175,20.479677,17.768056,52.796519,33.859422,26.142854,...,15.27373,25.71344,27.60144,16.324439,2,7,2,1,1,3
2,0.771509,77.090564,54.017557,71.029134,69.330748,32.649138,28.310707,-15.598814,2.938018,8.654035,...,14.088523,28.021127,14.793822,42.457149,4,4,4,5,2,1
3,0.356771,35.692939,25.023169,32.875159,32.105011,15.128074,13.131853,77.682278,44.725857,32.204591,...,16.444771,22.011843,6.693999,24.695352,4,5,2,6,4,2
4,0.472815,47.275956,33.135772,43.550606,42.520737,20.030451,17.378882,56.084271,34.872361,26.397569,...,45.437196,16.387537,22.684821,21.83165,3,1,2,4,4,1
5,0.444194,44.419141,31.134894,40.917632,39.951821,18.821338,16.331402,60.211729,37.385368,28.788519,...,26.807465,14.486202,1.544833,32.177906,5,6,1,5,2,2
6,0.105378,10.599763,7.448217,9.748122,9.540624,4.507678,3.931189,108.364441,59.210301,40.536052,...,12.646819,7.642097,17.088684,58.095447,3,6,2,7,1,1
7,0.296625,29.689303,20.818296,27.341929,26.706397,12.587105,10.93056,87.161235,49.465345,35.183522,...,43.016172,10.624676,26.862777,14.473966,4,9,1,7,4,2
8,0.755724,75.514932,52.914004,69.576959,67.913903,31.98227,27.732986,-10.222139,4.74625,8.459605,...,41.472027,12.082503,25.082791,26.44756,4,4,2,3,3,2
9,0.028811,2.957142,2.095419,2.704327,2.668196,1.273028,1.128945,111.397809,60.507938,41.317809,...,48.296941,10.427075,27.087934,24.653197,4,2,3,7,3,3


In [39]:
def build_category_to_numerical_dep(categories, noise=None):

    list = random.sample(categories, len(categories)//2)
    _noise = random.uniform(0.05, 0.1) if noise == None else noise

    def category_lambda(row):
        if row['target'] in list:
            return random.uniform(0, 0.5) + (random.random() - 0.5) * _noise
        else:
            return random.uniform(0.5, 1) + (random.random() - 0.5) * _noise

    return category_lambda


In [40]:
def generate_categorical_dataset(dataset_length=10, categories_number=10):

    categories = range(1, categories_number+1)

    df = pd.DataFrame(np.random.choice(categories,
                      size=dataset_length), columns=['target'])

    lambda_builders = [lambda :build_category_to_numerical_dep(categories),
                       build_linear_noise,
                       build_category_noise]

    lambdas = sum([[builder() for i in range(10)]
                   for builder in lambda_builders], [])

    for i in range(len(lambdas)):
        df['x{}'.format(i+1)] = df.apply(lambdas[i], axis=1)

    return df


In [41]:
cat_df = generate_categorical_dataset()
cat_df

Unnamed: 0,target,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30
0,3,0.172847,0.562707,0.164292,0.212183,0.309543,0.673012,0.326863,0.483038,0.300763,...,7,5,5,2,7,2,1,8,3,10
1,1,0.352876,0.292886,0.875419,0.853449,0.722092,0.286163,0.556402,0.473729,0.79797,...,5,2,2,4,1,5,8,3,5,5
2,1,0.078544,0.37203,0.66653,0.483485,0.542596,0.068547,0.863601,0.066418,0.735057,...,2,4,3,5,6,1,7,3,3,4
3,1,0.100634,0.044886,0.677849,0.674219,0.698605,0.027275,0.938643,0.404152,0.927591,...,5,3,4,3,1,2,9,7,3,7
4,1,0.348953,0.043168,0.684553,0.644695,0.641955,0.098345,0.640671,0.148202,0.971343,...,2,3,6,6,5,2,2,4,1,1
5,8,0.778786,0.88549,0.432217,0.466058,0.58309,0.486637,0.262083,0.246847,0.684366,...,6,5,4,6,5,4,9,7,1,7
6,8,0.822129,0.690983,-0.013604,0.044593,0.760432,0.169432,0.410235,0.347171,0.992305,...,11,4,5,4,7,5,8,4,9,1
7,4,0.38054,0.094121,0.053437,0.328836,0.829539,0.048024,0.584236,0.283128,0.985361,...,2,1,1,5,2,4,6,4,3,4
8,3,0.443809,0.563207,0.196308,0.237554,0.042828,0.787935,0.193373,0.690034,0.469801,...,1,4,6,6,6,5,9,4,8,4
9,9,0.93596,0.43848,0.662125,0.77635,0.131244,0.690954,0.224048,0.870062,0.987409,...,3,5,5,1,1,2,3,5,3,3
