In [67]:
import pandas as pd
import numpy as np
import random
import math

In [68]:
def linear_dep(row, scale=1, offset=0, noise=0):
    return row['target'] * scale + offset + noise * scale * (random.random() - 0.5)


def build_linear_dep(scale=None, noise=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _noise = random.uniform(0.05, 0.1) if noise == None else noise
    return lambda row: linear_dep(row, _scale, _noise)


In [69]:
def non_linear_dep(row, scale=1, noise=0, offset=0, shift=0):
    return scale * math.cos(row['target'] * math.pi / 2 * (1 + shift + noise * (random.random() - 0.5))) + offset


def build_non_linear_dep(scale=None, noise=None,  offset=None, shift=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _noise = random.uniform(0.05, 0.1) if noise == None else noise
    _offset = random.uniform(0, 20) if offset == None else offset
    _shift = random.uniform(0, 1) if shift == None else shift
    return lambda row: non_linear_dep(row, _scale, _noise, _offset, _shift)


In [70]:
def build_category_dep(category_count=10, split_index=5):
    list = random.sample(range(1, category_count + 1), category_count)
    listA = list[:split_index]
    listB = list[split_index:]

    # print('target > 0.5:', listA)
    # print('target < 0.5:', listB)

    def category_lambda(row, noise=0):
        if row['target'] > 0.5:
            if random.random() > noise:
                return random.choice(listA)
            else:
                return random.choice(listB)
        else:
            if random.random() > noise:
                return random.choice(listB)
            else:
                return random.choice(listA)

    return category_lambda


category_dep = build_category_dep()


In [71]:
def linear_noise(scale=1, offset=0):
    return offset + scale * random.random()


def build_linear_noise(scale=None, offset=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _offset = random.uniform(0, 20) if offset == None else offset
    return lambda _: linear_noise(_scale, _offset)


In [72]:
def category_noise(category_count=10):
    return random.randint(0, category_count) + 1


def build_category_noise(category_count=None):
    _category_count = random.randint(
        2, 10) if category_count == None else category_count
    return lambda _: category_noise(_category_count)


In [73]:
def generate_numerical_dataset(dataset_length=10_000):

    df = pd.DataFrame(np.random.random(
        size=dataset_length), columns=['target'])

    lambda_builders = [build_linear_dep,
                       build_non_linear_dep,
                       build_category_dep,
                       build_linear_noise,
                       build_category_noise]

    lambdas = sum([[builder() for i in range(6)]
                   for builder in lambda_builders], [])

    for i in range(len(lambdas)):
        df['x{}'.format(i+1)] = df.apply(lambdas[i], axis=1)

    return df


In [74]:
num_df = generate_numerical_dataset()
num_df

Unnamed: 0,target,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30
0,0.781403,32.975111,44.826759,27.759833,77.913740,2.245265,33.194357,-0.858472,-4.265136,-10.505634,...,23.485086,4.483598,7.100648,58.231556,6,7,1,2,4,1
1,0.480664,20.316036,27.597777,17.113050,47.960037,1.413864,20.443463,15.415275,14.149126,32.134550,...,13.037980,5.931959,28.503801,6.812096,7,1,1,3,1,1
2,0.124489,5.323436,7.192872,4.503681,12.484786,0.429204,5.342120,28.969347,28.583835,70.171567,...,14.078739,6.147084,8.581087,31.928678,5,5,2,3,1,3
3,0.416937,17.633540,23.946905,14.856965,41.612759,1.237687,17.741511,17.906273,16.809075,42.283960,...,12.695960,12.870892,3.815553,65.604960,3,5,1,3,3,4
4,0.811357,34.235974,46.542791,28.820268,80.897175,2.328074,34.464365,-1.679772,-6.300884,-17.125763,...,12.895488,3.904772,35.910000,15.988580,6,9,3,1,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.941956,39.733312,54.024659,33.443746,93.904887,2.689119,40.001576,-6.491713,-10.414550,-31.101589,...,13.147778,6.731727,8.078441,43.181109,2,6,2,3,3,1
9996,0.616220,26.022032,35.363628,21.912018,61.461474,1.788613,26.190845,7.397094,5.478733,9.966005,...,21.762363,16.938920,19.197348,16.935108,3,3,3,2,8,3
9997,0.620577,26.205445,35.613254,22.066276,61.895464,1.800659,26.375589,7.156440,5.097970,11.521659,...,12.354138,6.630029,19.297376,41.530446,1,4,3,1,3,2
9998,0.536093,22.649207,30.773219,19.075339,53.480749,1.567098,22.793557,12.207048,10.162668,23.339074,...,16.111926,16.541741,31.295196,39.539161,7,5,2,3,3,3


In [75]:
def build_category_to_numerical_dep(categories, noise=None):

    list = random.sample(categories, len(categories)//2)
    _noise = random.uniform(0.05, 0.1) if noise == None else noise

    def category_lambda(row):
        if row['target'] in list:
            return random.uniform(0, 0.5) + (random.random() - 0.5) * _noise
        else:
            return random.uniform(0.5, 1) + (random.random() - 0.5) * _noise

    return category_lambda


In [76]:
def generate_categorical_dataset(dataset_length=10_000, categories_number=10):

    categories = range(1, categories_number+1)

    df = pd.DataFrame(np.random.choice(categories,
                      size=dataset_length), columns=['target'])

    lambda_builders = [lambda :build_category_to_numerical_dep(categories),
                       build_linear_noise,
                       build_category_noise]

    lambdas = sum([[builder() for i in range(10)]
                   for builder in lambda_builders], [])

    for i in range(len(lambdas)):
        df['x{}'.format(i+1)] = df.apply(lambdas[i], axis=1)

    return df


In [77]:
cat_df = generate_categorical_dataset()
cat_df

Unnamed: 0,target,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30
0,10,0.390739,0.285509,0.458316,0.060146,0.407601,0.760295,0.452805,0.680391,0.613638,...,11,4,5,5,1,9,8,6,5,3
1,5,0.430160,0.564910,0.198422,0.318145,0.485985,0.411926,0.706957,0.064732,0.091748,...,2,5,4,1,3,4,8,5,5,1
2,10,0.039237,0.430235,0.060158,0.500482,0.257117,0.814530,0.329865,0.777831,0.618708,...,7,4,5,2,4,3,9,4,5,4
3,8,0.845640,0.811217,0.021017,0.850280,0.111030,0.961682,0.595325,0.071242,0.311528,...,11,5,2,3,5,2,8,5,5,4
4,7,1.010295,0.959756,0.351937,0.216248,0.850470,0.312604,0.852050,0.325545,0.671633,...,4,3,2,4,5,8,1,5,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10,0.230086,0.167406,0.189513,0.182744,0.274070,0.692104,0.350118,0.800132,0.498978,...,3,3,2,4,5,2,2,10,5,1
9996,2,0.762944,0.979080,0.380051,0.420345,0.044018,0.435896,0.167836,0.975261,0.511987,...,5,6,3,5,3,2,5,10,4,2
9997,8,0.889403,0.659568,0.433134,0.889535,0.080801,0.875827,0.590529,0.183375,0.382441,...,8,4,4,1,4,7,7,5,5,4
9998,5,0.142633,0.507418,0.482209,0.015481,0.266772,0.079895,0.501047,0.315733,0.436031,...,4,6,2,3,5,9,7,3,3,3
