In [1]:
import pandas as pd
import numpy as np
import random
import math

In [2]:
def linear_dep(row, scale=1, offset=0, noise=0):
    return row['target'] * scale + offset + noise * scale * (random.random() - 0.5)


def build_linear_dep(scale=None, noise=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _noise = random.uniform(0.05, 0.1) if noise == None else noise
    return lambda row: linear_dep(row, _scale, _noise)


In [3]:
def non_linear_dep(row, scale=1, noise=0, offset=0, shift=0):
    return scale * math.cos(row['target'] * math.pi / 2 * (1 + shift + noise * (random.random() - 0.5))) + offset


def build_non_linear_dep(scale=None, noise=None,  offset=None, shift=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _noise = random.uniform(0.05, 0.1) if noise == None else noise
    _offset = random.uniform(0, 20) if offset == None else offset
    _shift = random.uniform(0, 1) if shift == None else shift
    return lambda row: non_linear_dep(row, _scale, _noise, _offset, _shift)


In [4]:
def build_category_dep(category_count=10, split_index=5):
    list = random.sample(range(1, category_count + 1), category_count)
    listA = list[:split_index]
    listB = list[split_index:]

    # print('target > 0.5:', listA)
    # print('target < 0.5:', listB)

    def category_lambda(row, noise=0):
        if row['target'] > 0.5:
            if random.random() > noise:
                return random.choice(listA)
            else:
                return random.choice(listB)
        else:
            if random.random() > noise:
                return random.choice(listB)
            else:
                return random.choice(listA)

    return category_lambda


category_dep = build_category_dep()


In [5]:
def linear_noise(scale=1, offset=0):
    return offset + scale * random.random()


def build_linear_noise(scale=None, offset=None):
    _scale = random.uniform(1, 100) if scale == None else scale
    _offset = random.uniform(0, 20) if offset == None else offset
    return lambda _: linear_noise(_scale, _offset)


In [6]:
def category_noise(category_count=10):
    return random.randint(0, category_count) + 1


def build_category_noise(category_count=None):
    _category_count = random.randint(
        2, 10) if category_count == None else category_count
    return lambda _: category_noise(_category_count)


In [7]:
def generate_numerical_dataset(dataset_length=10_000):

    df = pd.DataFrame(np.random.random(
        size=dataset_length), columns=['target'])

    lambda_builders = [build_linear_dep,
                       build_non_linear_dep,
                       build_category_dep,
                       build_linear_noise,
                       build_category_noise]

    lambdas = sum([[builder() for i in range(6)]
                   for builder in lambda_builders], [])

    for i in range(len(lambdas)):
        df['x{}'.format(i+1)] = df.apply(lambdas[i], axis=1)

    return df


In [None]:
num_df = generate_numerical_dataset(100000)
num_df.to_csv('./num_dataset.csv')  
num_df

In [9]:
def build_category_to_numerical_dep(categories, noise=None):

    list = random.sample(categories, len(categories)//2)
    _noise = random.uniform(0.05, 0.1) if noise == None else noise

    def category_lambda(row):
        if row['target'] in list:
            return random.uniform(0, 0.5) + (random.random() - 0.5) * _noise
        else:
            return random.uniform(0.5, 1) + (random.random() - 0.5) * _noise

    return category_lambda


In [10]:
def generate_categorical_dataset(dataset_length=10_000, categories_number=10):

    categories = range(1, categories_number+1)

    df = pd.DataFrame(np.random.choice(categories,
                      size=dataset_length), columns=['target'])

    lambda_builders = [lambda :build_category_to_numerical_dep(categories),
                       build_linear_noise,
                       build_category_noise]

    lambdas = sum([[builder() for i in range(10)]
                   for builder in lambda_builders], [])

    for i in range(len(lambdas)):
        df['x{}'.format(i+1)] = df.apply(lambdas[i], axis=1)

    return df


In [11]:
cat_df = generate_categorical_dataset()
cat_df

Unnamed: 0,target,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30
0,3,0.604648,0.214971,0.486948,0.480191,0.143781,0.851474,0.695174,0.491580,0.552039,...,5,7,3,7,7,4,8,2,2,6
1,2,0.457020,0.599644,0.236226,0.698112,0.137749,0.625636,0.448616,0.446722,0.947833,...,1,6,6,1,5,4,3,4,7,6
2,8,0.391558,0.354158,0.968581,0.197573,0.965081,0.147084,0.035710,0.784178,0.788716,...,4,9,4,5,6,1,11,2,7,7
3,7,0.135549,0.349581,0.705592,0.668239,0.656403,0.557517,0.705571,0.752791,0.089267,...,1,9,6,8,2,6,4,1,10,7
4,10,0.993218,0.499923,0.929377,0.089148,0.927690,0.235416,0.557237,0.058722,0.754848,...,2,4,3,9,7,3,10,2,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9,0.678811,0.913003,0.921730,0.695930,0.570438,0.247816,0.086844,0.541848,0.737159,...,6,3,8,7,4,1,10,3,7,3
9996,10,0.958512,0.270732,0.912848,0.239090,0.691969,0.345847,0.829043,0.318814,0.879438,...,5,5,2,3,6,3,8,4,6,2
9997,1,0.214524,0.603359,0.441055,0.408412,0.379301,0.212593,0.133409,0.791371,0.272161,...,4,6,3,11,1,8,10,3,1,6
9998,5,0.142566,0.788216,0.015849,0.008279,0.108209,0.360054,0.625399,0.152978,0.320603,...,5,4,3,9,6,3,2,3,11,5
