In [44]:
import pandas as pd
import numpy as np
from faker import Faker

In [45]:
# Module-level Faker instance; the series helpers defined below use it to
# produce random strings for default column names.
fake = Faker()

In [51]:
def _nuller(sr, fillrate):
    if fillrate < 1:
        sr.loc[sr.sample(frac=1-fillrate).index] = np.NaN
    return sr
    

def numericalsr(length, name=None, mean=0, std=1, fillrate=1):
    """Build a Series of normally distributed random numbers.

    Parameters
    ----------
    length : int
        Number of values to draw.
    name : str, optional
        Series name. When omitted a fresh random string is generated on every
        call (a ``fake.pystr()`` default in the signature would be evaluated
        only once, giving every unnamed series the same name).
    mean, std : float
        Parameters passed to ``np.random.normal``.
    fillrate : float
        Fraction of values to keep; the rest are nulled by ``_nuller``.

    Returns
    -------
    pandas.Series
    """
    if name is None:
        name = fake.pystr()
    draws = np.random.normal(mean, std, length)
    return _nuller(pd.Series(draws, name=name), fillrate)

def categoricalsr(length, name=None, classes=3, fillrate=1, rates=None):
    """Build a Series of randomly chosen class labels.

    Parameters
    ----------
    length : int
        Number of values to draw.
    name : str, optional
        Series name. When omitted a fresh random string is generated on every
        call rather than once at definition time.
    classes : int or sequence
        Either the number of random 5-character-max labels to invent, or the
        explicit labels to sample from.
    fillrate : float
        Fraction of values to keep; the rest are nulled by ``_nuller``.
    rates : sequence of float, optional
        Sampling probability for each explicit class. ``None`` samples
        uniformly. Ignored when ``classes`` is an int.

    Returns
    -------
    pandas.Series

    Raises
    ------
    ValueError
        If explicit ``classes`` and ``rates`` differ in length.
    """
    if name is None:
        name = fake.pystr()
    if isinstance(classes, int):
        classes = [fake.pystr(max_chars=5) for _ in range(classes)]
        # Generated labels are always sampled uniformly.
        rates = None
    elif rates is not None and len(classes) != len(rates):
        raise ValueError("The number of classes much match the rate array of probabilities")
    # Pass the probabilities through; previously they were validated but never used.
    draws = np.random.choice(classes, length, p=rates)
    return _nuller(pd.Series(draws, name=name), fillrate)

In [107]:
# Demo: 10 normal draws (mean 0, std 3) named "hello"; fillrate=0.3 keeps
# about 30% of the values and nulls the rest.
mysr1 = numericalsr(10, "hello", 0, 3, fillrate=0.3)
mysr1

0         NaN
1    3.353741
2         NaN
3   -1.715854
4         NaN
5         NaN
6    3.004630
7         NaN
8         NaN
9         NaN
Name: hello, dtype: float64

In [108]:
# Demo: 10 draws from two explicit classes. `rates` is passed explicitly so
# its length always matches the number of classes.
mysr2 = categoricalsr(10, "world", ["foo", "bar"], rates=[0.5, 0.5])
mysr2

0    foo
1    bar
2    bar
3    bar
4    bar
5    bar
6    bar
7    bar
8    bar
9    foo
Name: world, dtype: object

In [151]:
def build_df(length, num_numeric=1, num_categoric=0, fillrate=1):
    """Build a random DataFrame with auto-named numeric and categorical columns.

    Parameters
    ----------
    length : int
        Number of rows.
    num_numeric : int
        Number of numeric columns, named ``num_col_<i>``.
    num_categoric : int
        Number of categorical columns, named ``cat_col_<i>``.
    fillrate : float
        Fill rate forwarded to every column generator.

    Returns
    -------
    pandas.DataFrame
        Columns appear in random order. With zero columns requested, an empty
        frame with ``length`` rows is returned.
    """
    numericals = [
        numericalsr(name=f"num_col_{i}", length=length, fillrate=fillrate)
        for i in range(num_numeric)
    ]
    categoricals = [
        categoricalsr(name=f"cat_col_{i}", length=length, fillrate=fillrate)
        for i in range(num_categoric)
    ]
    columns = numericals + categoricals
    if not columns:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(index=pd.RangeIndex(length))
    np.random.shuffle(columns)
    return pd.concat(columns, axis=1)

def build_specific_df(length=5, numerical_cols=("foo", "bar"), categorical_cols=("hello",)):
    """Build a random DataFrame with explicitly named columns.

    Parameters
    ----------
    length : int
        Number of rows.
    numerical_cols : iterable of str
        Names of the numeric columns to generate.
    categorical_cols : iterable of str
        Names of the categorical columns to generate.

    Returns
    -------
    pandas.DataFrame
        Columns appear in random order. With no column names given, an empty
        frame with ``length`` rows is returned.
    """
    # Defaults are tuples so no mutable object is shared between calls.
    numericals = [numericalsr(name=n, length=length) for n in numerical_cols]
    categoricals = [categoricalsr(name=n, length=length) for n in categorical_cols]
    columns = numericals + categoricals
    if not columns:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(index=pd.RangeIndex(length))
    np.random.shuffle(columns)
    return pd.concat(columns, axis=1)

def build_explicit_df(columns, length):
    """Build a random DataFrame from a list of column specifications.

    Parameters
    ----------
    columns : iterable of dict
        Each dict needs a ``"type"`` key (``"numerical"`` or
        ``"categorical"``) and a ``"name"`` key.
    length : int
        Number of rows.

    Returns
    -------
    pandas.DataFrame
        Columns in the order given. An empty specification list yields an
        empty frame with ``length`` rows.

    Raises
    ------
    ValueError
        If a specification has an unknown ``"type"``.
    """
    data = []
    for spec in columns:
        kind = spec["type"]
        if kind == "numerical":
            series = numericalsr(length=length, name=spec["name"])
        elif kind == "categorical":
            series = categoricalsr(length=length, name=spec["name"])
        else:
            raise ValueError("Do not recognize type")
        data.append(series)
    if not data:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(index=pd.RangeIndex(length))
    return pd.concat(data, axis=1)

In [152]:
# Demo: 5 rows, 3 numeric columns and 1 categorical column, each generated
# with fillrate=0.4.
df = build_df(5, 3, 1, fillrate=0.4)
df

Unnamed: 0,num_col_1,num_col_2,cat_col_0,num_col_0
0,,,,-0.435743
1,0.688378,,dtfLg,
2,,-0.788613,gWuxm,
3,-2.185066,,,
4,,0.208969,,0.150107


In [153]:
# Demo with the defaults: 5 rows, numeric columns "foo" and "bar",
# categorical column "hello".
df = build_specific_df()
df

Unnamed: 0,hello,foo,bar
0,fGQTZ,1.844535,0.547812
1,VLmNQ,-0.11503,-0.323428
2,UuYIz,-1.027702,-0.559754
3,fGQTZ,-0.389888,0.853739
4,UuYIz,0.461028,0.422611


In [154]:
# Demo: describe each column as a {"type", "name"} dict and build a
# 5-row frame from the specification list.
columns = [{"type": "numerical", "name": "hello"}, {"type": "categorical", "name": "world"}]
length = 5
df = build_explicit_df(columns, length)
df

Unnamed: 0,hello,world
0,-0.793243,DIHDT
1,-0.734836,NXukK
2,-2.112485,NXukK
3,0.395655,NXukK
4,-1.049983,DIHDT


## As classes

In [145]:
class Base():
    """Common state for a fake column: its name, fill rate and last value.

    Parameters
    ----------
    name : str
        Column name.
    fillrate : float
        Fraction of entries to keep when nulling a generated series.
    """

    def __init__(self, name, fillrate):
        self.name = name
        self.fillrate = fillrate
        # Holds a generated series once one has been stored; None until then.
        self.value = None

    def _nuller(self, sr):
        """Return ``sr`` with a random ``1 - fillrate`` share of entries missing.

        ``sr`` itself is returned when ``self.fillrate >= 1``; otherwise a new
        series is returned.
        """
        if self.fillrate < 1:
            # Randomly choose the index labels that will become missing.
            dropped = sr.sample(frac=1 - self.fillrate).index
            # mask() upcasts where needed (e.g. int64 -> float64) instead of
            # assigning NaN in place, which is a deprecated upcast on integer
            # series; it also avoids the np.NaN alias removed in NumPy 2.
            sr = sr.mask(sr.index.isin(dropped))
        return sr

class Numerical(Base):
    """Fake numeric column drawn from a normal distribution.

    Parameters
    ----------
    name : str
        Column name.
    fillrate : float
        Fraction of entries to keep in generated series.
    mu, sig : float
        Mean and standard deviation passed to ``np.random.normal``.
    """

    # Number of rows generated for each operand when two columns are multiplied.
    _PRODUCT_LENGTH = 10

    def __init__(self, name, fillrate=1, mu=0, sig=1):
        super().__init__(name, fillrate)
        self.mu = mu
        self.sig = sig

    def generate(self, length):
        """Draw ``length`` values, null a share of them, store and return them.

        The returned series is the same one stored in ``self.value`` (the
        data is drawn once, so the stored and returned values agree).
        """
        draws = pd.Series(np.random.normal(self.mu, self.sig, length), name=self.name)
        self.value = self._nuller(draws)
        return self.value

    def __mul__(self, other):
        """Multiply freshly generated values of ``self`` and ``other``.

        Returns a ``Base`` whose ``value`` is the element-wise product and
        whose ``fillrate`` is the observed non-null share of that product.
        """
        if not hasattr(other, "generate"):
            return NotImplemented
        name = f"{self.name} * {other.name}"
        product = self.generate(self._PRODUCT_LENGTH) * other.generate(self._PRODUCT_LENGTH)
        product = product.rename(name)
        nc = Base(name=name, fillrate=product.notna().mean())
        nc.value = product
        return nc
        
    
class Categorical(Base):
    """Fake categorical column sampled from a fixed set of classes.

    Parameters
    ----------
    name : str
        Column name.
    classes : sequence
        Labels to sample from.
    fillrate : float
        Fraction of entries to keep in generated series.
    rates : sequence of float, optional
        Sampling probability per class; ``None`` samples uniformly.

    Raises
    ------
    ValueError
        If ``rates`` is given and its length differs from ``classes``.
    """

    def __init__(self, name, classes, fillrate=1, rates=None):
        super().__init__(name, fillrate)
        if rates is not None and len(classes) != len(rates):
            raise ValueError("The number of classes much match the rate array of probabilities")
        self.rates = rates
        self.classes = classes

    def generate(self, length):
        """Sample ``length`` labels, null a share of them, store and return them.

        The result is also kept in ``self.value``, matching
        ``Numerical.generate``.
        """
        draws = np.random.choice(self.classes, length, p=self.rates)
        self.value = self._nuller(pd.Series(draws, name=self.name))
        return self.value
    
    
    
class NumericalCategorical(Categorical):
    """Categorical column whose classes are integers.

    Parameters
    ----------
    name : str
        Column name.
    classes : int or sequence
        An int ``k`` expands to the labels ``0 .. k-1``; a sequence is used
        as given.
    fillrate : float
        Fraction of entries to keep in generated series.
    rates : sequence of float, optional
        Sampling probability per class; ``None`` samples uniformly.
    """

    def __init__(self, name, classes=2, fillrate=1, rates=None):
        if isinstance(classes, int):
            # Honour the requested class count instead of always using two.
            classes = np.arange(classes)
        super().__init__(name, classes, fillrate, rates)


class StringCategorical(Categorical):
    """Categorical column whose classes are strings.

    When ``classes`` is an int, that many random labels (at most 5 characters
    each) are generated with Faker; otherwise ``classes`` is used as given.
    """

    def __init__(self, name, classes=2, fillrate=1, rates=None):
        if isinstance(classes, int):
            generator = Faker()
            labels = []
            for _ in range(classes):
                labels.append(generator.pystr(max_chars=5))
            classes = labels

        super().__init__(name, classes, fillrate, rates)

In [141]:
# Two numeric columns with fillrate 0.9, used below to demonstrate `*`.
n1 = Numerical("foo", fillrate=0.9)
n2 = Numerical("bar", fillrate=0.9)

In [142]:
# Demo: 10 draws from the explicit classes 0 and 1.
Categorical("foo", classes=[0, 1]).generate(10)

[0, 1]


0    1
1    0
2    1
3    0
4    1
5    0
6    0
7    0
8    1
9    1
Name: foo, dtype: int64

In [143]:
# Demo: the default of 2 classes gives the integer labels 0 and 1.
NumericalCategorical("foo").generate(10)

[0 1]


0    0
1    1
2    0
3    1
4    0
5    1
6    0
7    1
8    1
9    1
Name: foo, dtype: int64

In [144]:
# Demo: the default of 2 classes gives two randomly generated string labels.
StringCategorical("bar").generate(10)

['yiEII', 'cQMDR']


0    cQMDR
1    yiEII
2    yiEII
3    yiEII
4    cQMDR
5    cQMDR
6    cQMDR
7    cQMDR
8    cQMDR
9    yiEII
Name: bar, dtype: object

In [79]:
# Numerical.__mul__ returns a Base object whose `.value` holds the product series.
out = n1 * n2

In [80]:
# Share of non-null entries in the product series.
out.value.notna().mean()

0.8

In [81]:
# Demo: with fillrate=0.2 about 20% of the 10 values are kept; the rest are NaN.
Numerical("foo", fillrate=0.2).generate(10)

0         NaN
1         NaN
2         NaN
3         NaN
4    0.650986
5         NaN
6         NaN
7    0.478742
8         NaN
9         NaN
Name: foo, dtype: float64

In [82]:
# Demo: same nulling applied to a string-valued categorical column.
Categorical("bar", classes=["cat", "dog"], fillrate=0.2).generate(10)

0    NaN
1    dog
2    NaN
3    NaN
4    NaN
5    NaN
6    cat
7    NaN
8    NaN
9    NaN
Name: bar, dtype: object

In [151]:
def make_fake_df(columns, length):
    """Generate every column and join the results side by side.

    Parameters
    ----------
    columns : iterable
        Objects exposing ``generate(length)`` that returns a pandas Series.
    length : int
        Number of rows requested from each column.

    Returns
    -------
    pandas.DataFrame
        One column per generated series, in the order given.
    """
    generated = []
    for column in columns:
        generated.append(column.generate(length))
    return pd.concat(generated, axis=1)

def make_ml_df(length, num_numerical=3, num_categorical=0, target="regression"):
    """Build a fake ML dataset: feature columns plus an optional target.

    Parameters
    ----------
    length : int
        Number of rows.
    num_numerical : int
        Number of ``Numerical`` feature columns, named ``col0``, ``col1``, ...
    num_categorical : int
        Number of ``StringCategorical`` feature columns; their numbering
        continues after the numeric ones.
    target : str
        ``"regression"`` appends a ``Numerical`` column named ``target``;
        ``"binary"`` appends a two-class ``NumericalCategorical`` column named
        ``target``; any other value prints ``No target`` and adds nothing.

    Returns
    -------
    pandas.DataFrame
    """
    features = []
    for idx in range(num_numerical):
        features.append(Numerical(f"col{idx}"))
    # Categorical column numbers pick up where the numeric ones stopped.
    last = num_numerical + num_categorical
    for idx in range(num_numerical, last):
        features.append(StringCategorical(f"col{idx}"))

    if target == "regression":
        features.append(Numerical("target"))
    elif target == "binary":
        features.append(NumericalCategorical("target", classes=2))
    else:
        print("No target")

    generated = [column.generate(length) for column in features]
    return pd.concat(generated, axis=1)


In [152]:
# Demo: 3 numeric features (the default), 1 string categorical feature and
# the default regression target.
make_ml_df(10, num_categorical=1)

Unnamed: 0,col0,col1,col2,col3,target
0,1.166716,0.165018,-0.372199,CadWP,-0.277651
1,-0.371489,-0.272256,0.829776,CadWP,1.14433
2,0.205692,0.116641,-1.160677,CadWP,-0.161559
3,-0.168146,-0.206963,0.102786,CadWP,-1.093907
4,-1.381588,-0.098769,0.071964,CadWP,-0.657218
5,1.725209,1.547574,-1.155772,CadWP,0.096041
6,-1.232182,-0.626387,-0.941681,CadWP,-0.572916
7,0.980313,0.116887,0.084984,wuVDs,0.354939
8,0.138823,0.767381,-1.186398,wuVDs,-0.878667
9,1.24542,1.470808,1.168506,CadWP,-0.061453


In [153]:
# Demo: same features, but with a two-class integer target.
make_ml_df(10, num_categorical=1, target="binary")

Unnamed: 0,col0,col1,col2,col3,target
0,0.692361,1.808478,2.441017,Lpowc,0
1,1.062888,-1.045956,0.531568,Lpowc,0
2,-0.749478,-0.283598,0.598761,Lpowc,1
3,0.067989,-1.484349,0.529063,Lpowc,1
4,0.748349,-1.199722,0.28261,Lpowc,0
5,2.220727,-0.870024,0.534309,Lpowc,1
6,-1.801828,0.308892,0.959539,Lpowc,0
7,0.949607,-0.429758,-1.160111,iItHQ,1
8,-0.153457,0.287677,1.515233,Lpowc,1
9,-0.135892,0.479812,0.039241,Lpowc,1


In [191]:
# Demo: build a 10-row frame from two explicitly constructed numeric columns.
make_fake_df([Numerical("col1"), Numerical("col2")], 10)

Unnamed: 0,col1,col2
0,0.44195,-0.570931
1,0.334847,1.810529
2,0.023267,0.000408
3,0.244626,-1.182621
4,-0.63779,0.524023
5,0.727407,-1.065066
6,-1.506381,-1.275591
7,-0.701288,0.962239
8,0.043097,-0.318712
9,-0.143329,-0.578753
