In [9]:
import random
import pandas as pd
import os

In [10]:
# Dataset dimensions.
num_rows = 1_000_000       # number of generated samples
num_features = 10          # how many distinct integer values each column can take
number_of_columns = 100    # total number of feature columns

# Feature column names i1 .. i{number_of_columns}; only i1..i4 determine the label.
col_names = [f"i{n}" for n in range(1, number_of_columns + 1)]

In [11]:
# Relative sizes of the train / test / validation splits. All three splits are
# row ranges [start, end) inside the single common dataframe.
train_size = 0.8
test_size = 0.1
validation_size = 0.1

# Train occupies the head of the frame.
train_st, train_end = 0, int(train_size * num_rows)

# Test occupies the tail of the frame.
test_end = num_rows
test_st = test_end - int(test_size * num_rows)

# Validation ends where test begins and starts (validation + test) rows from the end.
validation_end = test_st
validation_st = num_rows - int((validation_size + test_size) * num_rows)

In [12]:
# Seed the global RNG so that the random function below (and any data drawn from
# `random` afterwards in the same kernel session) is reproducible on Restart & Run All.
random_seed = 42
random.seed(random_seed)

# Lookup table representing a random function: 4-tuple (i1, i2, i3, i4) -> binary label.
# Each key component ranges over its own block of `num_features` consecutive integers
# (i1 in [0, num_features), i2 in [num_features, 2 * num_features), and so on), and every
# combination gets an independent random 0/1 value.
hashtable = {
    (i1, i2, i3, i4): random.randint(0, 1)
    for i1 in range(num_features)
    for i2 in range(num_features, 2 * num_features)
    for i3 in range(2 * num_features, 3 * num_features)
    for i4 in range(3 * num_features, 4 * num_features)
}

In [13]:
num_indices = number_of_columns

# Column i{idx + 1} is drawn uniformly from its own block of `num_features` consecutive
# integers: [idx * num_features, (idx + 1) * num_features - 1].
data = {}
for idx in range(num_indices):
    key = f'i{idx + 1}'  # i1, i2, i3, ..., i100
    low = idx * num_features
    high = low + num_features - 1  # random.randint bounds are inclusive
    data[key] = [random.randint(low, high) for _ in range(num_rows)]

# Create DataFrame
df = pd.DataFrame(data)

# Add 'label' column based on hashtable: the label is the random function's value at
# (i1, i2, i3, i4). Zipping just the four key columns avoids building a full row Series
# for every row, which is what a row-wise `df.apply(..., axis=1)` would do.
df['label'] = [
    hashtable[combo]
    for combo in zip(df['i1'], df['i2'], df['i3'], df['i4'])
]

print(df.head())

   i1  i2  i3  i4  i5  i6  i7  i8  i9  i10  ...  i92  i93  i94  i95  i96  i97  \
0   2  14  24  35  46  59  67  78  89   92  ...  914  927  937  949  950  966   
1   1  18  28  37  47  52  64  71  83   92  ...  913  924  931  946  953  965   
2   2  13  21  34  41  54  69  78  82   94  ...  910  922  931  948  959  960   
3   7  10  22  32  48  54  62  79  86   90  ...  912  924  930  940  956  965   
4   8  12  27  33  43  57  61  78  86   97  ...  919  927  939  942  952  969   

   i98  i99  i100  label  
0  974  987   991      1  
1  971  984   991      0  
2  976  981   995      1  
3  979  985   991      0  
4  974  982   997      1  

[5 rows x 101 columns]


# Sanity check: verify that logistic regression's log-loss lies between the naive baseline and the optimum

In [15]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

In [16]:
# One-hot encode every column listed in `col_names`; any column not listed is
# passed through unchanged.
one_hot_columns = make_column_transformer(
    (OneHotEncoder(), col_names),
    remainder="passthrough",
)

# Baseline model: one-hot features fed into a cross-validated logistic regression.
pipeline = make_pipeline(one_hot_columns, LogisticRegressionCV())

In [17]:
# Random split used only for the sanity check in the following cells. It reuses the
# configured `train_size` fraction instead of repeating the literal, and fixes
# `random_state` so the recorded log-loss / AUC can be reproduced on a re-run.
train, test = train_test_split(df, train_size=train_size, random_state=0)

In [18]:
# Fit the baseline pipeline on the training split: every column except 'label'
# is a feature, and 'label' is the target.
train_features = train.drop(columns=["label"])
train_labels = train["label"]
pipeline = pipeline.fit(train_features, train_labels)

In [19]:
# Predicted probabilities on the held-out split; column 1 of predict_proba is the
# probability of the second class (label 1 for these 0/1 labels).
test_features = test.drop(columns=["label"])
prediction = pipeline.predict_proba(test_features)[:, 1]

In [20]:
# Log-loss of the fitted pipeline on the held-out split. For reference, always
# predicting 0.5 scores ln(2) ~ 0.693 on binary labels, so a value only slightly
# below that means the model barely improves on that naive baseline.
log_loss(test["label"], prediction)

0.6912283063829319

In [21]:
# ROC AUC on the same held-out split; 0.5 corresponds to chance-level ranking.
roc_auc_score(test["label"], prediction)

0.5344221127393676

In [22]:
#####################################

# Save the Dataset

In [25]:
# Current login name taken from the environment, used to build the output path.
# NOTE: this is None when USER is not set in the environment.
user = os.environ.get("USER")

In [26]:
# Output directory for the train / test / validation CSV files, located under
# /home/<user>/persistent_drive.
# NOTE(review): nothing in this notebook creates this directory - presumably it must
# already exist before the save cells run; confirm.
path=f"/home/{user}/persistent_drive/pytorch-tensorfm/data/test-datasets/random_binary_function_4_100_cols"

In [27]:
# Write the training rows [train_st, train_end) to train.csv, without the index column.
train_rows = df.iloc[train_st:train_end]
train_rows.to_csv(f"{path}/train.csv", index=False)

In [28]:
# Write the test rows [test_st, test_end) to test.csv, without the index column.
test_rows = df.iloc[test_st:test_end]
test_rows.to_csv(f"{path}/test.csv", index=False)

In [29]:
# Write the validation rows [validation_st, validation_end) to validation.csv,
# without the index column.
validation_rows = df.iloc[validation_st:validation_end]
validation_rows.to_csv(f"{path}/validation.csv", index=False)