In [5]:
import getpass
import os
import random

import pandas as pd

In [14]:
# Dataset shape: number of generated rows, and the number of distinct ids
# each of the three categorical columns can take.
num_rows, num_features = 1_000_000, 20

# Names of the categorical input columns.
col_names = [
    "i",
    "j",
    "k",
]

In [31]:
# Relative sizes of the train / validation / test partitions and the matching
# half-open row ranges [start, end) inside the single common dataframe.
# Row layout: [ train | validation | test ].
train_size = 0.8
test_size = 0.1
validation_size = 0.1

# The fractions must cover the whole dataframe; otherwise rows would be
# silently dropped from the saved partitions.
assert abs(train_size + validation_size + test_size - 1.0) < 1e-9, \
    "train/validation/test sizes must sum to 1"

# Boundaries come from cumulative fractions, so the three ranges are
# contiguous and non-overlapping by construction. round() is used instead of
# int() because a product such as 0.58 * 100 evaluates to 57.99999999999999,
# which int() would truncate to 57.
train_st = 0
train_end = round(train_size * num_rows)
validation_st = train_end
validation_end = round((train_size + validation_size) * num_rows)
test_st = validation_end
test_end = num_rows

In [9]:
# Random ground-truth function: maps every (i, j, k) triple to a binary label.
# i, j and k come from three disjoint ranges of num_features consecutive ids.
# The RNG is seeded so that Restart & Run All regenerates the same function
# (the seed value itself is arbitrary).
random.seed(0)

hashtable = {
    (i, j, k): random.randint(0, 1)
    for i in range(num_features)
    for j in range(num_features, 2 * num_features)
    for k in range(2 * num_features, 3 * num_features)
}

In [10]:
# Sample num_rows random (i, j, k) triples, each coordinate drawn uniformly
# from its own id range. Seeding here keeps the cell reproducible even when
# it is re-run on its own (the seed value itself is arbitrary).
random.seed(1)
data = {
    'i': [random.randint(0, num_features-1) for _ in range(num_rows)],
    'j': [random.randint(num_features, 2 * num_features-1) for _ in range(num_rows)],
    'k': [random.randint(2 * num_features, 3 * num_features-1) for _ in range(num_rows)]
}

# Create DataFrame
df = pd.DataFrame(data)

# Label every row with the value of the random function for its triple.
# A plain dict lookup over the zipped columns replaces the row-wise
# df.apply(..., axis=1), which constructs a Series per row and is very slow
# for a million rows; the resulting labels are identical.
df['label'] = [hashtable[key] for key in zip(data['i'], data['j'], data['k'])]

print(df.head())

    i   j   k  label
0   8  21  53      0
1  14  28  59      0
2  18  32  55      0
3   2  37  53      1
4   0  28  45      0


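# Sanity check - verify that logistic regression's log-loss lies between the naive baseline (always predicting 0.5, i.e. ln 2 ≈ 0.693) and the optimum (0, since the label is a deterministic function of (i, j, k))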

In [12]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

In [15]:
# One-hot encode the categorical id columns (any other column is passed
# through unchanged), then fit a LogisticRegressionCV on the encoded features.
encoder = make_column_transformer(
    (OneHotEncoder(), col_names),
    remainder="passthrough",
)
classifier = LogisticRegressionCV()
pipeline = make_pipeline(encoder, classifier)

In [16]:
# Hold out 20% of the rows for the sanity check. random_state pins the
# shuffle so the reported metrics are reproducible across runs.
train, test = train_test_split(df, train_size=0.8, random_state=0)

In [17]:
# Fit on the training rows; the target column is kept out of the features.
train_features = train.drop(columns=["label"])
train_labels = train["label"]
pipeline = pipeline.fit(train_features, train_labels)

In [18]:
# Predicted probability of the positive class (column 1) for the held-out rows.
test_features = test.drop(columns=["label"])
test_probabilities = pipeline.predict_proba(test_features)
prediction = test_probabilities[:, 1]

In [19]:
# Log-loss of the predicted probabilities on the held-out rows.
test_log_loss = log_loss(test["label"], prediction)
test_log_loss

0.6904104407011591

In [20]:
# ROC AUC of the predicted probabilities on the held-out rows.
test_roc_auc = roc_auc_score(test["label"], prediction)
test_roc_auc

0.5483032637925953

In [22]:
#####################################

# Save the Dataset

In [None]:
# Account name used to build the output path. A custom "user" environment
# variable is honoured when set; otherwise fall back to the login name of the
# current user. Environment variable names are case-sensitive on POSIX and
# shells export "USER" (upper case), so os.environ["user"] alone raises
# KeyError unless the variable was defined by hand.
user = os.environ.get("user") or getpass.getuser()

In [36]:
# Directory the train / test / validation CSV files are written to.
path = (
    f"/home/{user}/persistent_drive/pytorch-tensorfm/data/test-datasets/random_binary_function"
)

In [37]:
# Write the training rows [train_st, train_end) without the index column.
train_rows = df.iloc[train_st:train_end]
train_rows.to_csv(f"{path}/train.csv", index=False)

In [38]:
# Write the test rows [test_st, test_end) without the index column.
test_rows = df.iloc[test_st:test_end]
test_rows.to_csv(f"{path}/test.csv", index=False)

In [39]:
# Write the validation rows [validation_st, validation_end) without the index column.
validation_rows = df.iloc[validation_st:validation_end]
validation_rows.to_csv(f"{path}/validation.csv", index=False)