In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# configuration: every tunable knob of the synthetic dataset lives in this cell
num_fields = 8                        # how many fields (columns) the dataset has
num_rows = 1_000_000                  # how many rows to generate
mean_num_features = 20                # controls the average number of features per field
n_subsets = num_fields * num_fields   # how many field subsets are considered for interactions
nonzero_pct = 0.8                     # mean fraction of non-zero features per field; 'zero' is assumed to be a special value such as "Missing"

In [4]:
# Relative sizes of the train / test / validation parts of the single common dataframe.
train_size = 0.8
test_size = 0.1
validation_size = 0.1

# Row boundaries (start inclusive, end exclusive): train starts at row 0,
# test occupies the last rows, and validation ends exactly where test begins.
train_st = 0
train_end = int(train_size * num_rows)
test_end = num_rows
test_st = num_rows - int(test_size * num_rows)
validation_end = test_st
validation_st = num_rows - int((validation_size + test_size) * num_rows)

In [5]:
# One seeded generator shared by the data-generation cells, so the dataset is reproducible.
rng = np.random.default_rng(seed=42)

In [6]:
# Draw the number of features of every field: one geometric sample per field,
# plus one, so each field has at least two features.
geometric_p = 1. / mean_num_features
num_features = []
for _ in range(num_fields):
    num_features.append(1 + rng.geometric(geometric_p))
num_features

[48, 47, 48, 7, 3, 30, 29, 62]

In [7]:
# generate fields: one integer-coded column per field, values in 0..n-1
cols = []
for n in num_features:
  # 1 with probability nonzero_pct, otherwise 0 (the special "zero" value)
  features = rng.binomial(1, nonzero_pct, size=num_rows)
  if n > 2:
    half = n // 2
    rest = n - half
    # The sum of the two draws lies in 1..n-1, so multiplying keeps zeros at
    # zero and spreads the ones over the remaining feature ids.
    low_part = rng.integers(0, half, size=num_rows)
    high_part = rng.integers(1, rest + 1, size=num_rows)
    features = features * (low_part + high_part)
  cols.append(features.astype(np.int32))

In [8]:
# choose field subsets for interaction. This will affect the labels.
# Every subset holds 2 or 3 distinct field indices.
subsets = [
  rng.choice(num_fields, size=rng.integers(2, 4), replace=False)
  for _ in range(n_subsets)
]

In [9]:
# generate feature effects per field: one standard-exponential weight for
# every feature id of every field
feature_effects = [rng.standard_exponential(size=n) for n in num_features]

In [10]:
# generate indicators of positiveness of the cumulative effect of each subset.
# For every subset, a row's indicator is 1.0 when the largest feature effect
# among the subset's fields exceeds 2, and 0.0 otherwise.
subset_thresholding = []
for subset in subsets:
  strongest = np.zeros(num_rows)
  for field in subset:
    # look up each row's effect for this field and keep the running maximum
    np.maximum(strongest, feature_effects[field][cols[field]], out=strongest)
  subset_thresholding.append((strongest > 2).astype(np.float32))

In [11]:
# labels are randomly generated: a row's positive probability is the fraction
# of subsets whose indicator is on, clipped away from exactly 0 and 1
prob = np.clip(
    np.column_stack(subset_thresholding).mean(axis=-1),
    1e-5,
    1 - 1e-5,
)
label = rng.binomial(n=1, p=prob)

In [12]:
# mean true probability vs. empirical positive rate; the two should be close
prob.mean(), label.mean()

(0.19604313, 0.195608)

In [13]:
# the naive loss of a predictor that predicts the average CTR for all events,
# i.e. the binary entropy of the empirical positive rate
base_rate = np.mean(label)
naive_loss = -base_rate * np.log(base_rate) - (1 - base_rate) * np.log(1 - base_rate)
naive_loss

0.4942532035777099

In [14]:
# the best achievable loss is the loss of an oracle that knows the "true"
# probability of each event. Should be significantly lower than the naive loss,
# so that there is something to actually learn.
positive_term = -label * np.log(prob)
negative_term = (1 - label) * np.log(1 - prob)
optimal_loss = positive_term - negative_term   # per-row log-loss of the oracle
optimal_loss.mean()

0.32664252463672233

In [15]:
# Give every field its own disjoint id range by adding a running offset (the
# total number of features of all preceding fields) to its column.
# The shifted columns go into a new list instead of overwriting `cols`: the
# raw 0..n-1 codes in `cols` are used to index `feature_effects`, and mutating
# them here would make this cell shift twice when re-run.
shifted_cols = []
shift = 0
for col, n in zip(cols, num_features):
  shifted_cols.append(col + shift)
  shift += n

# Assemble the dataset: one column per field plus the binary label.
col_names = [f"f{i}" for i in range(num_fields)]
df = pd.DataFrame(np.column_stack(shifted_cols), columns=col_names)
df["label"] = label

In [16]:
# Preview the assembled dataset (pandas truncates the display of a frame this large).
df

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,label
0,34,69,133,147,151,168,183,224,0
1,41,80,120,144,151,167,200,245,0
2,13,48,121,147,152,167,186,227,0
3,0,92,95,146,152,182,190,240,0
4,25,73,95,143,152,153,200,260,0
...,...,...,...,...,...,...,...,...,...
999995,5,64,124,148,152,180,197,219,0
999996,12,65,122,146,150,167,188,247,0
999997,29,48,124,148,152,170,202,212,1
999998,21,83,120,148,150,153,192,265,0


In [17]:
# Optional: uncomment to also export the full, unsplit dataset as a single CSV file.
#df.to_csv("dataset.csv", index=False)

In [18]:
# Largest value anywhere in the frame: column-wise maxima first, then their maximum.
overall_max = df.max().max()
print(overall_max)

# First row that contains that maximum in any column, kept as a one-row frame.
has_max = (df == overall_max).any(axis=1)
max_row = df.loc[has_max].head(1)
print(max_row)
print(max_row.index[0])

273
     f0  f1   f2   f3   f4   f5   f6   f7  label
661  29  78  105  148  151  171  197  273      1
661


In [21]:
# Training part: rows train_st (inclusive) to train_end (exclusive) of the common frame.
train_df = df.iloc[train_st:train_end]

In [22]:
# Make sure the row holding the overall maximum value is part of the training
# data (presumably so that sizes derived from train.csv cover every feature id
# -- confirm against the consumer of these files).
# The row is appended whenever it lies outside [train_st, train_end), which
# also covers a training window that does not start at row 0.
max_row_idx = max_row.index[0]
if not (train_st <= max_row_idx < train_end):
    # Rebuild from df rather than from train_df so re-running never appends twice.
    train_df = pd.concat([df[train_st:train_end], max_row])

In [32]:
# Output directory for the generated CSV files.
# The default is the original machine-specific location; set the
# TRIPLE_DATASET_DIR environment variable to write somewhere else without
# editing the notebook. The directory must already exist.
path = os.environ.get(
    "TRIPLE_DATASET_DIR",
    "/home/viderman/persistent_drive/pytorch-tensorfm/data/test-datasets/triple-dataset",
)

In [33]:
# Write the training rows to train.csv under `path`, without the index column.
train_df.to_csv(f"{path}/train.csv", index=False)

In [34]:
# Write the test rows (test_st inclusive to test_end exclusive) to test.csv, without the index column.
test_df = df.iloc[test_st:test_end]
test_df.to_csv(f"{path}/test.csv", index=False)

In [35]:
# Write the validation rows (validation_st inclusive to validation_end exclusive) to validation.csv, without the index column.
validation_df = df.iloc[validation_st:validation_end]
validation_df.to_csv(f"{path}/validation.csv", index=False)

# Sanity - verify that logistic regression's log-loss is between the naive and the optimal

In [27]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import log_loss

In [28]:
# Baseline model: one-hot encode every field column, then fit a
# cross-validated logistic regression on the encoded features.
pipeline = make_pipeline(
    make_column_transformer(
        # handle_unknown="ignore": a feature id that appears only in the
        # held-out split is encoded as all zeros instead of raising an error
        # at prediction time.
        (OneHotEncoder(handle_unknown="ignore"), col_names),
        remainder="passthrough"
    ),
    LogisticRegressionCV()
)

In [29]:
# Random 80/20 split for the sanity check. The fixed random_state makes the
# split, and therefore the reported log-loss, reproducible across runs.
train, test = train_test_split(df, train_size=0.8, random_state=42)

In [30]:
# Fit the baseline on the training split: every column except the label is an input.
train_features = train.drop(columns=["label"])
train_labels = train["label"]
pipeline = pipeline.fit(train_features, train_labels)

In [31]:
# Log-loss of the baseline on the held-out split, scored on the predicted
# probability of the positive class (column 1 of predict_proba).
test_features = test.drop(columns=["label"])
positive_proba = pipeline.predict_proba(test_features)[:, 1]
log_loss(test["label"], positive_proba)

0.35988299246501126