In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
np.random.seed(42)
tf.random.set_seed(42)



In [3]:
class GAN:
    def __init__(self, latent_dim, data_dim):
        self.latent_dim = latent_dim
        self.data_dim = data_dim

        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy',
                                   optimizer=Adam(learning_rate=0.0002, beta_1=0.5),
                                   metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes noise as input and generates data
        z = Input(shape=(self.latent_dim,))
        generated_data = self.generator(z)

        # For the combined model, we will only train the generator
        self.discriminator.trainable = False

        # The discriminator takes generated data as input and determines validity
        validity = self.discriminator(generated_data)

        # The combined model (stacked generator and discriminator)
        self.combined = Model(z, validity)
        self.combined.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001, beta_1=0.5))

        # Reset the discriminator's trainable attribute to True after compiling the combined model
        self.discriminator.trainable = True

        # Debugging: Check trainable weights
        print("Discriminator trainable weights:", self.discriminator.trainable_weights)
        print("Generator trainable weights:", self.generator.trainable_weights)
        print("Combined model trainable weights:", self.combined.trainable_weights)

    def build_generator(self):
        model = Sequential()
        model.add(Input(shape=(self.latent_dim,)))
        model.add(Dense(256))
        model.add(LeakyReLU(negative_slope=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(self.data_dim, activation='sigmoid'))
        model.summary()

        noise = Input(shape=(self.latent_dim,))
        data = model(noise)

        return Model(noise, data)

    def build_discriminator(self):
        model = Sequential()
        model.add(Input(shape=(self.data_dim,)))
        model.add(Dense(256))
        model.add(LeakyReLU(negative_slope=0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        data = Input(shape=(self.data_dim,))
        validity = model(data)

        return Model(data, validity)

    def train(self, data, epochs, batch_size):
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):
            # Train Discriminator
            idx = np.random.randint(0, data.shape[0], batch_size)
            real_data = data[idx].todense()  # Convert sparse to dense if needed

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            generated_data = self.generator.predict(noise)

            d_loss_real = self.discriminator.train_on_batch(real_data, valid)
            d_loss_fake = self.discriminator.train_on_batch(generated_data, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            g_loss = self.combined.train_on_batch(noise, valid)

            # Print the losses
            print(f"{epoch} [D loss: {d_loss}] [G loss: {g_loss}]")


In [4]:
# Load CSV files and merge them
ads_data_path = r'C:\Users\lucas\Downloads\GES Hackathon\decrypted_file\train\train_data_ads.csv'  # Replace with actual file path
feeds_data_path = r'C:\Users\lucas\Downloads\GES Hackathon\decrypted_file\train\train_data_feeds.csv'  # Replace with actual file path

ads_data = pd.read_csv(ads_data_path)
feeds_data = pd.read_csv(feeds_data_path)

In [5]:
# Fractions of data used as loading both of them in full raises a memory allocation error due to large size
sample_ads_data = ads_data.sample(frac=.1, random_state=42)
sample_feeds_data = feeds_data.sample(frac=.1, random_state=42)

# Merge data
combined_data = pd.merge(sample_ads_data, sample_feeds_data, left_on='user_id', right_on='u_userId')
print(combined_data.shape)
combined_data.head()

(7317291, 63)


Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,e_ch,e_m,e_po,e_pl,e_rn,e_section,e_et,label_y,cillabel,pro
0,389752,0,192266,2,2,46,354,2,11,8,...,19,320,2,214,2,0,202206080822,-1,-1,0
1,389752,0,192266,2,2,46,354,2,11,8,...,19,320,9,214,6,0,202206032022,-1,-1,0
2,389752,0,192266,2,2,46,354,2,11,8,...,19,320,10,1117,1,0,202206052125,-1,-1,0
3,389752,0,192266,2,2,46,354,2,11,8,...,19,320,4,214,4,0,202206032020,-1,-1,0
4,389752,0,192266,2,2,46,354,2,11,8,...,19,320,6,214,1,1,202206070027,-1,-1,0


In [35]:

numeric_cols = combined_data.select_dtypes(include=['number']).columns
combined_data[numeric_cols] = combined_data[numeric_cols].fillna(combined_data[numeric_cols].mean())

categorical_cols = combined_data.select_dtypes(exclude=['number']).columns
combined_data[categorical_cols] = combined_data[categorical_cols].fillna('Unknown')

numeric_features = []
categorical_features = ['age', 'gender', 'residence', 'city', 'city_rank', 'series_dev', 'series_group', 'residence', 'u_refreshTimes_y',
                        'u_feedLifeCycle_y', 'device_size']


numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[

        ('cat', categorical_transformer, categorical_features)
    ])


preprocessed_data = preprocessor.fit_transform(combined_data)


print(f"Shape of preprocessed data: {preprocessed_data.shape}")

# Extract fitted transformers

fitted_categorical_transformer = preprocessor.named_transformers_['cat']



Shape of preprocessed data: (7317291, 1522)


In [33]:
numeric_features = []
# Calculate indices for synthetic data
total_numeric_features = len(numeric_features)
total_categorical_features = sum(len(categories) for categories in fitted_categorical_transformer.categories_)# Debug: Print the counts
print(f"Total numeric features: {total_numeric_features}")
print(f"Total categorical features (after encoding): {total_categorical_features}")

Total numeric features: 0
Total categorical features (after encoding): 1522


In [19]:
 #Calculate indices for numeric and categorical data
numeric_data_indices = list(range(total_numeric_features))
categorical_data_indices = list(range(total_numeric_features, total_numeric_features + total_categorical_features))

# Debug: Print extracted indices
print(f"Numeric data indices: {numeric_data_indices}")
print(f"Categorical data indices: {categorical_data_indices}")

Numeric data indices: [0, 1]
Categorical data indices: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 

In [34]:
print(f"Shape of synthetic data: {synthetic_data.shape}")

Shape of synthetic data: (100000, 1520)


In [37]:
# Define GAN parameters
latent_dim = 100

# Assuming combined_data is the preprocessed data
# Extract only categorical data for training
preprocessed_categorical_data = preprocessed_data[:, len(numeric_features):]

# Calculate the number of categorical features after one-hot encoding
total_categorical_features = preprocessed_categorical_data.shape[1]
data_dim = total_categorical_features

# Instantiate and train the GAN
gan = GAN(latent_dim, data_dim)
gan.train(preprocessed_categorical_data, epochs=400, batch_size=64)
#Epochs can be increased but due to limited processing power only 400 was used

# Generate synthetic data
num_samples = 100000  # 100k samples used to come close to the size of original 700k fracitonal set 
noise = np.random.normal(0, 1, (num_samples, latent_dim))
synthetic_data = gan.generator.predict(noise)

# Debug: Check the shape of the synthetic data
print(f"Shape of synthetic data: {synthetic_data.shape}")

# Fit the transformer separately (needed for inverse_transform)
fitted_categorical_transformer = OneHotEncoder(handle_unknown='ignore').fit(combined_data[categorical_features])

# Inverse transform categorical features
original_categorical_data = fitted_categorical_transformer.inverse_transform(synthetic_data)

# Combine back into a single DataFrame
original_data_df = pd.DataFrame(original_categorical_data, columns=categorical_features)

# Convert data types for categorical columns
for col in categorical_features:
    original_data_df[col] = original_data_df[col].astype('category')

# Save the DataFrame to a CSV file
original_data_df.to_csv(r'C:\Users\lucas\Downloads\GES Hackathon\decrypted_file\train\synthetic_data1.csv', index=False)

print(original_data_df.head())


Discriminator trainable weights: [<KerasVariable shape=(1522, 256), dtype=float32, path=sequential_42/dense_84/kernel>, <KerasVariable shape=(256,), dtype=float32, path=sequential_42/dense_84/bias>, <KerasVariable shape=(256, 1), dtype=float32, path=sequential_42/dense_85/kernel>, <KerasVariable shape=(1,), dtype=float32, path=sequential_42/dense_85/bias>]
Generator trainable weights: [<KerasVariable shape=(100, 256), dtype=float32, path=sequential_43/dense_86/kernel>, <KerasVariable shape=(256,), dtype=float32, path=sequential_43/dense_86/bias>, <KerasVariable shape=(256,), dtype=float32, path=sequential_43/batch_normalization_21/gamma>, <KerasVariable shape=(256,), dtype=float32, path=sequential_43/batch_normalization_21/beta>, <KerasVariable shape=(256, 1522), dtype=float32, path=sequential_43/dense_87/kernel>, <KerasVariable shape=(1522,), dtype=float32, path=sequential_43/dense_87/bias>]
Combined model trainable weights: [<KerasVariable shape=(100, 256), dtype=float32, path=sequen

In [38]:
combined_data.describe()

Unnamed: 0,log_id,label_x,user_id,age,gender,residence,city,city_rank,series_dev,series_group,...,e_ch,e_m,e_po,e_pl,e_rn,e_section,e_et,label_y,cillabel,pro
count,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,...,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0,7317291.0
mean,549876.5,0.008491531,193367.9,5.489529,2.37477,26.56742,269.8585,3.412955,24.50233,4.563047,...,17.7823,832.0099,7.751448,1628.133,3.958115,0.2024469,202206100000.0,-0.7985306,-0.9994927,7.429418
std,319012.9,0.09175743,54403.43,2.172962,0.7348371,9.339597,98.51109,1.280022,7.745248,1.935197,...,3.514302,429.6367,4.333345,852.2211,6.623305,0.4018236,20046.63,0.6019542,0.03184848,24.78124
min,2.0,0.0,100006.0,2.0,2.0,11.0,101.0,2.0,11.0,2.0,...,1.0,14.0,1.0,0.0,1.0,0.0,202206000000.0,-1.0,-1.0,0.0
25%,275448.0,0.0,145244.0,3.0,2.0,19.0,178.0,2.0,16.0,3.0,...,19.0,506.0,5.0,888.0,1.0,0.0,202206000000.0,-1.0,-1.0,0.0
50%,547770.0,0.0,193283.0,6.0,2.0,26.0,297.0,3.0,27.0,5.0,...,19.0,841.0,7.0,1674.0,2.0,0.0,202206100000.0,-1.0,-1.0,0.0
75%,823833.0,0.0,240827.0,7.0,2.0,33.0,343.0,5.0,31.0,6.0,...,19.0,1217.0,10.0,2305.0,4.0,0.0,202206100000.0,-1.0,-1.0,0.0
max,1176632.0,1.0,287180.0,9.0,4.0,46.0,441.0,5.0,37.0,8.0,...,20.0,1483.0,26.0,3189.0,99.0,1.0,202206100000.0,1.0,1.0,100.0


In [44]:
df = original_data_df.apply(pd.to_numeric, errors='ignore')

df.describe()

  df = original_data_df.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,age,gender,residence,city,city_rank,series_dev,series_group,residence.1,city_rank.1,u_refreshTimes_y,u_feedLifeCycle_y,device_size
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,5.49534,2.84339,28.52026,256.84344,3.73059,23.36777,4.89981,25.69826,3.5929,5.25389,16.31294,1860.04691
std,2.287368,0.873928,10.145917,98.204549,1.152077,7.608936,1.984956,10.30325,1.114559,3.144457,1.128504,438.743735
min,2.0,2.0,11.0,103.0,2.0,11.0,2.0,11.0,2.0,0.0,10.0,1001.0
25%,3.0,2.0,19.0,168.0,3.0,16.0,3.0,17.0,3.0,2.0,16.0,1524.0
50%,6.0,3.0,27.0,256.0,4.0,23.0,5.0,23.0,4.0,6.0,17.0,1916.0
75%,7.0,4.0,40.0,333.0,5.0,31.0,7.0,33.0,5.0,8.0,17.0,2231.0
max,9.0,4.0,46.0,441.0,5.0,37.0,8.0,46.0,5.0,9.0,17.0,2579.0
