In [2]:
# Alternative approach (Kaggle notebook): https://www.kaggle.com/code/hughhuyton/multitouch-attribution-modelling
# Related package (ChannelAttribution on PyPI): https://pypi.org/project/ChannelAttribution/

In [None]:
# ------------------------------------beta 1.2------------------------------------
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

# Build a reproducible synthetic touchpoint log: one row per customer interaction.
np.random.seed(42)
n_customers = 10000

# Interactions per customer; randint's upper bound is exclusive, so the range is 2..15.
n_interactions = np.random.randint(2, 16, size=n_customers)

# One customer id per interaction row, laid out customer by customer.
customer_ids = np.repeat(np.arange(n_customers), n_interactions)

# Every customer's interactions fall on consecutive days starting 2024-01-01, so each
# customer's run of dates is a prefix of the longest run.
all_dates = pd.date_range(start='2024-01-01', periods=int(n_interactions.max()), freq='D').to_numpy()
interaction_dates = np.concatenate([all_dates[:n] for n in n_interactions])

# Per-interaction attributes. The order of these random draws is significant: it
# determines the values produced for the fixed seed above.
total_rows = len(customer_ids)
channels = np.random.choice(
    ['Email', 'Social Media', 'Paid Search', 'Direct', 'Organic Search'],
    total_rows,
    p=[0.25, 0.25, 0.2, 0.15, 0.15],
)
conversions = np.random.choice([0, 1], total_rows, p=[0.9, 0.1])

# One country per customer, then expanded to one value per interaction row.
countries = np.random.choice(['USA', 'Canada', 'UK', 'Germany', 'France'], n_customers)
country_repeated = np.repeat(countries, n_interactions)

data = dict(
    customer_id=customer_ids,
    interaction_date=interaction_dates,
    channel=channels,
    conversion=conversions,
    country=country_repeated,
)

# Keep each customer's rows together and in chronological order.
df = pd.DataFrame(data).sort_values(by=['customer_id', 'interaction_date'])
print(df.tail())
df.to_csv('lt.csv', index=False)

def _build_attribution_model(sequence_length, n_channels):
    """Return the compiled stacked-LSTM model that emits one value per time step."""
    model = Sequential([
        LSTM(64, input_shape=(sequence_length, n_channels), return_sequences=True),
        Dropout(0.5),
        LSTM(32, return_sequences=True),
        Dropout(0.5),
        TimeDistributed(Dense(1, activation='linear'))
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


def _plot_training_history(history):
    """Show train/validation loss and MAE per epoch in two side-by-side panels."""
    plt.figure(figsize=(12, 4))
    panels = (('loss', 'Loss'), ('mae', 'MAE'))
    for position, (metric, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f'Train {label}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
        plt.title(f'Model {label}')
        plt.ylabel(label)
        plt.xlabel('Epoch')
        plt.legend(loc='upper right')
    plt.show()


def _predictions_to_frame(X, y_pred, customer_ids_seq, channel_names):
    """Flatten per-step predictions into one row per real (non-padded) touchpoint."""
    # Real touchpoints are one-hot rows (sum 1); padded steps are all zeros.
    # np.nonzero walks the mask in row-major order: customer first, then step.
    customer_pos, step_pos = np.nonzero(X.sum(axis=-1) != 0)
    channel_codes = X[customer_pos, step_pos].argmax(axis=-1)
    return pd.DataFrame({
        'customer_id': np.asarray(customer_ids_seq)[customer_pos],
        'interaction': step_pos + 1,
        'channel': np.asarray(channel_names)[channel_codes],
        # float64 so the per-channel sums below are not accumulated in float32.
        'predicted_conversion_credit': y_pred[customer_pos, step_pos, 0].astype('float64'),
    })


def dda(country, source_df=None):
    """Train a per-country LSTM on touchpoint sequences and report channel credit.

    The rows for ``country`` are grouped into one sequence per customer, the
    channel of every touchpoint is one-hot encoded, and a stacked LSTM is
    trained to predict the ``conversion`` value of every step. The per-step
    predictions for all customers are then summed per channel.

    Args:
        country: Value of the ``country`` column to select.
        source_df: Optional frame to read from; it must provide the columns
            ``country``, ``customer_id``, ``channel`` and ``conversion``.
            Defaults to the module-level ``df``. Rows are used in the order
            they appear, so each customer's rows should already be in
            chronological order (the module-level ``df`` is sorted that way).

    Returns:
        None. When no rows match ``country`` a message is printed and the
        function returns early.

    Side effects:
        Prints statistics and the model summary, shows the training-history
        plot, and writes ``predictions_with_channels_{country}.csv`` and
        ``channel_credit_{country}.csv`` to the working directory.
    """
    frame = df if source_df is None else source_df

    # Explicit copy: a helper column is added below and must not be written
    # through a filtered view of the caller's frame.
    country_df = frame.loc[frame['country'] == country].copy()
    if country_df.empty:
        print(f"No data for {country}. Skipping...\n")
        return

    # Print basic statistics to check interactions
    print('\n\n\n')
    print(f'Total number of interactions in {country} dataframe: {len(country_df)}')
    print(f'Number of unique customers: {country_df["customer_id"].nunique()}')
    print(f'Total number of conversions in original dataframe: {country_df["conversion"].sum()}')

    # Encode channels as integer codes; row k of the identity matrix is the
    # one-hot vector for code k.
    label_encoder = LabelEncoder()
    country_df['channel_code'] = label_encoder.fit_transform(country_df['channel'])
    channel_names = label_encoder.classes_
    n_channels = len(channel_names)
    onehot_lookup = np.eye(n_channels, dtype='float32')

    # One sequence per customer, in order of first appearance (sort=False).
    sequences = []
    sequence_labels = []
    customer_ids_seq = []
    for customer_id, touchpoints in country_df.groupby('customer_id', sort=False):
        sequences.append(onehot_lookup[touchpoints['channel_code'].to_numpy()])
        sequence_labels.append(touchpoints['conversion'].to_numpy())
        customer_ids_seq.append(customer_id)

    # Pad sequences to have the same length.
    # NOTE: no masking layer is used, so padded steps (all-zero input, zero
    # target) take part in the loss and metrics.
    max_sequence_length = max(len(seq) for seq in sequences)
    X = pad_sequences(sequences, maxlen=max_sequence_length, dtype='float32', padding='post', value=0.0)
    y = pad_sequences(sequence_labels, maxlen=max_sequence_length, dtype='float32', padding='post', value=0.0)

    # Split the data into training and testing sets (by customer)
    X_train, X_test, y_train, y_test, customer_ids_train, customer_ids_test = train_test_split(
        X, y, customer_ids_seq, test_size=0.2, random_state=42
    )

    model = _build_attribution_model(max_sequence_length, n_channels)
    model.summary()

    # Train the model
    history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)
    _plot_training_history(history)

    # Evaluate the model
    loss, mae = model.evaluate(X_test, y_test, verbose=1)
    print(f'Test Loss: {loss}, Test MAE: {mae}')

    # Make predictions on the entire dataset (training and test customers alike)
    y_pred = model.predict(X)
    predictions_df = _predictions_to_frame(X, y_pred, customer_ids_seq, channel_names)

    # Save predictions to a CSV file
    predictions_df.to_csv(f'predictions_with_channels_{country}.csv', index=False)

    # Display the results with channels and customer_id
    print(predictions_df.head(10))

    # Group by channel and sum the predicted conversion credits
    channel_credit = predictions_df.groupby('channel')['predicted_conversion_credit'].sum().reset_index()

    # Normalize the credits (optional)
    channel_credit['normalized_credit'] = (
        channel_credit['predicted_conversion_credit'] / channel_credit['predicted_conversion_credit'].sum()
    )

    channel_credit.to_csv(f'channel_credit_{country}.csv', index=False)

    print(channel_credit)

    # Additional checks, all scoped to this country's data.
    original_conversions = country_df['conversion'].sum()
    # A test-set interaction is any non-padded step of a test customer.
    test_interactions = int(np.count_nonzero(X_test.sum(axis=-1)))

    print(f'Original conversions: {original_conversions}')
    print(f'Number of unique customers in test set: {len(set(customer_ids_test))}')
    print(f'Total number of interactions in test set: {test_interactions}')
    print(f'Total number of predictions: {len(predictions_df)}')

# Markets to model; each one gets its own independent training run and CSV outputs.
country_list = ['Canada', 'France', 'Germany', 'UK', 'USA']

# Run the attribution pipeline market by market, in the order listed above.
for country in country_list:
    dda(country)
