In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

In [7]:
df = pd.read_csv('/content/drive/MyDrive/digital_turbine/train_data.csv')

In [9]:
sent_price = df['sentPrice']
win_price = df['winBid']

In [33]:
# Check how well sent price performs in terms of MSE and RMSE
mse = mean_squared_error(sent_price, win_price)
print(f"Mean Squared Error: {mse}")

rmse = math.sqrt(mean_squared_error(sent_price, win_price))
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error: 379.62816821370325
Root Mean Squared Error (RMSE): 19.484049071322502


In [27]:
df.head()

Unnamed: 0,eventTimestamp,unitDisplayType,brandName,bundleId,appVersion,correctModelName,countryCode,deviceId,osAndVersion,connectionType,c1,c2,c3,c4,size,mediationProviderVersion,bidFloorPrice,sentPrice,winBid,has_won
0,1656411567773,banner,LG,com.tilegarden.match3,1.8.22,LM-V405,US,74f9b473fad,Android-10.0,3G,cb2,6.0,6b,4.0,320x50,11.2.1,0.01,0.02,0.88,0
1,1656925395488,banner,Generic,com.loop.match3d,1245.34.0,Android 4.0,FR,6ad4c88b84e,Android-4.0,WIFI,7d3,6.0,6b,6.0,320x50,11.4.2,0.01,0.03,0.08,0
2,1656913751642,banner,Generic,com.loop.match3d,1245.35.0,Android 4.0,US,743b9849642,Android-4.0,WIFI,7d3,3.0,6b,3.0,320x50,11.4.3,0.01,0.02,1.72,0
3,1656656319103,banner,Generic,com.loop.match3d,1245.34.0,Android 4.0,GB,6ad933115b2,Android-4.0,WIFI,7d3,3.0,6b,5.0,320x50,11.4.2,0.01,0.06,0.21,0
4,1657429389462,interstitial,OPPO,com.loop.match3d,1245.35.0,CPH2127,ID,809f9785bb3,Android-11.0,WIFI,8bd,3.0,6b,3.0,320x480,11.4.3,0.01,0.16,1.91,0


In [60]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import math

# Step 2: Load and preprocess the data
data = pd.read_csv('/content/drive/MyDrive/digital_turbine/train_data.csv')

# Drop the 'has_won' and 'sentPrice' columns (excluded features)
data = data.drop(['has_won'], axis=1)

# Assuming 'eventTimestamp' is in Unix timestamps
data['eventTimestamp'] = pd.to_datetime(data['eventTimestamp'], unit='ms')
data['hour_24hr'] = data['eventTimestamp'].dt.strftime('%H')
data['day_of_week'] = data['eventTimestamp'].dt.dayofweek

# Identify categorical and numerical columns
categorical_columns = ['unitDisplayType', 'brandName', 'bundleId', 'deviceId', 'appVersion', 'correctModelName',
                       'countryCode', 'osAndVersion', 'connectionType', 'c1', 'c2', 'c3', 'c4', 'size',
                       'mediationProviderVersion', 'hour_24hr', 'day_of_week']
numerical_columns = ['bidFloorPrice', 'sentPrice']

# Split the data into training and testing sets
X = data[categorical_columns + numerical_columns + ['winBid']]
y = data['winBid']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Target Encoding function
def target_encode(data, cat_column, means):
    return data[cat_column].map(means)

# Calculate and store means for target encoding
means_dict = {}
for cat_column in categorical_columns:
    means = X_train.groupby(cat_column)['winBid'].mean()
    means_dict[cat_column] = means

# Apply target encoding to the categorical columns on the training data
for cat_column in categorical_columns:
    X_train[cat_column] = target_encode(X_train, cat_column, means_dict[cat_column])

# Apply the target encoding to the categorical columns on the test data using the stored means
for cat_column in categorical_columns:
    X_test[cat_column] = X_test[cat_column].map(means_dict[cat_column]).fillna(y_train.mean())

# Drop 'winBid' from X_train and X_test
X_train = X_train.drop('winBid', axis=1)
X_test = X_test.drop('winBid', axis=1)

In [61]:
# Step 4: Create an XGBoost regressor model
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=50,
    learning_rate=0.05,
    max_depth=15,
    tree_method='hist',
    device='cuda',
    random_state=42
)

# Create a pipeline that includes data preprocessing and the XGBoost model
model = Pipeline(steps=[('model', xgb_model)])

# Step 5: Train the model on the training data
model.fit(X_train, y_train)

# Step 6: Evaluate the model on the testing data
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error (MSE) as a measure of performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

rmse = math.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE) on Test Data: {rmse}")

# Step 7: Evaluate the model on the training data
print(f"Train data error")
y_pred_train = model.predict(X_train)

# Calculate the Mean Squared Error (MSE) as a measure of performance
mse = mean_squared_error(y_train, y_pred_train)
print(f"Mean Squared Error: {mse}")

# Calculate the RMSE as a measure of performance
rmse = math.sqrt(mean_squared_error(y_train, y_pred_train))
print(f"Root Mean Squared Error (RMSE) on Train Data: {rmse}")

Mean Squared Error: 110.81543220355387
Root Mean Squared Error (RMSE) on Test Data: 10.526890908694451
Train data error
Mean Squared Error: 72.72599593828598
Root Mean Squared Error (RMSE) on Train Data: 8.527953795506045


In [62]:
merge = pd.DataFrame()
merge['winBid'] = y_test
merge['winBidPredict'] = y_pred

In [63]:
merge.head(10)

Unnamed: 0,winBid,winBidPredict
3978295,0.3,0.767775
6723912,43.81,45.131596
5192975,0.11,0.410329
3132041,0.3,0.493033
6746520,0.29,0.523708
5772356,0.06,0.532307
6366000,8.02,8.306395
5262576,1.0,1.640121
7037603,0.06,0.485406
5869219,1.0,0.669543


In [64]:
# Load test data
test_data = pd.read_csv('/content/drive/MyDrive/digital_turbine/test_data.csv')

In [65]:
test_data.head()

Unnamed: 0,eventTimestamp,unitDisplayType,brandName,bundleId,appVersion,correctModelName,countryCode,deviceId,osAndVersion,connectionType,c1,c2,c3,c4,size,mediationProviderVersion,bidFloorPrice,sentPrice
0,1657758857892,rewarded,Generic,com.loop.match3d,1245.35.0,Android 4 Tablet,US,6a0a94554cf,Android-4.0,3G,7b8,2.0,6b,2.0,768x1024,11.4.3,0.05,0.06
1,1657210707978,interstitial,Generic,com.loop.match3d,1245.35.0,Android 4.0,ZA,6a0b0e59f45,Android-4.0,WIFI,8bd,8.0,6b,6.0,320x480,11.4.3,0.01,0.16
2,1657392939412,interstitial,Motorola,com.YayySAL.DodgeAgent,13.0.02,Moto g(8) power lite,BR,6a0fa820c46,Android-10.0,WIFI,ea0,8.0,6b,9.0,320x480,11.0.0,0.05,0.06
3,1657386816882,interstitial,Huawei,com.tintash.nailsalon,1.3.6,JDN2-AL00HN,IQ,6a142bdbea2,Android-9.0,WIFI,3dc,6.0,6b,4.0,768x1024,11.3.3,0.01,0.05
4,1657211600823,banner,Generic,com.tilegarden.match3,1.8.52,Android 4.0,US,6a16943a771,Android-4.0,WIFI,cb2,7.0,6b,5.0,320x50,11.4.3,0.01,0.03


In [None]:
# Assuming 'eventTimestamp' is in Unix timestamps
test_data['eventTimestamp'] = pd.to_datetime(test_data['eventTimestamp'], unit='ms')  # Convert to datetime
test_data['hour_24hr'] = test_data['eventTimestamp'].dt.strftime('%H').astype(float)  # Extract the hour in 24-hour format
test_data['day_of_week'] = test_data['eventTimestamp'].dt.dayofweek.astype(float)

# Identify categorical and numerical columns
categorical_columns = ['unitDisplayType', 'brandName', 'bundleId', 'deviceId', 'appVersion', 'correctModelName',
                       'countryCode', 'osAndVersion', 'connectionType', 'c1', 'c2', 'c3', 'c4', 'size',
                       'mediationProviderVersion', 'hour_24hr', 'day_of_week']
numerical_columns = ['bidFloorPrice', 'sentPrice']

# Split the data into training and testing sets
df = test_data[categorical_columns + numerical_columns]

# Apply the target encoding to the categorical columns on the test data using the stored means
for cat_column in categorical_columns:
    df[cat_column] = df[cat_column].map(means_dict[cat_column]).fillna(y_train.mean())

In [70]:
winBid = model.predict(df)
test_data = pd.read_csv('/content/drive/MyDrive/digital_turbine/test_data.csv')
test_data['winBid'] = winBid

In [73]:
test_data.head(20)

Unnamed: 0,eventTimestamp,unitDisplayType,brandName,bundleId,appVersion,correctModelName,countryCode,deviceId,osAndVersion,connectionType,c1,c2,c3,c4,size,mediationProviderVersion,bidFloorPrice,sentPrice,winBid
0,1657758857892,rewarded,Generic,com.loop.match3d,1245.35.0,Android 4 Tablet,US,6a0a94554cf,Android-4.0,3G,7b8,2.0,6b,2.0,768x1024,11.4.3,0.05,0.06,23.174673
1,1657210707978,interstitial,Generic,com.loop.match3d,1245.35.0,Android 4.0,ZA,6a0b0e59f45,Android-4.0,WIFI,8bd,8.0,6b,6.0,320x480,11.4.3,0.01,0.16,3.493606
2,1657392939412,interstitial,Motorola,com.YayySAL.DodgeAgent,13.0.02,Moto g(8) power lite,BR,6a0fa820c46,Android-10.0,WIFI,ea0,8.0,6b,9.0,320x480,11.0.0,0.05,0.06,2.914399
3,1657386816882,interstitial,Huawei,com.tintash.nailsalon,1.3.6,JDN2-AL00HN,IQ,6a142bdbea2,Android-9.0,WIFI,3dc,6.0,6b,4.0,768x1024,11.3.3,0.01,0.05,1.390016
4,1657211600823,banner,Generic,com.tilegarden.match3,1.8.52,Android 4.0,US,6a16943a771,Android-4.0,WIFI,cb2,7.0,6b,5.0,320x50,11.4.3,0.01,0.03,1.119156
5,1657681515882,rewarded,Xiaomi,com.loop.match3d,1245.35.0,M2102J20SG,CL,6a16f51a055,Android-11.0,WIFI,7b8,2.0,6b,7.0,320x480,11.4.3,0.05,0.24,2.589774
6,1656831532490,banner,Motorola,com.loop.match3d,1245.34.0,Moto E6 Play,MX,6a17ec40fd7,Android-9.0,WIFI,7d3,6.0,6b,3.0,320x50,11.4.2,0.01,0.02,0.674645
7,1657682026847,banner,Xiaomi,com.kamilbilge.ropesavior3d,1.5.9,Redmi Note 8 Pro,ID,6a1ce05586c,Android-10.0,WIFI,c17,2.0,6b,9.0,320x50,11.3.3,0.05,0.07,0.440275
8,1656819719972,interstitial,Apple,1529614832,1.8.50,iPhone,VN,6a22a17837d,iOS-15.5,WIFI,313,1.0,4b,8.0,320x480,11.4.1,0.01,0.02,8.641707
9,1657001527931,banner,Generic,com.tintash.nailsalon,1.3.6,Android 4.0,SV,6a3302f26de,Android-4.0,WIFI,ad3,4.0,6b,7.0,320x50,11.3.3,0.01,0.04,0.811265


In [74]:
test_data.to_csv('/content/drive/MyDrive/digital_turbine/1/test_data.csv', index=False)