<a href="https://colab.research.google.com/github/umairaalvi4843-hub/AdIntel/blob/main/AdIntel_Full_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================================
# 1. SETUP AND IMPORTS
# ============================================
print("--- [1/8] Installing libraries ---")
# Install the correct libraries for this environment
!pip install lightgbm joblib

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
import joblib
import gc
from google.colab import drive

# ============================================
# 2. MOUNT GOOGLE DRIVE
# ============================================
print("\n--- [2/8] Mounting Google Drive ---")
# Connect this Colab runtime to Google Drive at /content/drive, which is where
# the later steps read the training CSV from and write the model to.
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    # Best-effort: keep going in case the mount point is still usable, but
    # report the real error instead of assuming the drive is already mounted.
    # Catching Exception (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate so the cell can still be interrupted.
    print(f"Drive mount raised an error (continuing): {mount_error!r}")

# ============================================
# 3. LOAD DATA
# ============================================
print("\n--- [3/8] Loading data from Drive ---")
# Location of the training CSV inside the mounted Drive.
# Adjust this if your Drive folder has a different name.
file_path = '/content/drive/MyDrive/AdIntel_Data/train.csv'

# Only these columns are read from the CSV: the label ('click'), the
# timestamp ('hour') and the fields that the later feature steps consume.
cols_to_use = [
    # label and timestamp
    'click', 'hour',
    # placement / context
    'banner_pos', 'site_category', 'app_category',
    # device
    'device_type', 'device_conn_type', 'device_id',
    # anonymised fields
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

# Read at most the first 3 million data rows of the file.
df = pd.read_csv(file_path, nrows=3_000_000, usecols=cols_to_use)
print("Data loaded. Shape:", df.shape)

# ============================================
# 4. FEATURE ENGINEERING (BASIC)
# ============================================
print("\n--- [4/8] Creating basic features ---")
# Sort by time so the running per-device count below only looks backwards.
# A stable sort (mergesort) is requested explicitly: the default algorithm is
# not stable, so rows sharing the same 'hour' value could otherwise come out in
# an order unrelated to the file order, changing 'user_ad_count' for tied rows.
df = df.sort_values('hour', kind='mergesort')

# hour_of_day: the last two characters of 'hour' rendered as text, stored as a
# pandas categorical (presumably 'hour' is encoded as YYMMDDHH - confirm).
df['hour_of_day'] = df['hour'].astype(str).str[-2:].astype('category')

# user_ad_count: for each row, how many earlier rows (in the sorted order)
# share its device_id. cumcount() starts at 0, so the current row is excluded.
df['user_ad_count'] = df.groupby('device_id').cumcount()

# ============================================
# 5. SPLIT DATA (To prevent data leakage)
# ============================================
print("\n--- [5/8] Splitting data into train and test sets ---")
# Hold out 20% of the rows for testing, keeping the click / no-click ratio the
# same in both parts (stratify) and fixing the seed so the split is repeatable.
# NOTE(review): train_test_split shuffles by default, so test rows are
# interleaved in time with training rows even though 'user_ad_count' was built
# on time-sorted data - confirm a random (not chronological) hold-out is intended.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['click'],
    random_state=42,
)

# The full frame is no longer needed once both parts exist; release it.
del df
gc.collect()
print("Data split complete. Training shape:", train_df.shape, "Testing shape:", test_df.shape)

# ============================================
# 6. FEATURE ENGINEERING (ADVANCED - TARGET ENCODING)
# ============================================
print("\n--- [6/8] Performing Target Encoding ---")
target = 'click'
# Mean of the target over the training split.
global_mean = train_df[target].mean()

# Columns that get replaced by a per-value mean of the target.
high_card_features = [
    'banner_pos', 'site_category', 'app_category', 'device_id',
    'device_type', 'device_conn_type',
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

# One lookup Series per column: category value -> mean target, computed on the
# training split only so the test split never contributes to the encoding.
# NOTE(review): the means are unsmoothed and include each training row's own
# label, so a value seen once in train maps exactly to that row's target -
# consider smoothing or out-of-fold encoding for columns such as 'device_id'.
encoding_maps = {
    col: train_df.groupby(col)[target].mean()
    for col in high_card_features
}

# Apply the learned encodings to both training and testing sets
def apply_target_encoding(df, encoding_maps, global_mean):
    """Return a copy of ``df`` with one ``<col>_encoded`` column per mapping.

    Args:
        df: Input DataFrame; it is copied, not modified.
        encoding_maps: Dict of column name -> lookup passed to ``Series.map``
            (value in that column -> encoded number).
        global_mean: Value used wherever the lookup yields a missing result,
            e.g. for values absent from the mapping.

    Returns:
        A new DataFrame holding the original columns plus the encoded ones,
        added in the iteration order of ``encoding_maps``.
    """
    result = df.copy()
    for source_col, value_to_rate in encoding_maps.items():
        looked_up = result[source_col].map(value_to_rate)
        # Anything the lookup could not resolve falls back to the global mean.
        result[f"{source_col}_encoded"] = looked_up.fillna(global_mean)
    return result

# Encode both splits with the lookups learned from the training split.
train_df_encoded = apply_target_encoding(train_df, encoding_maps, global_mean)
test_df_encoded = apply_target_encoding(test_df, encoding_maps, global_mean)

# --- Define Final Training Sets ---
# The model never sees the label, the raw timestamp, or the raw columns that
# now have an '<col>_encoded' replacement.
features_to_drop = [target, 'hour', *high_card_features]
features = [
    name
    for name in train_df_encoded.columns
    if name not in features_to_drop
]

X_train, y_train = train_df_encoded[features], train_df_encoded[target]
X_test, y_test = test_df_encoded[features], test_df_encoded[target]

print("Feature Engineering complete!")

# ============================================
# 7. TRAIN HIGH-PERFORMANCE MODEL
# ============================================
print("\n--- [7/8] Training high-performance model ---")
model = lgb.LGBMClassifier(
    objective='binary',
    # LightGBM's name for binary log loss is 'binary_logloss'; the previous
    # value 'logloss' is not one of LightGBM's documented metric names.
    metric='binary_logloss',
    n_estimators=1500,
    learning_rate=0.03,
    num_leaves=31,
    random_state=42,  # fixed seed so repeated runs build the same model
    n_jobs=-1
)

# 'hour_of_day' is the only feature left with a categorical dtype; every other
# feature is numeric (the running count and the '<col>_encoded' columns).
model.fit(X_train, y_train, categorical_feature=['hour_of_day'])

print("Model training complete!")

# --- Evaluate the Model ---
print("\nEvaluating model performance...")
# predict_proba returns one column per class; column 1 is taken as the
# predicted probability of a click.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Score the held-out predictions: log loss and ROC AUC.
loss, auc = (
    log_loss(y_test, y_pred_proba),
    roc_auc_score(y_test, y_pred_proba),
)

print("--- Final Model Performance ---")
print(f"Final LogLoss: {loss:.4f}")
print(f"Final AUC Score: {auc:.4f}")

# ============================================
# 8. SAVE THE FINAL MODEL
# ============================================
print("\n--- [8/8] Saving model to file in Google Drive ---")
# Folder on the mounted Drive that receives the serialized model.
drive_path = '/content/drive/MyDrive/AdIntel_Data/'
# Serialize the fitted classifier with joblib so a later cell can reload it.
joblib.dump(model, f"{drive_path}adintel_model.pkl")
print("Model saved successfully as 'adintel_model.pkl' in your Google Drive.")

--- [1/8] Installing libraries ---

--- [2/8] Mounting Google Drive ---
Mounted at /content/drive

--- [3/8] Loading data from Drive ---
Data loaded. Shape: (3000000, 17)

--- [4/8] Creating basic features ---

--- [5/8] Splitting data into train and test sets ---
Data split complete. Training shape: (2400000, 19) Testing shape: (600000, 19)

--- [6/8] Performing Target Encoding ---
Feature Engineering complete!

--- [7/8] Training high-performance model ---
[LightGBM] [Info] Number of positive: 410886, number of negative: 1989114
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.353172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 2400000, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.171203 -> initscore=-1.577129
[LightGBM] [Info] Start trai

In [2]:
import joblib
import os
import lightgbm as lgb

# Define the path to your model file saved in Google Drive
model_path = '/content/drive/MyDrive/AdIntel_Data/adintel_model.pkl'

# --- 1. Basic Check: File Information ---
# Stop here when the file is missing. Previously the error message was printed
# and execution fell through to joblib.load, which then failed on its own.
if not os.path.exists(model_path):
    print("❌ ERROR: Model file not found. Check your Google Drive path.")
    raise FileNotFoundError(model_path)
print("✅ FILE CHECK: Model file found.")

# --- 2. Load and Check Model Parameters ---
# Load the model into a variable.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code - only load model files that you created yourself.
loaded_model = joblib.load(model_path)

# Print a few facts about the loaded estimator.
print("\n--- MODEL DIAGNOSTICS ---")
print(f"Model Type: {type(loaded_model)}")
# Report the number of trees actually stored in the fitted booster instead of
# the n_estimators hyperparameter, which is only the requested upper bound.
print(f"n_estimators (Trees Built): {loaded_model.booster_.num_trees()}")
print(f"Learning Rate: {loaded_model.learning_rate}")

✅ FILE CHECK: Model file found.

--- MODEL DIAGNOSTICS ---
Model Type: <class 'lightgbm.sklearn.LGBMClassifier'>
n_estimators (Trees Built): 1500
Learning Rate: 0.03
