In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r'D:\AB Testing\data\test_data_A.csv\test_data_A.csv')

# Take the first half of the dataset (500,000 rows if total rows = 1,000,000)
df_half = df.head(500000)

# Save the sampled half (optional)
df_half.to_csv('half_of_data.csv', index=False)


In [7]:
# Read the dataset using the correct delimiter
df = pd.read_csv(r'D:\AB Testing\data\half_of_data.csv', delimiter='|')

# Check the column names again
print("Columns in dataset:", df.columns)


Columns in dataset: Index(['id', 'uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
       'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
       'app_first_class', 'app_second_class', 'age', 'city', 'city_rank',
       'device_name', 'device_size', 'career', 'gender', 'net_type',
       'residence', 'his_app_size', 'his_on_shelf_time', 'app_score',
       'emui_dev', 'list_time', 'device_price', 'up_life_duration',
       'up_membership_grade', 'membership_life_duration', 'consume_purchase',
       'communication_onlinerate', 'communication_avgonline_30d', 'indu_name',
       'pt_d'],
      dtype='object')


In [11]:
# Step 1: Identify columns that should be numeric
numerical_columns = ['age', 'city_rank', 'app_score', 'device_price', 
                     'up_life_duration', 'up_membership_grade', 
                     'membership_life_duration', 'communication_onlinerate', 
                     'communication_avgonline_30d']

# Step 2: Check for any non-numeric values in the numerical columns
for col in numerical_columns:
    # Check if any value is not a number
    non_numeric_rows = df[~df[col].apply(pd.to_numeric, errors='coerce').notna()]
    
    if not non_numeric_rows.empty:
        print(f"Non-numeric values found in {col}:")
        print(non_numeric_rows[col].head())


# Step 1: Replace the '^' separated values with the first number in the sequence
df['communication_onlinerate'] = df['communication_onlinerate'].apply(lambda x: x.split('^')[0] if isinstance(x, str) else x)

# Step 2: Convert the column to numeric
df['communication_onlinerate'] = pd.to_numeric(df['communication_onlinerate'], errors='coerce')

# Step 3: Handle missing values (NaN) in the column
df['communication_onlinerate'].fillna(df['communication_onlinerate'].median(), inplace=True)

# Step 4: Proceed with the rest of the preprocessing (scaling, etc.)
scaler = StandardScaler()
df[['communication_onlinerate']] = scaler.fit_transform(df[['communication_onlinerate']])

print("Data cleaned successfully!")


# Step 3: Fix the issue
# For example, if 'city_rank' has malformed entries like '3^4^5^6...', we can remove those rows or replace them with NaN
df['city_rank'] = pd.to_numeric(df['city_rank'], errors='coerce')

# Optionally, fill NaN values with the median or drop rows with NaN values
df['city_rank'].fillna(df['city_rank'].median(), inplace=True)

# Step 4: Proceed with the rest of the preprocessing
# Now continue with scaling or other preprocessing steps
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

print("Data cleaned successfully!")


Non-numeric values found in communication_onlinerate:
0    3^4^5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20...
1      7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23
2    5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^...
3    6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23
4      7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23
Name: communication_onlinerate, dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['communication_onlinerate'].fillna(df['communication_onlinerate'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['city_rank'].fillna(df['city_rank'].median(), inplace=True)


Data cleaned successfully!
Data cleaned successfully!


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# ===========================
# 1. Categorical Encoding
# ===========================

# List of categorical columns
cat_cols = ['creat_type_cd', 'adv_prim_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 
            'app_first_class', 'app_second_class', 'city', 'city_rank', 'device_name', 
            'career', 'gender', 'net_type', 'residence', 'emui_dev', 'indu_name', 'pt_d']

# One-Hot Encoding for low-cardinality columns
# One-Hot Encoding for low-cardinality columns
ohe_cols = ['gender', 'net_type', 'city_rank']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_df = pd.DataFrame(ohe.fit_transform(df[ohe_cols]), columns=ohe.get_feature_names_out(ohe_cols))
df = pd.concat([df.drop(columns=ohe_cols), ohe_df], axis=1)


# Label Encoding for high-cardinality columns
le_cols = list(set(cat_cols) - set(ohe_cols))
for col in le_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# ===========================
# 2. Feature Scaling
# ===========================

# List of numeric columns
num_cols = ['age', 'device_size', 'his_app_size', 'his_on_shelf_time', 'app_score', 
            'list_time', 'device_price', 'up_life_duration', 'up_membership_grade', 
            'membership_life_duration', 'communication_onlinerate', 'communication_avgonline_30d']

# Standard Scaling
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# ===========================
# 3. Target Variable
# ===========================

# Define target and features
X = df.drop(columns=['consume_purchase'])
y = df['consume_purchase']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ===========================
# 4. Model Training
# ===========================

# Random Forest Classifier (baseline model)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ===========================
# 5. Model Evaluation
# ===========================

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.8877
              precision    recall  f1-score   support

           2       0.89      1.00      0.94     87866
           3       0.88      0.04      0.07       917
           4       1.00      0.05      0.10       111
           5       0.99      0.05      0.09      5667
           6       0.90      0.05      0.10       169
           7       0.80      0.06      0.11      1014
           8       0.89      0.05      0.09       664
           9       0.92      0.05      0.09       240
          10       0.64      0.15      0.24      3352

    accuracy                           0.89    100000
   macro avg       0.88      0.17      0.20    100000
weighted avg       0.89      0.89      0.84    100000



In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Define features and target
X = df.drop(columns=['consume_purchase'])
y = df['consume_purchase']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check resampled class distribution
print("Resampled class distribution:")
print(y_train_resampled.value_counts())


Resampled class distribution:
consume_purchase
2     351356
5     351356
3     351356
10    351356
7     351356
6     351356
4     351356
9     351356
8     351356
Name: count, dtype: int64


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define and train the model
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.88744
              precision    recall  f1-score   support

           2       0.89      1.00      0.94     87839
           3       0.96      0.05      0.10       926
           4       1.00      0.05      0.10        95
           5       0.96      0.05      0.10      5629
           6       0.92      0.07      0.13       175
           7       0.88      0.06      0.12      1065
           8       0.87      0.08      0.15       650
           9       1.00      0.07      0.14       241
          10       0.67      0.14      0.23      3380

    accuracy                           0.89    100000
   macro avg       0.91      0.18      0.22    100000
weighted avg       0.89      0.89      0.84    100000



In [37]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load the dataset
# Ensure that the delimiter is correctly set to match your data
file_path = r"D:\AB Testing\data\half_of_data.csv"
df = pd.read_csv(file_path, delimiter='|')

# Print shape and column names
print("Shape of dataset:", df.shape)
print("Columns in dataset:", df.columns)

# Handle concatenated or string columns if needed
for col in df.columns:
    # Check if the column contains non-numeric data
    if df[col].dtype == 'object':
        # If the column has concatenated values, split and take the first one
        df[col] = df[col].astype(str).str.split('^').str[0]
        # Attempt to convert to numeric if possible
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop any columns that still have NaN or non-numeric data after conversion
df = df.dropna()

# Check for final non-numeric columns
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) > 0:
    print(f"Non-numeric columns found: {non_numeric_cols}")
    # Label encode remaining categorical columns
    le = LabelEncoder()
    for col in non_numeric_cols:
        df[col] = le.fit_transform(df[col].astype(str))

# Define features and target
X = df.drop(columns=['consume_purchase'])
y = df['consume_purchase']

# Check target distribution
print("Original class distribution:\n", y.value_counts())

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check resampled class distribution
print("Resampled class distribution:\n", y_train_resampled.value_counts())

# Initialize and train RandomForest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("📊 Classification Report:\n", classification_report(y_test, y_pred))


Shape of dataset: (500000, 36)
Columns in dataset: Index(['id', 'uid', 'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id',
       'dev_id', 'inter_type_cd', 'slot_id', 'spread_app_id', 'tags',
       'app_first_class', 'app_second_class', 'age', 'city', 'city_rank',
       'device_name', 'device_size', 'career', 'gender', 'net_type',
       'residence', 'his_app_size', 'his_on_shelf_time', 'app_score',
       'emui_dev', 'list_time', 'device_price', 'up_life_duration',
       'up_membership_grade', 'membership_life_duration', 'consume_purchase',
       'communication_onlinerate', 'communication_avgonline_30d', 'indu_name',
       'pt_d'],
      dtype='object')
Original class distribution:
 consume_purchase
2     439195
5      28145
10     16900
7       5323
3       4629
8       3251
9       1205
6        876
4        476
Name: count, dtype: int64
Resampled class distribution:
 consume_purchase
2     351356
5     351356
3     351356
10    351356
7     351356
6     351356
4     351356
9

In [42]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# ✅ Load the dataset with correct delimiter
file_path = r"D:\AB Testing\data\half_of_data.csv"  # Update your file path
df = pd.read_csv(file_path, sep='|')  # Correct delimiter for your file

# ✅ Clean and check column names
df.columns = df.columns.str.strip()  # Remove any leading/trailing spaces

# ✅ Drop unnecessary columns if they exist
drop_cols = ['pt_d', 'id']  # Drop 'pt_d' and 'id' if they exist
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

# ✅ Identify non-numeric columns to fix
non_numeric_cols = df.select_dtypes(include=['object']).columns
print("Non-numeric columns to fix:\n", non_numeric_cols)

# ✅ Convert problematic string columns to numeric (if possible)
for col in non_numeric_cols:
    try:
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to numeric if possible
    except:
        pass

# ✅ Handle remaining non-numeric columns using LabelEncoder
le = LabelEncoder()
for col in non_numeric_cols:
    if df[col].dtype == 'object':  # Apply LabelEncoder only on remaining string columns
        df[col] = le.fit_transform(df[col].astype(str))

# ✅ Drop any rows with NaN after conversion
df = df.dropna()

# ✅ Define features (X) and target (y)
X = df.drop(columns=['consume_purchase'], errors='ignore')
y = df['consume_purchase']

# ✅ Encode target labels using LabelEncoder
le_y = LabelEncoder()
y = le_y.fit_transform(y)  # Map labels to [0, 1, 2, ...]

# ✅ Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# ✅ Fix SMOTE by dynamically adjusting k_neighbors
min_class_count = np.min(np.bincount(y_train))  # Smallest class size
k_neighbors = min(5, min_class_count - 1) if min_class_count > 1 else 1

# ✅ Apply SMOTE with dynamically adjusted k_neighbors
smote = SMOTE(k_neighbors=k_neighbors, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# ✅ Check resampled class distribution
print("Resampled class distribution:\n", pd.Series(y_train_resampled).value_counts())

# ✅ Initialize XGBoost classifier with optimal parameters
model = XGBClassifier(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="mlogloss"
)

# ✅ Fit the model on resampled data
model.fit(X_train_resampled, y_train_resampled)

# ✅ Make predictions
y_pred = model.predict(X_test)

# ✅ Decode predictions back to original classes
y_pred_original = le_y.inverse_transform(y_pred)
y_test_original = le_y.inverse_transform(y_test)

# ✅ Evaluate model performance
print("\nAccuracy:", accuracy_score(y_test_original, y_pred_original))
print("\nClassification Report:\n", classification_report(y_test_original, y_pred_original))


Non-numeric columns to fix:
 Index(['communication_onlinerate'], dtype='object')
Resampled class distribution:
 0    2753
2    2753
7    2753
3    2753
6    2753
4    2753
1    2753
5    2753
Name: count, dtype: int64

Accuracy: 0.9404600811907984

Classification Report:
               precision    recall  f1-score   support

           2       0.95      0.99      0.97       688
           3       1.00      0.33      0.50         3
           5       0.67      0.17      0.28        23
           6       0.00      0.00      0.00         1
           7       1.00      0.60      0.75         5
           8       0.50      0.20      0.29         5
           9       0.00      0.00      0.00         1
          10       0.44      0.31      0.36        13

    accuracy                           0.94       739
   macro avg       0.57      0.33      0.39       739
weighted avg       0.93      0.94      0.93       739



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
# ✅ Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import ADASYN, SMOTE
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import joblib  # To save the model for real-time use

# ✅ Load the dataset with correct delimiter
file_path = r"D:\AB Testing\data\half_of_data.csv"  # Update your file path
df = pd.read_csv(file_path, sep='|')  # Correct delimiter as mentioned before

# ✅ Clean and check column names
df.columns = df.columns.str.strip()  # Strip leading/trailing spaces from columns

# ✅ Drop unnecessary columns if they exist
drop_cols = ['pt_d', 'id']  # Drop columns if present
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

# ✅ Handle non-numeric columns dynamically
non_numeric_cols = df.select_dtypes(include=['object']).columns
print("Non-numeric columns to fix:\n", non_numeric_cols)

# ✅ Convert problematic string columns to numeric if possible
for col in non_numeric_cols:
    try:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    except:
        pass

# ✅ Label encode remaining non-numeric columns
le = LabelEncoder()
for col in non_numeric_cols:
    if df[col].dtype == 'object':  # Apply LabelEncoder only to remaining object columns
        df[col] = le.fit_transform(df[col].astype(str))

# ✅ Drop rows with NaN after conversion
df = df.dropna()

# ✅ Define features (X) and target (y)
X = df.drop(columns=['consume_purchase'], errors='ignore')
y = df['consume_purchase']

# ✅ Encode target labels using LabelEncoder
le_y = LabelEncoder()
y = le_y.fit_transform(y)

# ✅ Handle cases with only one class (avoid SMOTE error)
if len(np.unique(y)) == 1:
    print("⚠️ Only one class present. Skipping resampling...")
    X_resampled, y_resampled = X, y
else:
    # ✅ Apply ADASYN/SMOTE dynamically based on class balance
    min_class_count = np.min(np.bincount(y))
    k_neighbors = min(5, min_class_count - 1) if min_class_count > 1 else 1

    # Use ADASYN if classes are highly imbalanced
    if min_class_count <= 10:
        print("⚡️ Applying ADASYN due to high imbalance...")
        resampler = ADASYN(n_neighbors=k_neighbors, random_state=42)
    else:
        print("🔄 Applying SMOTE...")
        resampler = SMOTE(k_neighbors=k_neighbors, random_state=42)

    # ✅ Resampling to balance classes
    X_resampled, y_resampled = resampler.fit_resample(X, y)

# ✅ Split resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, stratify=y_resampled, test_size=0.2, random_state=42)

# ✅ Define XGBoost classifier with base params
xgb_model = XGBClassifier(
    objective='multi:softmax',  # Multi-class classification
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42
)

# ✅ Define hyperparameter grid for optimization
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# ✅ Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# ✅ Fit GridSearchCV to find the best model
grid_search.fit(X_train, y_train)

# ✅ Get the best estimator
best_model = grid_search.best_estimator_
print(f"🎯 Best Parameters Found: {grid_search.best_params_}")

# ✅ Feature importance analysis for optimization
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\n⚡️ Top 10 Important Features:\n", feature_importances.head(10))

# ✅ Save the best model for real-time use
model_path = r"D:\AB Testing\models\best_xgb_model.pkl"
joblib.dump(best_model, model_path)
print(f"✅ Model saved at: {model_path}")

# ✅ Make predictions
y_pred = best_model.predict(X_test)

# ✅ Decode predictions back to original classes
y_pred_original = le_y.inverse_transform(y_pred)
y_test_original = le_y.inverse_transform(y_test)

# ✅ Evaluate model performance
print("\n🎯 Accuracy:", accuracy_score(y_test_original, y_pred_original))
print("\n📊 Classification Report:\n", classification_report(y_test_original, y_pred_original))


Non-numeric columns to fix:
 Index(['communication_onlinerate'], dtype='object')
⚡️ Applying ADASYN due to high imbalance...
Fitting 5 folds for each of 243 candidates, totalling 1215 fits


Parameters: { "use_label_encoder" } are not used.



🎯 Best Parameters Found: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 500, 'subsample': 0.8}

⚡️ Top 10 Important Features:
                 Feature  Importance
23            app_score    0.098253
28  up_membership_grade    0.096767
27     up_life_duration    0.069389
10      app_first_class    0.065743
26         device_price    0.057308
17               career    0.049832
16          device_size    0.040908
12                  age    0.037515
6         inter_type_cd    0.033784
20            residence    0.032499
✅ Model saved at: D:\AB Testing\models\best_xgb_model.pkl

🎯 Accuracy: 0.9981857764876633

📊 Classification Report:
               precision    recall  f1-score   support

           2       0.99      0.99      0.99       688
           3       1.00      1.00      1.00       689
           5       1.00      0.99      1.00       694
           6       1.00      1.00      1.00       688
           7       1.00      1.00      1.00       689
 