In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# ---------------------------------------------------------------------------
# Preprocessing + feature-matrix construction.
# Produces `X` (dense 2-D array: one-hot categorical block followed by the
# numerical/engineered block) and `y` (target Series) for the modelling cells.
#
# NOTE(review): the top-category selection, the median imputation and the
# one-hot encoder are all fitted on the full dataset *before* the train/test
# split performed in a later cell, so test-set statistics leak into the
# features. The 5%/95% target trimming is likewise applied to rows that later
# end up in the test set. Confirm this is acceptable for the reported metrics.
# ---------------------------------------------------------------------------

# Tunable thresholds used below
MAX_PRODUCT_NAME_REPEATS = 10      # product names seen more often than this are dropped
TARGET_LOWER_QUANTILE = 0.05       # rows below this target quantile are dropped
TARGET_UPPER_QUANTILE = 0.95       # rows above this target quantile are dropped
TOP_N_CATEGORIES = 10              # categories kept per categorical column; the rest become 'Other'

# Load dataset
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
file_path = r"C:\Users\Dell\Downloads\data.csv\data.csv\updated_file.csv"
df = pd.read_csv(file_path)

# Drop rows whose 'product_name' repeats more than MAX_PRODUCT_NAME_REPEATS times
search_term_counts = df['product_name'].value_counts()
repeating_terms = search_term_counts[search_term_counts > MAX_PRODUCT_NAME_REPEATS].index
df = df[~df['product_name'].isin(repeating_terms)]

# Feature lists are declared once and reused everywhere below so they cannot drift apart
numerical_features = [
    'product_atcs_30_days',
    'product_atcs_plt_30_days',
    'total_unique_orders_plt_30_days',
    'product_ctr_city_30_days',
    'query_product_similarity',
]
categorical_features = ['product_variant_id', 'brand_name', 'subcategory_name', 'category_name', 'city_id']
initial_features = numerical_features + categorical_features

target_column = 'total_unique_orders'

# Keep only the selected features and the target. `.copy()` gives an independent
# frame so the column assignments below never write into a slice of the raw data.
df = df[initial_features + [target_column]].copy()

# A row without a target cannot be used for supervised training: drop it rather
# than inventing a label from the median.
df = df.dropna(subset=[target_column])

# Median-impute the numerical *features* only. The target and the categorical
# identifier columns (which may be stored as numbers) are deliberately excluded.
df[numerical_features] = df[numerical_features].fillna(
    df[numerical_features].median(numeric_only=True)
)

# Remove top and bottom 5% outliers in the target column (bounds are inclusive)
lower_bound = df[target_column].quantile(TARGET_LOWER_QUANTILE)
upper_bound = df[target_column].quantile(TARGET_UPPER_QUANTILE)
df = df[df[target_column].between(lower_bound, upper_bound)].copy()

# Check dataset
print(f"Dataset shape after preprocessing: {df.shape}")
print(df.head())

# Limit encoding for high-cardinality categories: keep the TOP_N_CATEGORIES most
# frequent values per column and collapse everything else (including missing
# values, which value_counts() ignores) into 'Other'.
for col in categorical_features:
    top_categories = df[col].value_counts().nlargest(TOP_N_CATEGORIES).index
    keep_mask = df[col].isin(top_categories)
    # Cast to str first so every category in the column has one uniform type
    # even when the raw column is numeric.
    df[col] = df[col].astype(str).where(keep_mask, 'Other')

# One-hot encode the categorical columns, dropping the first level of each
encoder = OneHotEncoder(drop='first', sparse_output=True)
X_categorical_sparse = encoder.fit_transform(df[categorical_features])

# Numerical features and feature engineering
X_numerical = df[numerical_features].copy()
X_numerical['atcs_interaction'] = (
    X_numerical['product_atcs_30_days'] * X_numerical['product_atcs_plt_30_days']
)
# log(1 + x), computed with log1p for accuracy near zero
X_numerical['log_total_unique_orders'] = np.log1p(X_numerical['total_unique_orders_plt_30_days'])
X_numerical['ctr_similarity_interaction'] = (
    X_numerical['product_ctr_city_30_days'] * X_numerical['query_product_similarity']
)
X_numerical['query_similarity_squared'] = X_numerical['query_product_similarity'] ** 2

# Combine categorical and numerical features into one dense matrix.
# The one-hot block is small (each column has at most TOP_N_CATEGORIES + 1
# levels, minus the dropped first level), so densifying it here is cheap.
X = np.hstack([X_categorical_sparse.toarray(), X_numerical.to_numpy()])
y = df[target_column]  # Use the target column defined earlier

# Sanity check: features and target must stay row-aligned
assert X.shape[0] == len(y), "feature matrix and target have different row counts"


Dataset shape after preprocessing: (65258, 11)
    product_atcs_30_days  product_atcs_plt_30_days  \
3                      0                         0   
11                     0                         0   
18                   899                      2647   
24                   523                       523   
35                    21                        27   

    total_unique_orders_plt_30_days  product_ctr_city_30_days  \
3                                 0                  0.000000   
11                                0                  0.098987   
18                             1222                  0.029708   
24                              246                  0.039907   
35                               19                  0.019928   

    query_product_similarity                    product_variant_id brand_name  \
3                   0.213126  13281002-f910-4ab9-8574-80ae35dc6fb9    Boldfit   
11                  0.148046  3aa8671a-efc4-4900-97cc-442e761a91f6    Nandi

In [3]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [6]:
import xgboost as xgb

# Wrap the train / test partitions in XGBoost DMatrix containers
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster hyper-parameters shared by cross-validation and the final fit
params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.35,
    "max_depth": 5,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "lambda": 40,
    "alpha": 0.6,
    "gamma": 5.0,
    "min_child_weight": 5,
}

# 5-fold cross-validation scored on MAE, with early stopping and a fixed seed
cv_settings = {
    "num_boost_round": 4500,
    "nfold": 5,
    "metrics": ["mae"],
    "early_stopping_rounds": 950,
    "seed": 42,
}
cv_results = xgb.cv(params=params, dtrain=dtrain, **cv_settings)

# The number of rows in the CV history is used as the round count for the final model
best_num_boost_round = cv_results.shape[0]
print(f"Best number of boosting rounds: {best_num_boost_round}")

# Refit on the whole training partition with that round count
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_num_boost_round,
)

# Score the held-out test partition
y_pred_xgb = xgb_model.predict(dtest)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"Official XGBoost R^2: {r2_xgb}")


Best number of boosting rounds: 4500
Official XGBoost R^2: 0.9945490956306458


In [7]:
from sklearn.metrics import mean_absolute_error, r2_score

# Evaluate the test-set predictions with two metrics: MAE and R^2
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Report R^2 first, then MAE
print(f"Official XGBoost R^2: {r2_xgb}")
print(f"Official XGBoost MAE: {mae_xgb}")

Official XGBoost R^2: 0.9945490956306458
Official XGBoost MAE: 2.854820966720581
