In [None]:
# Install necessary libraries
!pip install xgboost lightgbm



In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
pip install dask[dataframe]

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.15-py3-none-any.whl.metadata (2.5 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.14-py3-none-any.whl.metadata (2.5 kB)
  Downloading dask_expr-1.1.13-py3-none-any.whl.metadata (2.5 kB)
  Downloading dask_expr-1.1.12-py3-none-any.whl.metadata (2.5 kB)
  Downloading dask_expr-1.1.11-py3-none-any.whl.metadata (2.5 kB)
  Downloading dask_expr-1.1.10-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.10-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.2/242.2 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.10


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Read the raw train and test CSVs from /content
train_df, test_df = map(pd.read_csv, ('/content/train.csv', '/content/test.csv'))

In [None]:
# Keep the original (pre-encoding) test identifiers; the submission file is
# built from these rather than from the encoded columns
test_user_product_ids = test_df.loc[:, ['User_ID', 'Product_ID']]

In [None]:
# Give the test rows a NaN placeholder target so both frames share one schema,
# then stack train on top of test for uniform preprocessing
test_df = test_df.assign(Purchase=np.nan)
combined_df = pd.concat((train_df, test_df), ignore_index=True)

In [None]:
# Missing values in the 2nd and 3rd product-category columns become the sentinel -2
for category_col in ('Product_Category_2', 'Product_Category_3'):
    combined_df[category_col] = combined_df[category_col].fillna(-2)


In [None]:
# Label-encode the categorical columns; values are cast to str first and each
# column gets its own freshly fitted encoder
categorical_cols = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']
encoded_columns = {
    col: LabelEncoder().fit_transform(combined_df[col].astype(str))
    for col in categorical_cols
}
combined_df = combined_df.assign(**encoded_columns)

In [None]:
# Integer-code the two ID columns; the fitted encoders remain available as
# le_user / le_product
le_user, le_product = LabelEncoder(), LabelEncoder()
for id_col, encoder in (('User_ID', le_user), ('Product_ID', le_product)):
    combined_df[id_col] = encoder.fit_transform(combined_df[id_col])

In [None]:
# Frequency features: how many rows of the combined (train + test) frame carry
# each user / product ID
user_freq = combined_df['User_ID'].value_counts().to_dict()
product_freq = combined_df['Product_ID'].value_counts().to_dict()
for id_col, freq_col, counts in (
    ('User_ID', 'User_ID_Freq', user_freq),
    ('Product_ID', 'Product_ID_Freq', product_freq),
):
    combined_df[freq_col] = combined_df[id_col].map(counts)

In [None]:
# Undo the concat: rows with a known Purchase are training data, rows whose
# Purchase is NaN are the test set (which loses the placeholder column)
has_target = combined_df['Purchase'].notna()
train = combined_df[has_target]
test = combined_df[~has_target].drop(columns='Purchase')
y = train['Purchase']
X = train.drop(columns='Purchase')

In [None]:
# Target-mean features (mean Purchase per user / per product) and hold-out split.
#
# The hold-out split is made BEFORE the validation features are derived, so that
# validation targets never feed the features the model is validated on. Computing
# the means on all of `train` first leaks y_val into X_val and makes the
# validation RMSE optimistic.
# NOTE(review): each training row's own target still contributes to its group
# mean, and the CV folds of any later search share these features - consider
# out-of-fold encoding if that matters.


def add_purchase_means(frame, user_means, product_means, fallback):
    """Return a copy of ``frame`` with the two target-mean feature columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain 'User_ID' and 'Product_ID' columns.
    user_means, product_means : pd.Series
        Mean purchase indexed by user ID / product ID.
    fallback : float
        Value used for IDs that are absent from the lookup Series.

    Returns
    -------
    pd.DataFrame
        ``frame`` plus 'User_Purchase_Mean' and 'Product_Purchase_Mean'
        (appended at the end, or overwritten in place if already present).
    """
    return frame.assign(
        User_Purchase_Mean=frame['User_ID'].map(user_means).fillna(fallback),
        Product_Purchase_Mean=frame['Product_ID'].map(product_means).fillna(fallback),
    )


# Split training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fold-level aggregates: only the training split's targets are used, and both
# splits are (re)mapped from them. IDs unseen in the training split fall back to
# the training-split mean, mirroring how unseen test IDs are handled below.
fold_mean_purchase = y_train.mean()
fold_user_mean = y_train.groupby(X_train['User_ID']).mean()
fold_product_mean = y_train.groupby(X_train['Product_ID']).mean()
X_train = add_purchase_means(X_train, fold_user_mean, fold_product_mean, fold_mean_purchase)
X_val = add_purchase_means(X_val, fold_user_mean, fold_product_mean, fold_mean_purchase)

# Full-training-set aggregates: used for the final refit on X and for the test
# set, where IDs missing from the training data get the overall mean purchase
overall_mean_purchase = y.mean()
user_purchase_mean = train.groupby('User_ID')['Purchase'].mean()
product_purchase_mean = train.groupby('Product_ID')['Purchase'].mean()
X = add_purchase_means(X, user_purchase_mean, product_purchase_mean, overall_mean_purchase)
test = add_purchase_means(test, user_purchase_mean, product_purchase_mean, overall_mean_purchase)

In [None]:
# Base LightGBM regressor whose hyper-parameters are tuned by the randomized search.
# subsample_freq=1 switches bagging on: LightGBM ignores `subsample`
# (bagging_fraction) while subsample_freq (bagging_freq) is left at its default
# of 0, which would turn the 'subsample' entry of the search space into a no-op.
lgb_model = lgb.LGBMRegressor(objective='regression', random_state=42, subsample_freq=1)

In [None]:
# Candidate values for the randomized hyper-parameter search (two options each)
param_dist = dict(
    n_estimators=[1000, 1500],
    learning_rate=[0.01, 0.05],
    num_leaves=[31, 63],
    max_depth=[-1, 10],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_lambda=[0, 1],
    reg_alpha=[0, 1],
)


In [None]:
# Randomized hyper-parameter search: 5 sampled candidates, each scored by
# negative RMSE under 2-fold cross-validation, with n_jobs=-1 for parallel fits
random_search = RandomizedSearchCV(
    lgb_model,
    param_dist,
    n_iter=5,
    cv=2,  # deliberately reduced number of CV folds
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)



In [None]:
# Run the search on the training split only; the validation split is kept
# aside for the evaluation further down
random_search.fit(X=X_train, y=y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1621
[LightGBM] [Info] Number of data points in the train set: 440054, number of used features: 15
[LightGBM] [Info] Start training from score 9266.733955


In [None]:
# Keep a handle on the best estimator found by the search and show its settings
best_lgb_model_sample = random_search.best_estimator_
print(str(best_lgb_model_sample))


LGBMRegressor(colsample_bytree=0.8, learning_rate=0.05, n_estimators=1000,
              num_leaves=63, objective='regression', random_state=42,
              reg_alpha=1, reg_lambda=1)


In [None]:
# Best model: alias for the search's best_estimator_ (the same object, not a
# copy), so fitting best_lgb later also refits random_search.best_estimator_
best_lgb = random_search.best_estimator_

In [None]:
# Score the tuned model on the held-out validation split (RMSE = sqrt of MSE)
y_pred = best_lgb.predict(X_val)
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f'Optimized LightGBM RMSE: {rmse}')

Optimized LightGBM RMSE: 2447.9279559949296


In [None]:
# Preparation for refitting the best model on all labelled rows:
# per-user and per-product mean purchase over the complete training set
user_purchase_mean_full = train['Purchase'].groupby(train['User_ID']).mean()
product_purchase_mean_full = train['Purchase'].groupby(train['Product_ID']).mean()

In [None]:
# Attach the full-training-set means to X, keyed by the matching ID column
for feature_col, id_col, lookup in (
    ('User_Purchase_Mean', 'User_ID', user_purchase_mean_full),
    ('Product_Purchase_Mean', 'Product_ID', product_purchase_mean_full),
):
    X[feature_col] = X[id_col].map(lookup)

In [None]:
# Refit the selected model on every labelled row (training + validation splits)
best_lgb.fit(X=X, y=y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.138859 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1620
[LightGBM] [Info] Number of data points in the train set: 550068, number of used features: 15
[LightGBM] [Info] Start training from score 9263.968713


In [None]:
# Score the unlabelled test rows with the refitted model
test_predictions = best_lgb.predict(X=test)

In [None]:
# Submission frame: the original test identifiers plus the predicted Purchase
submission = test_user_product_ids[['User_ID', 'Product_ID']].assign(
    Purchase=test_predictions
)

In [None]:
# Write the submission CSV (index column omitted) next to the notebook
submission_csv = 'optimized_purchase_predictions.csv'
submission.to_csv(submission_csv, index=False)

# Hand the file to the browser through the Colab download helper
from google.colab import files
files.download(submission_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

NameError: name 'df' is not defined