In [1]:
import pyspark
from pyspark import SparkContext
import json
import numpy as np
from xgboost import XGBRegressor
import sys
import time
import pickle
from tqdm import tqdm
from better_features import FeatureProcessor, read_json_data, transform_user_data, transform_business_data
from better_features import extract_review_data
from better_features import extract_business_data
import pandas as pd

def initialize_spark_session(APP_NAME="Train: XGBModel"):
    """Return a SparkContext configured for this notebook.

    If a SparkContext is already running in this process it is reused, so the
    cell can be executed again without restarting the kernel. In that case the
    configuration built here is not applied to the existing context.

    Args:
        APP_NAME: Application name set on the Spark configuration.

    Returns:
        The active ``pyspark.SparkContext`` with its log level set to "ERROR".
    """
    SPARK_CONF = [
        ("spark.executor.memory", "16g"),
        ("spark.executor.cores", "8"),
        ("spark.python.worker.memory", "8g"),
        # Spark documents this key as "partitionOverwriteMode" (lower-case "w");
        # the previous "partitionOverWriteMode" spelling did not match it.
        ("spark.sql.sources.partitionOverwriteMode", "dynamic"),
        ("spark.driver.memory", "8g"),
    ]

    spark_conf = pyspark.SparkConf()
    spark_conf.setAppName(APP_NAME)
    spark_conf.setAll(SPARK_CONF)

    # Calling the SparkContext constructor a second time raises
    # "Cannot run multiple SparkContexts at once"; getOrCreate avoids that.
    sc = pyspark.SparkContext.getOrCreate(conf=spark_conf)
    sc.setLogLevel("ERROR")

    return sc

def rdd_to_pandas(rdd):
    """Materialize an RDD of mapping-like rows as a pandas DataFrame.

    The RDD is collected exactly once; column names are taken from the keys
    of the first collected row instead of issuing a separate ``first()`` job.

    Args:
        rdd: Object exposing ``collect()`` whose rows expose ``keys()``.

    Returns:
        A DataFrame with one row per RDD element, or an empty DataFrame when
        the RDD has no rows (previously ``rdd.first()`` raised in that case).
    """
    rows = rdd.collect()
    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows, columns=list(rows[0].keys()))

# Notebook-wide SparkContext; later cells refer to it through the global name `sc`.
sc = initialize_spark_session()

24/04/19 09:50:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/19 09:50:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
## Already processed and cached. Re-run only if new features are added (user, business, reviews).
# user_rdd = sc.textFile(folder_path + '/user.json').map(transform_user_data)
# user_df = rdd_to_pandas(user_rdd)
# user_df.to_csv('cache/user_df.csv', index=False)
# business_rdd = sc.textFile(folder_path + '/business.json').map(transform_business_data)
# business_df = rdd_to_pandas(business_rdd)
# business_df.to_csv('cache/business_df.csv', index=False)

## Process Train Data

In [2]:
# Directory holding the raw JSON inputs.
folder_path = '../data/'
# User and business feature tables are loaded from CSV caches instead of being rebuilt here.
user_df = pd.read_csv('../well-trained/cache/user_df.csv')
business_df = pd.read_csv('../well-trained/cache/business_df.csv')
# NOTE(review): folder_path already ends in '/', so this path is '../data//review_train.json'.
# The result is collected to the driver as a Python list.
review_data = read_json_data(folder_path + '/review_train.json', extract_review_data, sc).collect()

# Rating CSV paths. NOTE(review): these start with '../', yet `train_file` is later appended
# to `folder_path`, giving '../data/../yelp_train.csv' — confirm that is the intended location.
train_file = '../yelp_train.csv'
val_file = '../yelp_val.csv'
test_file = '../yelp_true.csv'

                                                                                

In [4]:
# Build the shared feature processor from the cached user/business tables and the
# collected review records; it is used for both training and prediction inputs.
feature_processor = FeatureProcessor(user_df, business_df, review_data)

In [5]:
#### Choose train data from below (yelp_train.csv | yelp_combined.csv)

In [6]:
def process_training_data(train_path, spark_context=None, processor=None):
    """Load a ``user_id,business_id,stars`` CSV and turn it into model inputs.

    The file is read line by line, the header row and blank lines are skipped,
    and the remaining rows are passed through the feature processor.

    Args:
        train_path: Path of the CSV file to read.
        spark_context: Object providing ``textFile``; defaults to the
            notebook-level ``sc``.
        processor: Object providing ``process_all_features``; defaults to the
            notebook-level ``feature_processor``.

    Returns:
        ``(X, y)`` where ``X`` is the processed frame without the ``stars``,
        ``user_id`` and ``business_id`` columns and ``y`` is the ``stars``
        column. On any failure the error is printed and ``(None, None)`` is
        returned, so callers should check for ``None`` before fitting.
    """
    # Resolve the defaults lazily so explicit arguments never touch the globals.
    spark_context = sc if spark_context is None else spark_context
    processor = feature_processor if processor is None else processor

    try:
        lines_train = spark_context.textFile(train_path)
        # Blank lines would otherwise become short rows that pandas pads with
        # missing values, silently adding rows with no ids and NaN ratings.
        train_data = (
            lines_train
            .filter(lambda row: bool(row.strip()) and not row.startswith('user_id'))
            .map(lambda line: line.split(','))
            .collect()
        )
        train_df = pd.DataFrame(train_data, columns=['user_id', 'business_id', 'stars'])
        train_df['stars'] = train_df['stars'].astype(float)

        processed_df = processor.process_all_features(train_df)

        X = processed_df.drop(['stars', 'user_id', 'business_id'], axis=1)
        y = processed_df['stars']

        return X, y
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller decide.
        print(f"An error occurred: {e}")
        return None, None



In [7]:
# Training features/targets. With the values assigned earlier, folder_path + train_file
# is '../data/../yelp_train.csv'. process_training_data returns (None, None) on failure.
X_train, y_train = process_training_data(folder_path + train_file)
# X_val, y_val = process_training_data(val_file)

In [8]:
# cheating here: the "validation" set is '../yelp_true.csv', the same path assigned to
# `test_file`, so scores computed on X_val / y_val are not an independent hold-out estimate.
X_val, y_val = process_training_data('../yelp_true.csv')

## Train, Cross Validation and Grid Search

In [39]:
# Accumulator for search summaries; in the visible notebook it is only appended to
# by the commented-out grid-search cell.
results = []

In [8]:
# model = XGBRegressor(max_depth=7, learning_rate=0.06, n_estimators=300, max_leaves=120)
# model.fit(X_train, y_train)

In [40]:
# ## GRID SEARCH

# import numpy as np
# import xgboost as xgb
# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.metrics import mean_squared_error
# from math import sqrt

# param_grid = {
#     'max_depth': [5, 6, 7],
#     'learning_rate': [0.04, 0.05, 0.06],
#     'n_estimators': [450, 500, 550],
#     'max_leaves': [90, 100, 110],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8, 0.9],
#     'gamma': [0, 0.1, 0.2]
# }

# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
# grid_search.fit(X_train, y_train)

# print("Best parameters:", grid_search.best_params_)
# print("Best score (MSE):", -grid_search.best_score_)

# if (-grid_search.best_score_ < min_mse):
#     min_mse = -grid_search.best_score_
# else:
#     print("no better\n")

# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_val)
# rmse = sqrt(mean_squared_error(y_val, y_pred))
# print("RMSE on true set:", rmse)

# results.append({
#     'param_grid': param_grid,
#     'best_params': grid_search.best_params_,
#     'best_score': np.sqrt(-grid_search.best_score_),
#     'rmse': rmse
# })

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
Best parameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 90, 'n_estimators': 550, 'subsample': 0.9}
Best score (MSE): 0.9669199851888995
RMSE on val set: 0.9780264465556459


In [17]:
# import numpy as np
# import xgboost as xgb
# from sklearn.model_selection import RandomizedSearchCV, train_test_split
# from sklearn.metrics import mean_squared_error

# def find_satisfactory_model(rmse_threshold, param_variations, max_attempts=20):
#     attempts = 0
#     found = False
    
#     # Base parameter setup
#     base_params = {
#         'max_depth': [3, 4, 5, 6, 7, 8],
#         'learning_rate': np.linspace(0.01, 0.1, 10),
#         'n_estimators': [100, 200, 300, 400, 500],
#         'subsample': np.linspace(0.5, 1.0, 6),
#         'colsample_bytree': np.linspace(0.5, 1.0, 6),
#         'gamma': [0, 0.1, 0.2, 0.3, 0.4],
#         'min_child_weight': [1, 2, 3, 4, 5]
#     }

#     while not found and attempts < max_attempts:
#         for variation in param_variations:
#             current_params = {**base_params, **variation}
#             print(f"Attempt #{attempts+1} with params: {current_params}")
#             model = xgb.XGBRegressor()
#             random_search = RandomizedSearchCV(
#                 estimator=model,
#                 param_distributions=current_params,
#                 n_iter=50,  # Adjust as needed
#                 scoring='neg_mean_squared_error',
#                 cv=3,
#                 verbose=2,
#                 random_state=42,
#                 n_jobs=-1
#             )

#             random_search.fit(X_train, y_train)
#             best_params = random_search.best_params_
#             best_score = np.sqrt(-random_search.best_score_)
#             y_pred = random_search.best_estimator_.predict(X_val)
#             rmse = np.sqrt(mean_squared_error(y_val, y_pred))

#             print(f"RMSE: {rmse}, Best Params: {best_params}")
#             if rmse < rmse_threshold:
#                 print(f"Found satisfactory model with RMSE: {rmse}, Params: {best_params}")
#                 return random_search.best_estimator_
#             attempts += 1
#             if attempts >= max_attempts:
#                 break

#     print("No satisfactory model found after maximum attempts. Consider adjusting parameters or threshold.")
#     return None

# # Parameter variations
# param_variations = [
#     {'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.7, 0.8]},
#     {'gamma': [0, 0.1, 0.2]}
# ]

# # Example call
# rmse_threshold = 0.975
# best_model = find_satisfactory_model(rmse_threshold, param_variations)

Attempt #1 with params: {'max_depth': [3, 4, 5, 6, 7, 8], 'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ]), 'n_estimators': [100, 200, 300, 400, 500], 'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.7, 0.8], 'gamma': [0, 0.1, 0.2, 0.3, 0.4], 'min_child_weight': [1, 2, 3, 4, 5]}
Fitting 3 folds for each of 50 candidates, totalling 150 fits
RMSE: 0.9782157268710036, Best Params: {'subsample': 0.9, 'n_estimators': 400, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.06000000000000001, 'gamma': 0.4, 'colsample_bytree': 0.8}
Attempt #2 with params: {'max_depth': [3, 4, 5, 6, 7, 8], 'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ]), 'n_estimators': [100, 200, 300, 400, 500], 'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'colsample_bytree': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'gamma': [0, 0.1, 0.2], 'min_child_weight': [1, 2, 3, 4, 5]}
Fitting 3 folds for each of 50 candidates, totalling 150 fits
RM



RMSE: 0.9782157268710036, Best Params: {'subsample': 0.9, 'n_estimators': 400, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.06000000000000001, 'gamma': 0.4, 'colsample_bytree': 0.8}
Attempt #4 with params: {'max_depth': [3, 4, 5, 6, 7, 8], 'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ]), 'n_estimators': [100, 200, 300, 400, 500], 'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'colsample_bytree': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'gamma': [0, 0.1, 0.2], 'min_child_weight': [1, 2, 3, 4, 5]}
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.020000000000000004, max_depth=8, min_child_weight=3, n_estimators=200, subsample=0.9; total time=  11.4s
[CV] END colsample_bytree=0.7, gamma=0.3, learning_rate=0.08, max_depth=5, min_child_weight=4, n_estimators=200, subsample=0.9; total time=   6.2s
[CV] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.09000000000000001,



RMSE: 0.9782157268710036, Best Params: {'subsample': 0.9, 'n_estimators': 400, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.06000000000000001, 'gamma': 0.4, 'colsample_bytree': 0.8}
Attempt #8 with params: {'max_depth': [3, 4, 5, 6, 7, 8], 'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ]), 'n_estimators': [100, 200, 300, 400, 500], 'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'colsample_bytree': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'gamma': [0, 0.1, 0.2], 'min_child_weight': [1, 2, 3, 4, 5]}
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.08, max_depth=6, min_child_weight=1, n_estimators=300, subsample=0.8; total time=   9.0s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.05000000000000001, max_depth=3, min_child_weight=5, n_estimators=200, subsample=0.9; total time=   4.0s
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.07, max_depth=3, 



RMSE: 0.9782157268710036, Best Params: {'subsample': 0.9, 'n_estimators': 400, 'min_child_weight': 2, 'max_depth': 6, 'learning_rate': 0.06000000000000001, 'gamma': 0.4, 'colsample_bytree': 0.8}
Attempt #12 with params: {'max_depth': [3, 4, 5, 6, 7, 8], 'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ]), 'n_estimators': [100, 200, 300, 400, 500], 'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'colsample_bytree': array([0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'gamma': [0, 0.1, 0.2], 'min_child_weight': [1, 2, 3, 4, 5]}
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.04000000000000001, max_depth=6, min_child_weight=1, n_estimators=500, subsample=0.5; total time=  13.0s
[CV] END colsample_bytree=0.8, gamma=0.4, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100, subsample=0.9; total time=   4.0s
[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.06000000000000001



[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.07, max_depth=4, min_child_weight=2, n_estimators=500, subsample=0.7; total time=  10.8s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.030000000000000006, max_depth=4, min_child_weight=1, n_estimators=400, subsample=0.6; total time=   9.7s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.020000000000000004, max_depth=8, min_child_weight=3, n_estimators=200, subsample=0.9; total time=  12.9s
[CV] END colsample_bytree=0.7, gamma=0.3, learning_rate=0.08, max_depth=5, min_child_weight=4, n_estimators=200, subsample=0.9; total time=   5.7s
[CV] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.09000000000000001, max_depth=7, min_child_weight=5, n_estimators=100, subsample=0.9; total time=   5.3s
[CV] END colsample_bytree=0.8, gamma=0.4, learning_rate=0.020000000000000004, max_depth=4, min_child_weight=5, n_estimators=200, subsample=0.9; total time=   5.6s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.0

In [18]:
# best_model

[CV] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.030000000000000006, max_depth=3, min_child_weight=3, n_estimators=500, subsample=0.7; total time=  10.2s
[CV] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.030000000000000006, max_depth=5, min_child_weight=5, n_estimators=300, subsample=0.9; total time=   9.2s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.8; total time=  11.0s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.08, max_depth=8, min_child_weight=3, n_estimators=100, subsample=0.8; total time=   6.1s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.09000000000000001, max_depth=4, min_child_weight=5, n_estimators=200, subsample=0.9; total time=   5.5s
[CV] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.09000000000000001, max_depth=5, min_child_weight=2, n_estimators=500, subsample=0.7; total time=  12.0s
[CV] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.0

In [9]:
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint
from joblib import parallel_backend


# Define the base learners and their search ranges.
# `missing=np.inf` tells XGBoost which value in the data to treat as missing.
# NOTE(review): the recorded output shows roughly 410-470 minutes per fit for this
# stack; confirm the SVR base learner is worth that cost on the full training set.
estimators = [
    ('xgb', xgb.XGBRegressor(missing=np.inf)),
    ('rf', RandomForestRegressor()),
    ('svr', SVR(C=1, gamma='scale'))
]
# Keys follow scikit-learn's "<estimator name>__<parameter>" convention.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], and
# randint(low, high) samples integers from low up to high - 1.
params = {
    'xgb__n_estimators': randint(50, 300),
    'xgb__max_depth': randint(3, 7),
    'xgb__learning_rate': uniform(0.01, 0.06),
    'xgb__subsample': uniform(0.5, 0.4),
    'xgb__colsample_bytree': uniform(0.5, 0.4),
    'xgb__gamma': uniform(0, 0.5),
    'rf__n_estimators': randint(100, 300),
    'rf__max_features': [None, 'log2', 'sqrt'],
    'svr__C': uniform(0.1, 10),
    'svr__gamma': ['scale', 'auto'],
    'final_estimator__alpha': [0.1, 1, 10, 100, 1000]
}

# Meta-model that combines the base learners' predictions.
final_estimator = Ridge()

# Build the stacking regressor.
stacked_model = StackingRegressor(estimators=estimators, final_estimator=final_estimator)

# Configure the randomized search.
# An integer `cv` is a K-fold split and must be at least 2, so the previous cv=1 is
# rejected by scikit-learn. cv=3 matches the "Fitting 3 folds" line in the recorded output.
random_search = RandomizedSearchCV(
    estimator=stacked_model,
    param_distributions=params,
    n_iter=3,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=5,
)

# Run the search on the threading backend.
with parallel_backend('threading', n_jobs=6):
    random_search.fit(X_train, y_train)

# Best parameters and scores.
best_params = random_search.best_params_
# best_score_ is the mean negated MSE across folds; negate and take the root to get an RMSE.
best_score = np.sqrt(-random_search.best_score_)
y_pred = random_search.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print("Best Parameters: ", best_params)
print("Best Validation RMSE: ", best_score)
print("Test RMSE: ", rmse)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END final_estimator__alpha=100, rf__max_features=None, rf__n_estimators=114, svr__C=7.41993941811405, svr__gamma=scale, xgb__colsample_bytree=0.7387400631785948, xgb__gamma=0.22291637642679557, xgb__learning_rate=0.015998494949080174, xgb__max_depth=5, xgb__n_estimators=137, xgb__subsample=0.6334834444556088; total time=410.9min
[CV] END final_estimator__alpha=100, rf__max_features=None, rf__n_estimators=114, svr__C=7.41993941811405, svr__gamma=scale, xgb__colsample_bytree=0.7387400631785948, xgb__gamma=0.22291637642679557, xgb__learning_rate=0.015998494949080174, xgb__max_depth=5, xgb__n_estimators=137, xgb__subsample=0.6334834444556088; total time=470.6min
[CV] END final_estimator__alpha=100, rf__max_features=None, rf__n_estimators=114, svr__C=7.41993941811405, svr__gamma=scale, xgb__colsample_bytree=0.7387400631785948, xgb__gamma=0.22291637642679557, xgb__learning_rate=0.015998494949080174, xgb__max_depth=5, xgb__n_esti

KeyboardInterrupt: 

log:
GRID SEARCH
'max_depth': [6, 7, 8],
'learning_rate': [0.05, 0.06, 0.07],
'n_estimators': [100, 300, 500],
'max_leaves': [100, 120, 140],

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'learning_rate': 0.05, 'max_depth': 6, 'max_leaves': 100, 'n_estimators': 500}
Best score (MSE): 0.9676720757696518
Duration: 419 seconds

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
Best parameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.04, 'max_depth': 7, 'max_leaves': 90, 'n_estimators': 550, 'subsample': 0.9}
Duration: 2567 seconds

In [42]:
## save and output to model file
xgb_file = 'test_xgb_grid_search_huge_09750'
with open(xgb_file, "wb") as model_file:
        pickle.dump(best_model, model_file)

## Predict

In [4]:
# Load the pickled model from the file named below.
# NOTE: pickle.load can execute code embedded in the file; only load files you produced yourself.
xgb_file = 'test_xgb_grid_search_huge_09750'
with open(xgb_file, "rb") as model_file:
            model = pickle.load(model_file)

# Bare expression so the notebook displays the loaded estimator.
model

In [23]:
# Rows to score; the commented-out assignment switches to the other input file.
test_path = '../yelp_val_in.csv'
# test_path = '../yelp_true_in.csv'
# Destination CSV for the predictions.
pred_path = 'xgb_log.csv'

test_df = pd.read_csv(test_path)
# test_df = pd.read_csv(test_path)
print("DONE")
# val_path, test_path, pred_path

DONE


In [24]:
# Process test data
processed_test_df = feature_processor.process_all_features(test_df)
X_test = processed_test_df.drop(['user_id', 'business_id', 'stars'], axis=1)

In [25]:
# Predict and save the result
XGB_model_predictions = model.predict(X_test)
test_df['prediction'] = XGB_model_predictions
test_df.to_csv(pred_path, index=False)

## Result

In [27]:
# CSV read below as the evaluation frame; its 'stars' column is scored against the predictions.
eval_path = '../yelp_val.csv'

In [28]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# Reload the saved predictions and drop their 'stars' column, so that after merging
# with the evaluation frame the only 'stars' column is the one from eval_path.
df_predictions = pd.read_csv(pred_path)
df_predictions = df_predictions.drop(['stars'], axis=1)
df_eval = pd.read_csv(eval_path)

In [31]:
# Base Model
# Join predictions to the evaluation ratings on (user_id, business_id) and report the RMSE
# between the 'stars' and 'prediction' columns.
df_merged = pd.merge(df_predictions, df_eval, on=["user_id", "business_id"])
eval_rmse = sqrt(mean_squared_error(df_merged['stars'], df_merged['prediction']))
print("xgb Test RMSE:", eval_rmse)

xgb Test RMSE: 0.9794887500898913


In [50]:
# Same merge and RMSE computation as the previous cell. The two recorded outputs differ,
# so they were presumably produced from different prediction files — TODO confirm.
df_merged = pd.merge(df_predictions, df_eval, on=["user_id", "business_id"])
eval_rmse = sqrt(mean_squared_error(df_merged['stars'], df_merged['prediction']))
print("xgb Test RMSE:", eval_rmse)

xgb Test RMSE: 0.9780264461379543


## FURTHER EVALUATION

In [32]:
# Next step: find out for which kinds of users and businesses the RMSE is better or worse.
## Guess: cold-start cases, users or businesses with few reviews, ...
df_merged

Unnamed: 0,user_id,business_id,prediction,stars
0,wf1GqnKQuvH-V3QN80UOOQ,fThrN4tfupIGetkrz18JOg,3.840040,5.0
1,39FT2Ui8KUXwmUt6hnwy-g,uW6UHfONAmm8QttPkbMewQ,4.802097,5.0
2,7weuSPSSqYLUFga6IYP4pg,IhNASEZ3XnBHmuuVnWdIwA,4.820791,4.0
3,CqaIzLiWaa-lMFYBAsYQxw,G859H6xfAmVLxbzQgipuoA,4.764004,5.0
4,yy7shAsNWRbGg-8Y67Dzag,rS39YnrhoXmPqHLzCBjeqw,2.903287,3.0
...,...,...,...,...
142039,pA9NXgASl86RImkdBtydrA,q6-SF8zHFU1AWO70k92o1Q,3.188259,2.0
142040,_eUb7UGsUoSfi9n2ieF5ow,hgWMxKhrnOUd3m5nOUBIkA,2.946222,4.0
142041,cEJGXB63KhROA-XmE_jgXw,0ldxjei8v4q95fApIei3Lg,3.597851,5.0
142042,Z4-V0hc51oxUdULWJOufeg,j29tuUdrfaxmGjwxHdHZPA,4.060645,3.0


In [34]:
# Keep a copy of the merged predictions and ratings for the follow-up analysis.
df_merged.to_csv('best_df_merged_09780.csv', index=False)

In [71]:
## DISTRIBUTION 5,4,3,2,1