In [1]:
from datetime import datetime

from scipy.stats import skew  
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor

import numpy as np
import pandas as pd

import umap.umap_ as umap
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from sklearn.feature_extraction.text import TfidfVectorizer

import hdbscan
import seaborn as sns

2025-07-08 08:10:31.083737: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751929831.099140  117604 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751929831.103615  117604 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751929831.114951  117604 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751929831.114967  117604 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751929831.114968  117604 computation_placer.cc:177] computation placer alr

In [None]:
# Load the raw train / test tables.
train = pd.read_csv('house_price/dataset.csv')
test = pd.read_csv('house_price/test.csv')
print("Train set size:", train.shape)
print("Test set size:", test.shape)
print('START data processing', datetime.now(), )

# Keep the ids aside, then drop the 'id' column: it is not used as a predictor.
train_ID, test_ID = train['id'], test['id']
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

# No outlier rows are removed here; the index is simply reset.
train = train.reset_index(drop=True)

# Model the target on a log scale: np.log1p applies log(1 + x) element-wise.
train["sale_price"] = np.log1p(train["sale_price"])
y = train["sale_price"].reset_index(drop=True)
train_features = train.drop(columns=['sale_price'])
test_features = test

# Stack train rows first, then test rows, under a fresh 0..n-1 index.
features = pd.concat([train_features, test_features], ignore_index=True)
print(features.shape)
# Some of the non-numeric predictors are stored as numbers; converting them to strings is still a TODO (no such conversion is applied in this cell).


# === Missing-value handling and feature encoding === #
def _address_strings(df):
    """Return one lower-cased "<city> <subdivision>" string per row; missing parts become ''."""
    # Lower-case the whole string (previously one call site lower-cased only the subdivision).
    return (df['city'].fillna('') + ' ' + df['subdivision'].fillna('')).str.lower()


def _split_sale_date(df):
    """Replace `sale_date` (when present) with numeric `sale_year` / `sale_month` columns.

    Must run BEFORE categorical NaNs are replaced by the string 'None': parsing
    that placeholder as a date fails, whereas a real NaN parses to NaT and
    yields NaN year/month values that the numeric fill can handle.
    """
    if 'sale_date' not in df.columns:
        return df
    sale_dates = pd.to_datetime(df['sale_date'])  # parse once, reuse for both parts
    df['sale_year'] = sale_dates.dt.year
    df['sale_month'] = sale_dates.dt.month
    return df.drop(columns='sale_date')


def preprocess_and_encode(df, y=None, drop_high_card=True, plot_clusters=True):
    """Clean `df`, add address-cluster features and return a fully numeric feature frame.

    Steps, in order:
      1. Two address-cluster features are derived from the 'city' and
         'subdivision' text (TF-IDF -> UMAP -> KMeans / HDBSCAN).
      2. `sale_date`, if present, is split into `sale_year` / `sale_month`.
      3. Missing values are filled: 'None' for object columns, 0 for numeric ones.
      4. Object columns with at most 50 distinct values are one-hot encoded;
         those with more are target-encoded when `y` is given.
      5. Numeric columns are standardised with `StandardScaler`.

    Every vectoriser, reducer, clusterer, encoder and scaler is fitted on the
    rows of `df` passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Input features. Must contain 'city' and 'subdivision' columns. The
        caller's frame is not modified (a copy is taken).
    y : array-like, optional
        Target used for target encoding. When None, 'subdivision' is not
        target-encoded and every object column with more than 50 distinct
        values is left out of the result.
    drop_high_card : bool, default True
        Accepted for backward compatibility; this function does not consult it.
    plot_clusters : bool, default True
        When True, draw a seaborn scatter of the 2-D UMAP embedding coloured by
        KMeans cluster (the historical behaviour). Pass False to skip the plot.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the scaled numeric columns, the one-hot
        columns and (when `y` is given) the target-encoded columns, indexed
        like `df`.
    """
    df = df.copy()

    # Raw address text, shared by both clustering variants below.
    addresses = _address_strings(df)

    # --- Address clusters, variant 1: TF-IDF (300 terms) -> 2-D UMAP -> KMeans(30) ---
    vectorizer = TfidfVectorizer(max_features=300, stop_words='english')
    address_vecs = vectorizer.fit_transform(addresses)
    umap_vecs = umap.UMAP(n_components=2, random_state=42).fit_transform(address_vecs)
    kmeans = KMeans(n_clusters=30, random_state=42)
    labels = kmeans.fit_predict(umap_vecs)
    # Stored as strings so the labels are treated as categories, not magnitudes.
    df['addr_cluster'] = labels.astype(str)

    if plot_clusters:
        # Visual check of the 2-D embedding, coloured by KMeans cluster.
        fr = pd.DataFrame({'umap_vecs1': umap_vecs[:, 0], 'umap_vecs2': umap_vecs[:, 1], 'cluster': kmeans.labels_})
        sns.lmplot(data=fr, x='umap_vecs1', y='umap_vecs2', hue='cluster', fit_reg=False)

    # --- Address clusters, variant 2: TF-IDF (500 terms) -> 5-D cosine UMAP -> HDBSCAN ---
    vec = TfidfVectorizer(max_features=500, stop_words='english')
    X_text = vec.fit_transform(addresses)
    X_umap = umap.UMAP(
        n_neighbors=10,
        min_dist=0.1,
        n_components=5,
        metric='cosine',
        random_state=42
    ).fit_transform(X_text)
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=20,
        min_samples=5,
        cluster_selection_epsilon=0.1
    )
    labels = clusterer.fit_predict(X_umap)
    df['addr_cluster222'] = labels.astype(str)

    # ================= Cleaning ================= #
    # Split the date first, while missing dates are still NaN (see _split_sale_date).
    df = _split_sale_date(df)

    # Missing categories become the literal category 'None'.
    cat_cols = df.select_dtypes(include='object').columns
    df[cat_cols] = df[cat_cols].fillna('None')

    # Missing numeric values (including year/month of a missing date) become 0.
    num_cols = df.select_dtypes(include=[np.number]).columns
    df[num_cols] = df[num_cols].fillna(0)

    # 'subdivision' gets a dedicated target encoding when a target is available.
    if 'subdivision' in df.columns and y is not None:
        encoder_sub = ce.TargetEncoder()
        df['subdivision_encoded'] = encoder_sub.fit_transform(df['subdivision'], y)
        df = df.drop(columns='subdivision')

    # ================= Encoding ================= #
    cat_cols = df.select_dtypes(include='object').columns
    cardinality = df[cat_cols].nunique()  # computed once, used for both splits
    low_card_cols = [col for col in cat_cols if cardinality[col] <= 50]
    high_card_cols = [col for col in cat_cols if cardinality[col] > 50]

    # One-hot encode low-cardinality columns. NaNs were already replaced by
    # 'None' above, so dummy_na=True only contributes all-zero '<col>_nan'
    # columns; kept so the output columns stay the same as before.
    X_cat = pd.get_dummies(df[low_card_cols], dummy_na=True)

    # Target-encode high-cardinality columns; without a target they are omitted.
    if y is not None and high_card_cols:
        encoder = ce.TargetEncoder()
        X_target = encoder.fit_transform(df[high_card_cols], y)
    else:
        X_target = pd.DataFrame(index=df.index)

    # Standardise numeric columns.
    num_cols = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    X_num_scaled = pd.DataFrame(scaler.fit_transform(df[num_cols]), columns=num_cols, index=df.index)

    # Assemble the final design matrix.
    X_final_df = pd.concat([X_num_scaled, X_cat, X_target], axis=1)

    return X_final_df

# Apply the preprocessing to the combined train + test feature frame
X_final = preprocess_and_encode(df=features)

Train set size: (200000, 47)
Test set size: (200000, 46)
START data processing 2025-07-08 08:10:36.000141
(400000, 45)


  warn(


In [None]:
# Display the (rows, columns) shape of the encoded feature matrix.
X_final.shape

In [None]:
# `features` was built train-first, then test, so a positional split at
# len(train_features) separates the two; each part gets a fresh 0..n-1 index.
X, X_sub = (
    X_final.iloc[rows].reset_index(drop=True)
    for rows in (slice(None, len(train_features)), slice(len(train_features), None))
)


In [None]:
# Sanity check: print the shapes of the training matrix, target and submission matrix.
print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

In [None]:
# outliers = [30, 88, 462, 631, 1322]
# X = X.drop(X.index[outliers])
# y = y.drop(y.index[outliers])


# Address-cluster features that must survive the near-constant filter below.
# They are listed by their pre-encoding names; once one-hot encoded by
# pd.get_dummies they appear in X as '<name>_<label>', so they have to be
# matched by prefix as well as by exact name (an exact-name check alone never
# matches an encoded column).
protected_cols = ['addr_cluster', 'addr_cluster222']
protected_prefixes = tuple(name + '_' for name in protected_cols)

# A column is treated as near-constant when its most frequent value covers
# more than 99.94% of the training rows.
overfit = [
    col for col in X.columns
    if col not in protected_cols
    and not str(col).startswith(protected_prefixes)
    and X[col].value_counts().iloc[0] / len(X) * 100 > 99.94
]


print(overfit)

# X = X.drop(overfit, axis=1, errors='ignore')
# X_sub = X_sub.drop(overfit, axis=1, errors='ignore')
X = X.drop(overfit, axis=1)
X_sub = X_sub.drop(overfit, axis=1)

print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

# ################## ML ########################################
print('START ML', datetime.now(), )

# Shared 10-fold splitter (shuffled, fixed seed) for every cross-validation below.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)


# rmsle
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between `y` and `y_pred`.

    No logarithm is applied here. The result is a root-mean-squared *log*
    error only when both arguments are already on a log scale, as the
    log1p-transformed `sale_price` target in this notebook is.
    """
    squared_error = mean_squared_error(y, y_pred)
    return np.sqrt(squared_error)


# build our model scoring function
def cv_rmse(model, X=X):
    """Cross-validated RMSE of `model`: one value per fold of `kfolds`.

    `X` defaults to the module-level `X` as it was when this function was
    defined (default arguments are evaluated once, at definition time); the
    target `y` and the splitter `kfolds` are read from module scope at call time.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, y,
        cv=kfolds,
        scoring="neg_mean_squared_error",
    )
    # The scorer reports negated MSE, so flip the sign before taking the root.
    per_fold_rmse = np.sqrt(-neg_mse_per_fold)
    return per_fold_rmse


# setup models
# Candidate regularisation grids searched by the CV-tuned linear models.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# Linear models and the SVR sit behind a RobustScaler.
ridge = make_pipeline(
    RobustScaler(),
    RidgeCV(alphas=alphas_alt, cv=kfolds),
)

lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=5000, alphas=alphas2, random_state=42, cv=kfolds),
)

elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=5000, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio),
)

# NOTE(review): kernel SVR training cost grows much faster than linearly with
# the number of rows; with the ~200k training rows printed earlier in this
# notebook, fitting it may be very slow - confirm it is still wanted.
svr = make_pipeline(
    RobustScaler(),
    SVR(C=1.0, epsilon=0.1, gamma='scale'),
)

gbr = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)

lightgbm = LGBMRegressor(
    objective='regression',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=15,
    max_depth=5,
    subsample=0.8,
    # LightGBM only performs row subsampling when the bagging frequency is
    # non-zero; with the default of 0 the `subsample` setting above is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

xgboost = XGBRegressor(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    n_jobs=-1,
    reg_alpha=0.0001,
    seed=42,
)



                                

print('TEST score on CV')

# (label, estimator) pairs to score, in order. Uncomment an entry to include
# that model in the cross-validation report.
cv_candidates = [
    # ("Kernel Ridge", ridge),
    # ("Lasso", lasso),
    # ("ElasticNet", elasticnet),
    # ("SVR", svr),
    ("Lightgbm", lightgbm),
    ("GradientBoosting", gbr),
    ("Xgboost", xgboost),
]

# For each model: mean and standard deviation of the per-fold RMSE, followed
# by a timestamp so the duration of each cross-validation can be read off.
for label, model in cv_candidates:
    score = cv_rmse(model)
    print("{} score: {:.4f} ({:.4f})\n".format(label, score.mean(), score.std()), datetime.now(), )

print('finished')

In [None]:
# Fit every base model on the full training data (X, y). A timestamp and the
# model name are printed immediately before each fit starts, so the gap between
# consecutive lines shows how long the previous fit took.
# Each `*_model_full_data` name is assigned right after its own fit, so models
# fitted before an interruption remain available.
print('START Fit')
# Stacked model: currently disabled.
#print(datetime.now(), 'StackingRegressor')
#stack_gen_model = stack_gen.fit(X, y)
print(datetime.now(), 'elasticnet')
elastic_model_full_data = elasticnet.fit(X, y)
print(datetime.now(), 'lasso')
lasso_model_full_data = lasso.fit(X, y)
print(datetime.now(), 'ridge')
ridge_model_full_data = ridge.fit(X, y)
# NOTE(review): the SVR fit may be very slow on a training set of this size - confirm it is intended.
print(datetime.now(), 'svr')
svr_model_full_data = svr.fit(X, y)
print(datetime.now(), 'GradientBoosting')
gbr_model_full_data = gbr.fit(X, y)
print(datetime.now(), 'xgboost')
xgb_model_full_data = xgboost.fit(X, y)
print(datetime.now(), 'lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)