## Import important libraries

In [55]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import os
import warnings
from scipy import stats
from sklearn.cluster import KMeans
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_selection import mutual_info_regression
from xgboost import XGBRegressor
from sklearn.decomposition import PCA
from category_encoders import MEstimateEncoder
import optuna
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

## Dataset information

In [None]:
# Relative locations of the two input CSV files.
train_path = '../data/train.csv'
test_path = '../data/test.csv'

# Read both files, using the 'Id' column as the row index.
train = pd.read_csv(train_path, index_col='Id')
test = pd.read_csv(test_path, index_col='Id')

# Preview the first rows of each frame and report its (rows, columns) shape.
print(train.head())
print('Train data contains {} rows and {} features'.format(*train.shape))

print(test.head())
print('Test data contains {} rows and {} features'.format(*test.shape))

### Missing values

In [None]:
# Stack the train and test rows so the missing-value rates cover both frames.
all_data = pd.concat([train, test])
all_data.tail(10)

# Visualize the percentage of missing values per feature:
# the mean of the boolean null mask is the fraction of missing rows per column.
missing_values = (all_data.isnull().mean() * 100).to_frame('Percentage')
# Keep only the columns that have at least one gap, largest share first.
missing_values = missing_values.loc[missing_values['Percentage'] > 0].sort_values(
    by='Percentage',
    ascending=False)
# The target row is removed from the report
# (presumably it is only "missing" because the test rows carry no SalePrice).
missing_values = missing_values.drop(index='SalePrice')
missing_values.head(10)

### Categorical and continuous columns

In [None]:
# Partition the training columns by dtype: int64/float64 vs. object.
continuous_features = list(train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(train.select_dtypes(include=['object']).columns)
# SalePrice is numeric but is excluded from the feature list.
continuous_features.remove('SalePrice')
print('There are {} continuous features and {} categorical features'.format(
    len(continuous_features),
    len(categorical_features)))
print('Continuous features: ', continuous_features)
print('Categorical features: ', categorical_features)

In [None]:
# Pairwise correlations between the numeric columns of the training frame.
train_corr = train.select_dtypes(include=[np.number])
corr = train_corr.corr()
# Rank the columns by absolute correlation with SalePrice and keep those
# above 0.5 (SalePrice itself is part of that selection).
corr_sorted = corr['SalePrice'].abs().sort_values(ascending=False)
corr_high = corr_sorted.loc[corr_sorted > 0.5].index
# Restrict the matrix to the selected columns on both axes.
corr = corr.loc[corr_high, corr_high]
# Draw the annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(corr, square=True, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

## Baseline
* preserve all the features
* fill NA with median value
* single XGBoost model

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Reload the raw CSVs with 'Id' as a regular column.
# Both files are read from the same '../data' directory the outputs are written to.
train = pd.read_csv('../data/train.csv')
test  = pd.read_csv('../data/test.csv')

test_id = test['Id']

# 'Id' is dropped from train only; test keeps it as a column so the
# identifiers are still available in test_pro.csv.
train.drop(['Id'], axis=1, inplace=True)

continuous_columns = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
discret_columns = train.select_dtypes(include=['object']).columns.tolist()

# NOTE(review): only the object columns are imputed here; numeric NaNs are
# left as-is although the baseline notes mention a median fill -- confirm.
for col in discret_columns:
    # Assign the result back instead of a chained `inplace=True` fill, which
    # may silently operate on a temporary copy in recent pandas versions.
    train[col] = train[col].fillna(-1)
    test[col] = test[col].fillna(-1)
    # Categories seen in either frame. Sorting (by string form, because the
    # -1 sentinel is an int among strings) makes the integer codes
    # reproducible across interpreter sessions.
    res = sorted(set(train[col]) | set(test[col]), key=str)
    mapping = {category: code for code, category in enumerate(res)}
    print(f'{col}:{mapping}')
    train[col] = train[col].map(mapping)
    test[col] = test[col].map(mapping)

train.to_csv('../data/train_pro.csv',index=False)
test.to_csv('../data/test_pro.csv',index=False)


## 划分训练集和验证集

In [61]:
def split_train_val(X, y, ratio=0.8, seed=42):
    """
    Shuffle the samples and split them into a training and a validation part.

    The shuffle uses a private ``RandomState`` seeded with ``seed``, so the
    global NumPy random state is left untouched.

    :param X: features, array-like with one row per sample
    :param y: labels, array-like with one entry per sample (cast to ``int``)
    :param ratio: fraction of the samples assigned to the training part, in [0, 1]
    :param seed: seed for the shuffle
    :return: X_train, y_train, X_val, y_val as NumPy arrays
    :raises ValueError: if ``ratio`` is outside [0, 1] or X and y differ in length
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f'ratio must be within [0, 1], got {ratio!r}')

    # Convert up front so lists and DataFrames can be indexed positionally.
    X = np.asarray(X)
    y = np.asarray(y).astype(int)
    n = X.shape[0]
    if y.shape[0] != n:
        raise ValueError(f'X has {n} samples but y has {y.shape[0]}')

    # A dedicated legacy generator yields the same permutation as
    # np.random.seed(seed) + np.random.shuffle, without the global side effect.
    rng = np.random.RandomState(seed)
    indices = np.arange(n)
    rng.shuffle(indices)
    X, y = X[indices], y[indices]

    split = int(n * ratio)
    X_train, y_train = X[:split], y[:split]
    X_val, y_val = X[split:], y[split:]
    return X_train, y_train, X_val, y_val


### 训练

In [None]:
# Split settings passed to split_train_val.
config={
    'ratio':0.8,
    'seed':42
}
df = pd.read_csv('../data/train_pro.csv')
data = df.values
n = data.shape[0]
# Assumes the target is the last column of train_pro.csv and every other
# column is a feature -- TODO confirm against the raw CSV layout.
X = data[:,:-1]
y = data[:,-1]

# Index the config directly so a missing key fails loudly instead of
# passing None into the split.
X_train, y_train, X_val, y_val = split_train_val(X, y, config['ratio'], seed=config['seed'])
#choose the best model
# n_jobs / random_state are the scikit-learn wrapper's documented names for
# the thread count and the seed.
# NOTE(review): scale_pos_weight is documented as a class-imbalance weight;
# confirm it is intended for this regressor.
xgb_model = XGBRegressor(
    learning_rate=0.015,
    n_estimators=4750,
    max_depth=3,
    min_child_weight=0,
    subsample=0.7,
    colsample_bytree=0.4064,
    n_jobs=-1,
    scale_pos_weight=2,
    random_state=42,
)
xgb_model.fit(X_train,y_train, eval_set = [(X_val,y_val)])


df_test = pd.read_csv('../data/test_pro.csv')
data_test = df_test.values
# Assumes 'Id' is the first column of test_pro.csv; the rest are features.
X_test = data_test[:,1:]
pred = xgb_model.predict(X_test)
pred_train = xgb_model.predict(X_train)
submission = pd.DataFrame({
    'Id': data_test[:,0].astype(int),
    'SalePrice': pred
})

# Save the submission file, creating the output directory first so a fresh
# checkout does not fail at the very last step.
os.makedirs('../res', exist_ok=True)
submission.to_csv('../res/submission.csv', index=False)

### 测试并提交

In [63]:

# Score the in-memory `test` frame with the fitted model.
data_test = test.values
# Assumes 'Id' is the first column of `test`; the rest are features.
X_test = data_test[:,1:]
pred = xgb_model.predict(X_test)
pred_train = xgb_model.predict(X_train)
submission = pd.DataFrame({
    'Id': data_test[:,0].astype(int),
    'SalePrice': pred
})

# Save the submission file, creating the output directory if it is missing.
os.makedirs('../res', exist_ok=True)
submission.to_csv('../res/submission_baseline.csv', index=False)