In [7]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

train_data = pd.read_csv('C:/Users/HUAWEI/ML/task_4/train.csv')
test_data = pd.read_csv('C:/Users/HUAWEI/ML/task_4/test.csv')

train_data = train_data.drop(columns=["Id"])
val_ids = test_data["Id"]
test_data = test_data.drop(columns=["Id"])

for feature in train_data.columns:
    percent = train_data[feature].isnull().sum() / train_data.shape[0]
    if percent > 0.7:
        train_data = train_data.drop(columns=feature)
        test_data = test_data.drop(columns=feature)

for feature in train_data.columns[:-1]:
    if train_data[feature].dtype == 'object':
        train_data[feature] = LabelEncoder().fit_transform(train_data[feature])
        test_data[feature] = LabelEncoder().fit_transform(test_data[feature])

X_train, X_test, y_train, y_test = train_test_split(train_data.drop(columns='SalePrice').values,
                                                    np.log1p(train_data['SalePrice'].values), test_size=0.2,
                                                    random_state=98987)

X_train = SimpleImputer(strategy='most_frequent').fit_transform(X_train)
X_test = SimpleImputer(strategy='most_frequent').fit_transform(X_test)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor

parameters = { 'criterion': ['squared_error', 'absolute_error', 'poisson'],'n_estimators': [10, 50, 75, 100],'max_features': [1.0, 'sqrt', 'log2'],'min_samples_split': [2, 5, 9], 'max_depth': [50, 100, 150, 200, 250], }
rfr_model = GridSearchCV(RandomForestRegressor(), parameters)
rfr_model.fit(X_train, y_train)
print("Best parameters for RFR is: {}".format(rfr_model.best_params_))

Best parameters for RFR is: {'criterion': 'squared_error', 'max_depth': 150, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}


In [11]:
rfr = RandomForestRegressor(criterion='squared_error', max_depth=150, max_features='sqrt', min_samples_split=2, n_estimators=100)
rfr.fit(X_train, y_train)
print('Error on train: ', metrics.mean_absolute_error(rfr.predict(X_train), y_train))
print('Error on test: ', metrics.mean_absolute_error(rfr.predict(X_test), y_test))

Error on train:  0.033981858556500016
Error on test:  0.10478053237521023


# xgboost

In [3]:
import xgboost as xgb

parameters = {"learning_rate": [0.1, 0.2, 0.3],"max_depth": [2, 3, 4, 5, 6],"min_child_weight": [1, 1.5, 2, 4],"n_estimators": [100, 300, 500, 700, 1000]}
xgb_model = GridSearchCV(xgb.XGBRegressor(), parameters)
xgb_model.fit(X_train, y_train)
print("Best parameters for XGB is: {}".format(xgb_model.best_params_))

Best parameters for XGB is: {'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 300}


In [12]:
xgbr = xgb.XGBRegressor(learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=300)
xgbr.fit(X_train, y_train)
print('Error on train: ', metrics.mean_absolute_error(xgbr.predict(X_train), y_train))
print('Error on test: ', metrics.mean_absolute_error(xgbr.predict(X_test), y_test))

Error on train:  0.039256612684273856
Error on test:  0.0935616141941667


# lightgbm

In [14]:
import lightgbm as lgbm

parameters = {'num_leaves': [16, 32, 64, 128],'learning_rate': [0.005, 0.01, 0.02],'max_depth': [2, 4, 6, 8],'n_estimators': [500, 1000, 2000],'max_bins': [50, 100, 150], }
lgb_model = GridSearchCV(lgbm.LGBMRegressor(), parameters)
lgb_model.fit(X_train, y_train)
print("Best parameters for LGBM is: {}".format(lgb_model.best_params_))

Best parameters for LGBM is: {'learning_rate': 0.02, 'max_bins': 50, 'max_depth': 2, 'n_estimators': 2000, 'num_leaves': 16}


In [15]:
lgbmr = lgbm.LGBMRegressor(num_leaves=16, learning_rate=0.02, max_depth=2, n_estimators=2000, max_bins=50)
lgbmr.fit(X_train, y_train)
print('Error on train: ', metrics.mean_absolute_error(lgbmr.predict(X_train), y_train))
print('Error on test: ', metrics.mean_absolute_error(lgbmr.predict(X_test), y_test))

Error on train:  0.05602769974002308
Error on test:  0.09109983456299534
