In [227]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 100
%matplotlib inline 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [228]:
# Load the prices (later used as the regression target) and the two
# semicolon-separated feature tables.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
train_price = pd.read_pickle("train_price.pkl")
train_data, test_data = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('train_data.csv', 'test_data.csv')
)

In [229]:
# Keep untouched snapshots of both tables: the working frames lose columns below,
# and the submission cell still needs 'Id' from test_data_1.
train_data_1, test_data_1 = train_data.copy(), test_data.copy()

In [230]:
# Columns removed from both tables before modelling (listed once, applied twice).
columns_to_drop = [
    'Id',
    'Ecoligy_2_A', 'Ecoligy_2_B',
    'Ecoligy_3_A', 'Ecoligy_3_B',
    'Shops_2_A', 'Shops_2_B',
]
# Column order shared by the training and test frames.
neworder = [
    'Square', 'Social_2', 'Social_3', 'Social_1', 'Rooms',
    'DistrictId', 'Ecology_1', 'LifeSquare', 'HouseYear', 'Healthcare_1',
    'KitchenSquare', 'HouseFloor', 'Shops_1', 'Floor', 'Helthcare_2',
]

train_data = train_data.drop(columns_to_drop, axis=1)
test_data = test_data.drop(columns_to_drop, axis=1)
# reindex keeps exactly the columns in `neworder`, in that order.
train_data = train_data.reindex(columns=neworder)
test_data = test_data.reindex(columns=neworder)

In [231]:
# Scale every feature to unit variance without centering (with_mean=False).
# The scaler is fitted on the training frame only and then applied to both frames.
scaler = StandardScaler(with_mean=False)
scaler.fit(train_data)
train_data_scaled = pd.DataFrame(scaler.transform(train_data), columns=train_data.columns)
test_data_scaled = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)

In [232]:
# Disabled exploratory check: share of variance explained by the first 10
# principal components of the scaled training features.
#pca = PCA(random_state=42) 
#pca.fit(train_data_scaled)
#pca.explained_variance_ratio_[:10].sum()

In [233]:
# Fit a 10-component PCA on the scaled training features only, then project the
# scaled test features with the same fitted components.
pca = PCA(n_components = 10, random_state=42)
x_train_mc = pca.fit_transform(train_data_scaled)
x_test_mc = pca.transform(test_data_scaled)

In [234]:
# Append every PCA projection column to the scaled training frame as
# 'componenta_1' ... 'componenta_N'. N is taken from the projection array, so
# this follows n_components instead of needing one hand-written line per component.
for component_idx in range(x_train_mc.shape[1]):
    train_data_scaled['componenta_{}'.format(component_idx + 1)] = x_train_mc[:, component_idx]
train_data_scaled.head(2)

Unnamed: 0,Square,Social_2,Social_3,Social_1,Rooms,DistrictId,Ecology_1,LifeSquare,HouseYear,Healthcare_1,KitchenSquare,HouseFloor,Shops_1,Floor,Helthcare_2,componenta_1,componenta_2,componenta_3,componenta_4,componenta_5,componenta_6,componenta_7,componenta_8,componenta_9,componenta_10
0,2.276125,1.990716,0.209814,1.882301,2.460594,0.803021,0.748112,1.655706,106.949278,1.205424,1.455989,1.476933,2.288758,1.338979,0.0,0.854653,-0.556102,0.239288,0.137338,-1.063266,0.01779,-0.371099,0.154269,-1.025431,-0.98566
1,3.115867,2.573005,0.041963,2.623813,3.690891,0.940682,0.000588,2.252176,107.438127,0.321446,1.941319,1.476933,3.329102,1.338979,0.669556,2.050155,1.247193,0.113488,-0.093348,-2.024671,0.429341,-0.313009,0.774797,-0.91918,-0.692464


In [235]:
# Append every PCA projection column to the scaled test frame as
# 'componenta_1' ... 'componenta_N'. N is taken from the projection array, so
# this follows n_components instead of needing one hand-written line per component.
for component_idx in range(x_test_mc.shape[1]):
    test_data_scaled['componenta_{}'.format(component_idx + 1)] = x_test_mc[:, component_idx]
test_data_scaled.head(2)

Unnamed: 0,Square,Social_2,Social_3,Social_1,Rooms,DistrictId,Ecology_1,LifeSquare,HouseYear,Healthcare_1,KitchenSquare,HouseFloor,Shops_1,Floor,Helthcare_2,componenta_1,componenta_2,componenta_3,componenta_4,componenta_5,componenta_6,componenta_7,componenta_8,componenta_9,componenta_10
0,2.366307,0.685868,0.041963,0.627434,2.460594,1.33072,2.606292,1.880084,107.112227,1.205424,1.455989,2.297452,0.0,1.147696,0.0,-1.298379,-0.731296,0.642669,-0.795717,0.646583,-1.201339,0.394351,0.263142,-0.600929,-0.319049
1,3.285671,0.358658,0.125888,0.342236,2.460594,1.697816,0.636693,2.336999,107.38381,1.205424,0.242665,0.984622,0.416138,0.191283,0.0,-1.489073,-0.331203,2.212153,0.305256,-0.508544,0.287521,0.364161,0.382605,0.603449,-0.533417


In [236]:
# Final feature layout: the scaled raw features plus PCA components 3, 5, 6, 7
# and 9. Components not listed here are dropped by the reindex.
neworder_1 = [
    'Square', 'componenta_9', 'LifeSquare', 'componenta_6', 'componenta_7',
    'componenta_5', 'componenta_3', 'HouseYear', 'Rooms', 'HouseFloor',
    'Floor', 'DistrictId', 'Ecology_1', 'Social_2', 'KitchenSquare',
    'Social_3', 'Healthcare_1', 'Social_1', 'Shops_1', 'Helthcare_2',
]
train_data_scaled = train_data_scaled.reindex(neworder_1, axis='columns')
test_data_scaled = test_data_scaled.reindex(neworder_1, axis='columns')

In [237]:
# Sanity check: preview the scaled test frame after the column reorder.
test_data_scaled.head(2)

Unnamed: 0,Square,componenta_9,LifeSquare,componenta_6,componenta_7,componenta_5,componenta_3,HouseYear,Rooms,HouseFloor,Floor,DistrictId,Ecology_1,Social_2,KitchenSquare,Social_3,Healthcare_1,Social_1,Shops_1,Helthcare_2
0,2.366307,-0.600929,1.880084,-1.201339,0.394351,0.646583,0.642669,107.112227,2.460594,2.297452,1.147696,1.33072,2.606292,0.685868,1.455989,0.041963,1.205424,0.627434,0.0,0.0
1,3.285671,0.603449,2.336999,0.287521,0.364161,-0.508544,2.212153,107.38381,2.460594,0.984622,0.191283,1.697816,0.636693,0.358658,0.242665,0.125888,1.205424,0.342236,0.416138,0.0


In [238]:
# Exclude 'Healthcare_1' from both scaled frames (rebinding instead of mutating
# in place).
train_data_scaled = train_data_scaled.drop(['Healthcare_1'], axis=1)
test_data_scaled = test_data_scaled.drop(['Healthcare_1'], axis=1)

In [239]:
# Sanity check: preview the scaled training frame after 'Healthcare_1' was dropped.
train_data_scaled.head(2)

Unnamed: 0,Square,componenta_9,LifeSquare,componenta_6,componenta_7,componenta_5,componenta_3,HouseYear,Rooms,HouseFloor,Floor,DistrictId,Ecology_1,Social_2,KitchenSquare,Social_3,Social_1,Shops_1,Helthcare_2
0,2.276125,-1.025431,1.655706,0.01779,-0.371099,-1.063266,0.239288,106.949278,2.460594,1.476933,1.338979,0.803021,0.748112,1.990716,1.455989,0.209814,1.882301,2.288758,0.0
1,3.115867,-0.91918,2.252176,0.429341,-0.313009,-2.024671,0.113488,107.438127,3.690891,1.476933,1.338979,0.940682,0.000588,2.573005,1.941319,0.041963,2.623813,3.329102,0.669556


In [240]:
# Disabled exploratory check: share of variance explained by the first 2
# principal components.
# NOTE(review): train_data_scaled_var3 is not defined anywhere in the visible
# notebook, so this would fail if re-enabled as is.
#pca = PCA(random_state=42) 
#pca.fit(train_data_scaled_var3)
#pca.explained_variance_ratio_[:2].sum()

In [241]:
# Build the final model inputs from the scaled frames, which already hold the
# selected PCA component columns. train_data/test_data are rebound here, so
# nothing needs to be written into the old frames first.
#
# 'Healthcare_1' is still listed although it was already dropped from the scaled
# frames, so reindex re-creates it as an all-NaN column; the next cell drops it.
neworder_1 = ['Square','componenta_9', 'LifeSquare','componenta_6', 'componenta_7', 'componenta_5', 'componenta_3', 'HouseYear', 'Rooms', 'HouseFloor', 'Floor','DistrictId', 'Ecology_1', 'Social_2', 'KitchenSquare', 'Social_3', 'Healthcare_1', 'Social_1', 'Shops_1', 'Helthcare_2']
train_data = train_data_scaled.reindex(columns=neworder_1)
test_data = test_data_scaled.reindex(columns=neworder_1)

In [242]:
# Remove 'Healthcare_1' from the final model inputs (rebinding instead of
# mutating in place).
train_data = train_data.drop(['Healthcare_1'], axis=1)
test_data = test_data.drop(['Healthcare_1'], axis=1)

In [243]:
# Sanity check: preview the final training inputs.
train_data.head(2)

Unnamed: 0,Square,componenta_9,LifeSquare,componenta_6,componenta_7,componenta_5,componenta_3,HouseYear,Rooms,HouseFloor,Floor,DistrictId,Ecology_1,Social_2,KitchenSquare,Social_3,Social_1,Shops_1,Helthcare_2
0,2.276125,-1.025431,1.655706,0.01779,-0.371099,-1.063266,0.239288,106.949278,2.460594,1.476933,1.338979,0.803021,0.748112,1.990716,1.455989,0.209814,1.882301,2.288758,0.0
1,3.115867,-0.91918,2.252176,0.429341,-0.313009,-2.024671,0.113488,107.438127,3.690891,1.476933,1.338979,0.940682,0.000588,2.573005,1.941319,0.041963,2.623813,3.329102,0.669556


In [244]:
# Preview of the scaled frame for comparison with train_data; train_data was
# rebuilt from train_data_scaled, so both previews show the same values.
train_data_scaled.head(2)

Unnamed: 0,Square,componenta_9,LifeSquare,componenta_6,componenta_7,componenta_5,componenta_3,HouseYear,Rooms,HouseFloor,Floor,DistrictId,Ecology_1,Social_2,KitchenSquare,Social_3,Social_1,Shops_1,Helthcare_2
0,2.276125,-1.025431,1.655706,0.01779,-0.371099,-1.063266,0.239288,106.949278,2.460594,1.476933,1.338979,0.803021,0.748112,1.990716,1.455989,0.209814,1.882301,2.288758,0.0
1,3.115867,-0.91918,2.252176,0.429341,-0.313009,-2.024671,0.113488,107.438127,3.690891,1.476933,1.338979,0.940682,0.000588,2.573005,1.941319,0.041963,2.623813,3.329102,0.669556


In [245]:
# Hold out 30% of the labelled rows for validation. random_state is fixed (42,
# matching the PCA and the estimators) so the split and the reported R^2 are
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, train_price, test_size=0.30, random_state=42
)

In [246]:
# Three tree-based regressors, each seeded with random_state=42, combined by a
# VotingRegressor.
rfr = RandomForestRegressor(
    n_estimators=1000,
    max_depth=18,
    min_samples_leaf=2,
    random_state=42,
)
boost = GradientBoostingRegressor(
    n_estimators=600,
    max_depth=4,
    min_samples_split=2,
    learning_rate=0.1,
    loss='ls',
    random_state=42,
)
# NOTE(review): min_samples_leaf is not a named LGBMRegressor argument; presumably
# it is forwarded to LightGBM as an alias of min_data_in_leaf. Confirm it is not
# shadowed by the wrapper's own min_child_samples default.
lgb = LGBMRegressor(
    max_depth=30,
    min_samples_leaf=10,
    n_estimators=1000,
    random_state=42,
)
voting = VotingRegressor([('rfr', rfr), ('boost', boost), ('lgb', lgb)])

In [247]:
# The individual fits are disabled; only the ensemble is trained on the
# training split.
#lgb.fit(x_train, y_train)
#rfr.fit(x_train, y_train)
#boost.fit(x_train, y_train)
voting.fit(x_train, y_train)

VotingRegressor(estimators=[('rfr',
                             RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=18,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=2,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=1000,
                                                   n_jobs=None, oob_score=False,
                                                   random_state=42, verbose=0,
            

In [248]:
# Predict prices for the held-out validation rows.
y_pred = voting.predict(x_test)

In [249]:
# Put the held-out targets next to the ensemble predictions for inspection.
check_test = pd.DataFrame(dict(y_test=y_test, y_pred=y_pred.flatten()))
check_test.head(2)

Unnamed: 0,y_test,y_pred
1765,96085.133279,215207.924395
3021,179619.355451,210909.876124


In [250]:
# Coefficient of determination of the ensemble on the held-out rows.
r2 = r2_score(y_true=check_test["y_test"], y_pred=check_test["y_pred"])
r2

0.7542834644324774

In [251]:
# Disabled: normalised feature importances of `boost`, sorted in descending order.
# Re-enabling this needs `boost` to be fitted on its own first (its fit call is
# also commented out above).
#feature_importances = pd.DataFrame(zip(x_train.columns, boost.feature_importances_ / boost.feature_importances_.sum()), columns=['feature_name', 'importance'])
#feature_importances.sort_values(by='importance', ascending=False, inplace=True)

In [252]:
# Disabled: horizontal bar chart of the feature importances.
# NOTE(review): the legend label says 'LGB', but the disabled importance cell
# reads them from `boost` (GradientBoostingRegressor); fix the label if re-enabled.
#plt.figure(figsize = (16, 8)) 
#plt.barh(y = feature_importances['feature_name'], width = feature_importances['importance'], alpha = 0.5, label = 'LGB')
#plt.legend() 
#plt.show()

In [253]:
# Predict the test set and pair each prediction with its original 'Id', taken
# from the untouched snapshot because the working test frame no longer has it.
y_pred_test = voting.predict(test_data)
submission = test_data_1[['Id']].assign(Price=y_pred_test)
submission.head(2)

Unnamed: 0,Id,Price
0,725,164986.778871
1,15856,213042.702226


In [254]:
# Write the Id/Price pairs to disk without the DataFrame index column.
submission.to_csv('submission_1v1.csv', index=False)