In [7]:
import numpy as np
import pandas as pd
import joblib

In [8]:
bedroom_median = 3.0
bathroom_median = 2.0
built_year_median = 2014.0
no_of_units_median = 366.0

In [9]:
# cut_bins: [0.000e+00, 5.600e+01, 1.090e+02, 1.680e+02, 2.660e+02, 3.660e+02, 4.600e+02, 5.640e+02, 6.960e+02, 1.042e+03, 1.000e+07]
# [28.5, 82.5, 138.5, 217.0, 316.0, 413.0, 512.0, 630.0, 869.0, 1622.5]

cut_bins = [0.000e+00, 5.600e+01, 1.090e+02, 1.680e+02, 2.660e+02, 3.660e+02, 4.600e+02, 5.640e+02, 6.960e+02, 1.042e+03, 1.000e+07]
cut_labels = [28.5, 82.5, 138.5, 217.0, 316.0, 413.0, 512.0, 630.0, 869.0, 1622.5]

In [10]:
def convert_test_set(X_test):
    # Data preprocessing for test data set
    
    #transformaing bedrooms, i.e: from 2+1 to 3
    X_test['bedrooms']=X_test['bedrooms'].apply(lambda x: x if type(x)==float else eval(x))
    
    # Deal with NaN data
    X_test['bedrooms'].fillna(value=bedroom_median, inplace=True)
    X_test['bathrooms'].fillna(value=bathroom_median, inplace=True)
    X_test['built_year'].fillna(value=built_year_median, inplace=True)
    X_test['no_of_units'].fillna(value=no_of_units_median, inplace=True)

    # Check column 'tenure', treat houses that are not 'freehold' as 'leasehold', also fill NaN with 'leasehold'
    X_test['tenure'].where(X_test['tenure']=='freehold', other='leasehold', inplace=True)

    # Drop rows whose area_size is NaN
    X_test.dropna(axis=0, subset=['area_size'], inplace=True)
    
    # Convert some string columns to 0-1 verctors
    # Type
    type_mapping = {'apartment': 0 ,'condominium' : 1}
    X_test['type'] = X_test['type'].map(type_mapping)
    # Tenure
    tenure_mapping = {'freehold':0 ,'leasehold' : 1}
    X_test['tenure'] = X_test['tenure'].map(tenure_mapping)
    
    # For 'build_year', use 2022 - original year to get current age of houses
    X_test['built_year'] = 2022 - X_test['built_year']

    # For no_of_units, divide all houses into 10 categories, manually set lowest limit=0 and highest limit=10000
    X_test['no_of_units'] = pd.cut(X_test['no_of_units'], bins=cut_bins, labels=cut_labels, include_lowest=True).values
    
    # Convert region to numerical values
    region_mapping = {'north region': 0,
                     'west region': 1,
                     'central region': 2,
                     'north-east region': 3,
                     'east region': 4}
    X_test['region'] = X_test['region'].map(region_mapping)


    return

In [11]:
# Import test dataset
data_type = {
            'listing_id': int,
            'name': str,
             'street': str,
             'type': str,
             'model': str,
             'market_segment': str,
             'type_of_area': str,
             'bedrooms': str,
             'bathrooms': float,
             'district': int,
             'region': str,
             'planning_area': str,
             'subszone': str,
             'lat': float,
             'lng': float,
             'tenure': str,
             'built_year': float,
             'no_of_units': float,
             'area_size': float,
             'eco_category': str,
             'accessibility': str,
             'date_listed': str
             }

test_set = pd.read_csv("Data/test.csv", dtype=data_type)


# Make a copy
test_set_copy = test_set.copy()

In [12]:
convert_test_set(test_set_copy)
test_set_copy.reset_index(drop=True, inplace=True)
test_set_copy.head(3)

Unnamed: 0,listing_id,name,street,type,model,market_segment,type_of_area,bedrooms,bathrooms,district,...,subszone,lat,lng,tenure,built_year,no_of_units,area_size,eco_category,accessibility,date_listed
0,1487111,leedon green,leedon heights,1,condominium,ocr,strata,2.0,2.0,10,...,farrer court,1.313566,103.803218,0,8.0,630.0,710.0,uncategorized,guarded,2021-12-12
1,6794066,the line @ tanjong rhu,tanjong rhu road,1,condominium,ocr,strata,3.0,2.0,15,...,tanjong rhu,1.298437,103.884408,0,6.0,138.5,1055.0,uncategorized,guarded,2021-10-12
2,4027017,parc elegance,telok kurau road,0,apartment,ocr,strata,3.0,1.0,15,...,frankel,1.317851,103.908905,0,9.0,138.5,463.0,uncategorized,guarded,2021-12-23


In [13]:
# Import auxiliary data
commercial_centres = pd.read_csv("Data/auxiliary-data/sg-commerical-centres.csv")
hawker_centres = pd.read_csv("Data/auxiliary-data/sg-gov-markets-hawker-centres.csv")
primary_schools = pd.read_csv("Data/auxiliary-data/sg-primary-schools.csv")
secondary_schools = pd.read_csv("Data/auxiliary-data/sg-secondary-schools.csv")
shopping_malls = pd.read_csv("Data/auxiliary-data/sg-shopping-malls.csv")
stations = pd.read_csv("Data/auxiliary-data/sg-train-stations.csv")

In [14]:
# Calculate distance in km with lat & lng
def haversine_batch(train_lng, train_lat, place_lng, place_lat):  # lng1，lat1，lng2，lat2
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """

    place_lat = np.radians(place_lat)
    place_lng = np.radians(place_lng)

    train_lat = np.radians(train_lat)
    train_lng = np.radians(train_lng)

    dlat = place_lat - train_lat
    dlng = place_lng - train_lng

    a = np.sin(dlat / 2) ** 2 + np.cos(train_lat) * np.cos(place_lat) * np.sin(dlng / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371 # Radius of the earth in km
    return c * r



def cal_distance(X, is_train: bool):
    # Find distance to nearest facilities for each house
    facilities = [commercial_centres, hawker_centres, primary_schools, secondary_schools, shopping_malls, stations]

    res = []

    for f in facilities:

        distance = np.full((len(X), 1), np.inf)

        for _, row in f.iterrows():
            d = haversine_batch(X['lng'].values, X['lat'].values, row['lng'], row['lat']).reshape(-1, 1)
            distance = np.minimum(distance, d)
        
        res.append(distance)
        
    if is_train:
        res.append(X['price'].values.reshape(-1, 1))
        pd_distance = pd.DataFrame(data=np.squeeze(np.asarray(res)).swapaxes(0, 1),
                              columns=['to_commercial_centres', 'to_hawker_centres', 'to_primary_schools',
                                       'to_secondary_schools', 'to_shopping_malls', 'to_train_stations', 'price'])
    else:
        pd_distance = pd.DataFrame(data=np.squeeze(np.asarray(res)).swapaxes(0, 1),
                          columns=['to_commercial_centres', 'to_hawker_centres', 'to_primary_schools',
                                   'to_secondary_schools', 'to_shopping_malls', 'to_train_stations'])
    return pd_distance

In [15]:
test_distance = cal_distance(test_set_copy, is_train=False)
test_distance.reset_index(drop=True, inplace=True)

In [16]:
# Test set
X_test_without_auxiliary_data = test_set_copy[['type', 'bedrooms', 'bathrooms', 'district', 'region', 'tenure', 'built_year', 'no_of_units', 'area_size']]
X_test_plus_auxiliary_data = pd.concat([X_test_without_auxiliary_data, test_distance], axis=1)
X_test_plus_auxiliary_data.head(3)

Unnamed: 0,type,bedrooms,bathrooms,district,region,tenure,built_year,no_of_units,area_size,to_commercial_centres,to_hawker_centres,to_primary_schools,to_secondary_schools,to_shopping_malls,to_train_stations
0,1,2.0,2.0,10,2,0,8.0,630.0,710.0,1.659712,0.410398,0.980559,1.048619,1.128913,0.65427
1,1,3.0,2.0,15,2,0,6.0,138.5,1055.0,2.415213,0.446516,1.469982,0.249159,0.969806,0.888254
2,0,3.0,1.0,15,4,0,9.0,138.5,463.0,1.712861,0.587265,0.893951,1.34346,1.475109,0.572349


In [17]:
# For area and bathrooms model
X_test = X_test_plus_auxiliary_data.drop(['region'],axis=1)
X_test.head(5)

Unnamed: 0,type,bedrooms,bathrooms,district,tenure,built_year,no_of_units,area_size,to_commercial_centres,to_hawker_centres,to_primary_schools,to_secondary_schools,to_shopping_malls,to_train_stations
0,1,2.0,2.0,10,0,8.0,630.0,710.0,1.659712,0.410398,0.980559,1.048619,1.128913,0.65427
1,1,3.0,2.0,15,0,6.0,138.5,1055.0,2.415213,0.446516,1.469982,0.249159,0.969806,0.888254
2,0,3.0,1.0,15,0,9.0,138.5,463.0,1.712861,0.587265,0.893951,1.34346,1.475109,0.572349
3,1,2.0,1.0,17,1,92.0,413.0,645.0,1.862085,2.334125,0.709727,0.528898,2.319693,1.249331
4,1,4.0,6.0,1,1,8.0,316.0,6200.0,1.179363,0.327226,1.075392,1.680849,0.394375,0.409125


In [18]:
# For region model
X_test_region = X_test_plus_auxiliary_data.drop(['district'],axis=1)
X_test_region.head(5)

Unnamed: 0,type,bedrooms,bathrooms,region,tenure,built_year,no_of_units,area_size,to_commercial_centres,to_hawker_centres,to_primary_schools,to_secondary_schools,to_shopping_malls,to_train_stations
0,1,2.0,2.0,2,0,8.0,630.0,710.0,1.659712,0.410398,0.980559,1.048619,1.128913,0.65427
1,1,3.0,2.0,2,0,6.0,138.5,1055.0,2.415213,0.446516,1.469982,0.249159,0.969806,0.888254
2,0,3.0,1.0,4,0,9.0,138.5,463.0,1.712861,0.587265,0.893951,1.34346,1.475109,0.572349
3,1,2.0,1.0,4,1,92.0,413.0,645.0,1.862085,2.334125,0.709727,0.528898,2.319693,1.249331
4,1,4.0,6.0,2,1,8.0,316.0,6200.0,1.179363,0.327226,1.075392,1.680849,0.394375,0.409125


## Get level 1 results

### 1. Area models

In [19]:
model_root_path = "Models/Area_models/gbdt"

area_model_0 = joblib.load(model_root_path + "_area_0.pkl")
area_model_1 = joblib.load(model_root_path + "_area_1.pkl")
area_model_2 = joblib.load(model_root_path + "_area_2.pkl")
area_model_3 = joblib.load(model_root_path + "_area_3.pkl")
area_model_4 = joblib.load(model_root_path + "_area_4.pkl")
area_model_5 = joblib.load(model_root_path + "_area_5.pkl")
area_model_6 = joblib.load(model_root_path + "_area_6.pkl")
area_model_7 = joblib.load(model_root_path + "_area_7.pkl")
area_model_8 = joblib.load(model_root_path + "_area_8.pkl")
area_model_9 = joblib.load(model_root_path + "_area_9.pkl")
area_model_10 = joblib.load(model_root_path + "_area_10.pkl")
area_model_11 = joblib.load(model_root_path + "_area_11.pkl")
area_model_12 = joblib.load(model_root_path + "_area_12.pkl")


Y_area_gbdt_pred = np.zeros(shape=(len(X_test),))
area_split_points = [0, 500, 750, 1000, 1200, 1500, 2000, 2500, 3000, 4000, 5000, 7500, 10000]
area_models = [area_model_0, area_model_1, area_model_2, area_model_3, area_model_4,
               area_model_5, area_model_6, area_model_7, area_model_8, area_model_9,
               area_model_10, area_model_11, area_model_12]

for i in range(len(X_test)):
    sample_input = X_test.iloc[i,:]
    sample_area = sample_input['area_size']
    sample_input = sample_input.values.reshape(1, -1)
    
    isFound = False
    for j in range(len(area_models)-2):
        if area_split_points[j] <= sample_area < area_split_points[j+1]:
            Y_area_gbdt_pred[i] = area_models[j].predict(sample_input)
            isFound = True
            break
    if not isFound:
        Y_area_gbdt_pred[i] = area_models[-1].predict(sample_input)


Y_area_gbdt_pred = Y_area_gbdt_pred.reshape(-1,1)
print(Y_area_gbdt_pred)

array([[1970426.28067219],
       [2561129.34527537],
       [ 782221.56383361],
       ...,
       [1680584.34357039],
       [ 858353.75069637],
       [8217734.69071368]])

In [20]:
model_root_path = "Models/Area_models/xgboost"

area_model_0 = joblib.load(model_root_path + "_area_0.pkl")
area_model_1 = joblib.load(model_root_path + "_area_1.pkl")
area_model_2 = joblib.load(model_root_path + "_area_2.pkl")
area_model_3 = joblib.load(model_root_path + "_area_3.pkl")
area_model_4 = joblib.load(model_root_path + "_area_4.pkl")
area_model_5 = joblib.load(model_root_path + "_area_5.pkl")
area_model_6 = joblib.load(model_root_path + "_area_6.pkl")
area_model_7 = joblib.load(model_root_path + "_area_7.pkl")
area_model_8 = joblib.load(model_root_path + "_area_8.pkl")
area_model_9 = joblib.load(model_root_path + "_area_9.pkl")
area_model_10 = joblib.load(model_root_path + "_area_10.pkl")
area_model_11 = joblib.load(model_root_path + "_area_11.pkl")
area_model_12 = joblib.load(model_root_path + "_area_12.pkl")


Y_area_xgboost_pred = np.zeros(shape=(len(X_test),))
area_split_points = [0, 500, 750, 1000, 1200, 1500, 2000, 2500, 3000, 4000, 5000, 7500, 10000]
area_models = [area_model_0, area_model_1, area_model_2, area_model_3, area_model_4,
               area_model_5, area_model_6, area_model_7, area_model_8, area_model_9,
               area_model_10, area_model_11, area_model_12]

for i in range(len(X_test)):
    sample_input = X_test.iloc[i,:]
    sample_area = sample_input['area_size']
    sample_input = sample_input.values.reshape(1, -1)
    
    isFound = False
    for j in range(len(area_models)-2):
        if area_split_points[j] <= sample_area < area_split_points[j+1]:
            Y_area_xgboost_pred[i] = area_models[j].predict(sample_input)
            isFound = True
            break
    if not isFound:
        Y_area_xgboost_pred[i] = area_models[-1].predict(sample_input)

Y_area_xgboost_pred = Y_area_xgboost_pred.reshape(-1,1)
Y_area_xgboost_pred

array([[2010357.75  ],
       [2570778.5   ],
       [ 781892.1875],
       ...,
       [1634656.25  ],
       [ 869455.5   ],
       [7885923.    ]])

### 2. Region models

In [15]:
# old version
# model_dict = {
#     0: "gbdt",
#     1: "xgboost",
#     2: "lightgbm"

# }

# region_model_index = 1
# model_root_path = "Models/Region_models/" + model_dict[region_model_index]

# region_model_0 = joblib.load(model_root_path + "_region_0.pkl")
# region_model_1 = joblib.load(model_root_path + "_region_1.pkl")
# region_model_2 = joblib.load(model_root_path + "_region_2.pkl")
# region_model_3 = joblib.load(model_root_path + "_region_3.pkl")
# region_model_4 = joblib.load(model_root_path + "_region_4.pkl")

# Y_region_models_pred = np.zeros(shape=(len(X_test_region),))
# region_split_points = [0,1,2,3,4]
# region_models = [region_model_0, region_model_1, region_model_2, region_model_3, region_model_4]

# for i in range(len(X_test_region)):
#     sample_input = X_test_region.iloc[i,:]
#     sample_region = sample_input['region']
#     sample_input = sample_input.values.reshape(1, -1)
    
    
#     isFound = False
#     for j in range(len(region_models)-2):
#         if region_split_points[j] <= sample_region < region_split_points[j+1]:
#             Y_region_models_pred[i] = region_models[j].predict(sample_input)
#             isFound = True
#             break
#     if not isFound:
#         Y_region_models_pred[i] = region_models[-1].predict(sample_input)

# Y_region_models_pred = Y_region_models_pred.reshape(-1,1)
# Y_region_models_pred

In [21]:
model_root_path = "Models/Region_models/gbdt" 

region_model_0 = joblib.load(model_root_path + "_region_0.pkl")
region_model_1 = joblib.load(model_root_path + "_region_1.pkl")
region_model_2 = joblib.load(model_root_path + "_region_2.pkl")
region_model_3 = joblib.load(model_root_path + "_region_3.pkl")
region_model_4 = joblib.load(model_root_path + "_region_4.pkl")

Y_region_gbdt_pred = np.zeros(shape=(len(X_test_region),))
region_split_points = [0,1,2,3,4]
region_models = [region_model_0, region_model_1, region_model_2, region_model_3, region_model_4]

for i in range(len(X_test_region)):
    sample_input = X_test_region.iloc[i,:]
    sample_region = sample_input['region']
    sample_input = sample_input.values.reshape(1, -1)
    
    
    isFound = False
    for j in range(len(region_models)-2):
        if region_split_points[j] <= sample_region < region_split_points[j+1]:
            Y_region_gbdt_pred[i] = region_models[j].predict(sample_input)
            isFound = True
            break
    if not isFound:
        Y_region_gbdt_pred[i] = region_models[-1].predict(sample_input)

Y_region_gbdt_pred = Y_region_gbdt_pred.reshape(-1,1)
Y_region_gbdt_pred

array([[1151276.15622788],
       [2161178.9297561 ],
       [ 910699.51275227],
       ...,
       [1401468.51601926],
       [ 976814.54930105],
       [3297987.24679598]])

In [22]:
model_root_path = "Models/Region_models/xgboost" 

region_model_0 = joblib.load(model_root_path + "_region_0.pkl")
region_model_1 = joblib.load(model_root_path + "_region_1.pkl")
region_model_2 = joblib.load(model_root_path + "_region_2.pkl")
region_model_3 = joblib.load(model_root_path + "_region_3.pkl")
region_model_4 = joblib.load(model_root_path + "_region_4.pkl")

Y_region_xgboost_pred = np.zeros(shape=(len(X_test_region),))
region_split_points = [0,1,2,3,4]
region_models = [region_model_0, region_model_1, region_model_2, region_model_3, region_model_4]

for i in range(len(X_test_region)):
    sample_input = X_test_region.iloc[i,:]
    sample_region = sample_input['region']
    sample_input = sample_input.values.reshape(1, -1)
    
    
    isFound = False
    for j in range(len(region_models)-2):
        if region_split_points[j] <= sample_region < region_split_points[j+1]:
            Y_region_xgboost_pred[i] = region_models[j].predict(sample_input)
            isFound = True
            break
    if not isFound:
        Y_region_xgboost_pred[i] = region_models[-1].predict(sample_input)

Y_region_xgboost_pred = Y_region_xgboost_pred.reshape(-1,1)
Y_region_xgboost_pred

array([[1240808.625 ],
       [1960900.    ],
       [1358296.    ],
       ...,
       [1559363.    ],
       [1010178.0625],
       [2868360.25  ]])

### 3. Bathroom models

In [23]:
model_root_path = "Models/Bathrooms_models/gbdt"

bathrooms_model_0 = joblib.load(model_root_path + "_bathrooms_0.pkl")
bathrooms_model_1 = joblib.load(model_root_path + "_bathrooms_1.pkl")
bathrooms_model_2 = joblib.load(model_root_path + "_bathrooms_2.pkl")
bathrooms_model_3 = joblib.load(model_root_path + "_bathrooms_3.pkl")
bathrooms_model_4 = joblib.load(model_root_path + "_bathrooms_4.pkl")

Y_bathroom_gbdt_pred = np.zeros(shape=(len(X_test),))
bathrooms_split_points = [1, 2, 3, 4, 5]
bathroom_models = [bathrooms_model_0, bathrooms_model_1, bathrooms_model_2, bathrooms_model_3, bathrooms_model_4]

for i in range(len(X_test)):
    sample_input = X_test.iloc[i,:]
    sample_bathroom = sample_input['bathrooms']
    sample_input = sample_input.values.reshape(1, -1)
    
    
    isFound = False
    for j in range(len(bathroom_models)-2):
        if bathrooms_split_points[j] <= sample_bathroom < bathrooms_split_points[j+1]:
            Y_bathroom_gbdt_pred[i] = bathroom_models[j].predict(sample_input)
            isFound = True
            break
    if not isFound:
        Y_bathroom_gbdt_pred[i] = bathroom_models[-1].predict(sample_input)

Y_bathroom_gbdt_pred = Y_bathroom_gbdt_pred.reshape(-1,1)
Y_bathroom_gbdt_pred

array([[2025950.98866798],
       [2580816.85716467],
       [ 794912.42614716],
       ...,
       [1686846.13013011],
       [ 934844.16457352],
       [6174863.48170608]])

In [24]:
model_root_path = "Models/Bathrooms_models/xgboost"

bathrooms_model_0 = joblib.load(model_root_path + "_bathrooms_0.pkl")
bathrooms_model_1 = joblib.load(model_root_path + "_bathrooms_1.pkl")
bathrooms_model_2 = joblib.load(model_root_path + "_bathrooms_2.pkl")
bathrooms_model_3 = joblib.load(model_root_path + "_bathrooms_3.pkl")
bathrooms_model_4 = joblib.load(model_root_path + "_bathrooms_4.pkl")

Y_bathroom_xgboost_pred = np.zeros(shape=(len(X_test),))
bathrooms_split_points = [1, 2, 3, 4, 5]
bathroom_models = [bathrooms_model_0, bathrooms_model_1, bathrooms_model_2, bathrooms_model_3, bathrooms_model_4]

for i in range(len(X_test)):
    sample_input = X_test.iloc[i,:]
    sample_bathroom = sample_input['bathrooms']
    sample_input = sample_input.values.reshape(1, -1)
    
    
    isFound = False
    for j in range(len(bathroom_models)-2):
        if bathrooms_split_points[j] <= sample_bathroom < bathrooms_split_points[j+1]:
            Y_bathroom_xgboost_pred[i] = bathroom_models[j].predict(sample_input)
            isFound = True
            break
    if not isFound:
        Y_bathroom_xgboost_pred[i] = bathroom_models[-1].predict(sample_input)

Y_bathroom_xgboost_pred = Y_bathroom_xgboost_pred.reshape(-1,1)
Y_bathroom_xgboost_pred

array([[2004229.375 ],
       [2590853.25  ],
       [ 762398.75  ],
       ...,
       [1681273.25  ],
       [ 969276.1875],
       [6863993.    ]])

## Get base models' results 

In [26]:
# Combine all results together to form the new trainining set
base_models_results = np.hstack([Y_area_gbdt_pred, Y_area_xgboost_pred, 
                                 Y_region_gbdt_pred, Y_region_xgboost_pred,
                                 Y_bathroom_gbdt_pred, Y_bathroom_xgboost_pred])
base_models_results

array([[1970426.28067219, 2010357.75      , 1151276.15622788,
        1240808.625     , 2025950.98866798, 2004229.375     ],
       [2561129.34527537, 2570778.5       , 2161178.9297561 ,
        1960900.        , 2580816.85716467, 2590853.25      ],
       [ 782221.56383361,  781892.1875    ,  910699.51275227,
        1358296.        ,  794912.42614716,  762398.75      ],
       ...,
       [1680584.34357039, 1634656.25      , 1401468.51601926,
        1559363.        , 1686846.13013011, 1681273.25      ],
       [ 858353.75069637,  869455.5       ,  976814.54930105,
        1010178.0625    ,  934844.16457352,  969276.1875    ],
       [8217734.69071368, 7885923.        , 3297987.24679598,
        2868360.25      , 6174863.48170608, 6863993.        ]])

### Linear Regression Fusion

In [20]:
lr_model = joblib.load("Models/LinearRegression/LR_area0_region1_bathroom1.pkl")

In [27]:
kaggle_Y_pred = lr_model.predict(base_models_results).ravel()

In [28]:
kaggle_Y_pred

array([2006437.77344695, 2582960.196597  ,  781703.31887252, ...,
       1685441.46864613,  977899.15396459, 7107815.60094394])

In [29]:
pred_df = pd.Series(kaggle_Y_pred, name='Predicted') 

# Save the results
pred_df.to_csv('Results/LR_area0_region1_bathroom1_01.csv', header=['Predicted'], index_label=['Id'])

## Weight Fusion

In [27]:
base_models_results

array([[1970426.28067219, 2010357.75      , 1151276.15622788,
        1240808.625     , 2025950.98866798, 2004229.375     ],
       [2561129.34527537, 2570778.5       , 2161178.9297561 ,
        1960900.        , 2580816.85716467, 2590853.25      ],
       [ 782221.56383361,  781892.1875    ,  910699.51275227,
        1358296.        ,  794912.42614716,  762398.75      ],
       ...,
       [1680584.34357039, 1634656.25      , 1401468.51601926,
        1559363.        , 1686846.13013011, 1681273.25      ],
       [ 858353.75069637,  869455.5       ,  976814.54930105,
        1010178.0625    ,  934844.16457352,  969276.1875    ],
       [8217734.69071368, 7885923.        , 3297987.24679598,
        2868360.25      , 6174863.48170608, 6863993.        ]])

In [38]:
weigts = np.asarray([0.15, 0.7, 0.15])

In [40]:
kaggle_Y_pred = np.sum(weigts * base_models_results, axis=1)
kaggle_Y_pred

array([2071983.9794859 , 2599304.9392913 ,  789862.81554673, ...,
       1645067.85153556,  891254.78732983, 7329683.31090452])

In [42]:
pred_df = pd.Series(kaggle_Y_pred, name='Predicted') 

# Save the results
pred_df.to_csv('Results/Weights_area0_region1_bathroom1_01.csv', header=['Predicted'], index_label=['Id'])

## Other methods

In [72]:
base_models_results = np.hstack([Y_area_gbdt_pred, Y_area_xgboost_pred, 
                                 Y_region_gbdt_pred, Y_region_xgboost_pred,
                                 Y_bathroom_gbdt_pred, Y_bathroom_xgboost_pred])
base_models_results

array([[2008164.154906  , 1983721.875     , 2034450.23055069,
        2100178.5       , 2025950.98866798, 2004229.375     ],
       [2561129.34527537, 2587605.        , 2599746.68387621,
        2609296.5       , 2580816.85716467, 2590853.25      ],
       [ 809964.06197822,  702074.9375    ,  977851.49261967,
         791440.5625    ,  794912.42614716,  762398.75      ],
       ...,
       [1680584.34357039, 1649925.375     , 1593039.82852439,
        1629698.875     , 1686846.13013011, 1681273.25      ],
       [ 962423.97803218,  989272.5       ,  890887.16012974,
         859285.375     ,  934844.16457352,  969276.1875    ],
       [8481494.07269681, 7226981.        , 7819827.74935557,
        7182657.5       , 6174863.48170608, 6863993.        ]])

In [84]:
kaggle_Y_pred_mean = np.mean(base_models_results, axis=1)
kaggle_Y_pred_mean

array([2037524.00996867, 2587093.03175846,  787934.45815941, ...,
       1663852.15619013,  930328.51351073, 7509381.52423227])

In [86]:
pred_df = pd.Series(kaggle_Y_pred, name='Predicted') 

# Save the results
pred_df.to_csv('Results/Mean_area0_region1_bathroom1_01.csv', header=['Predicted'], index_label=['Id'])

In [69]:
kaggle_Y_pred_median = np.median(base_models_results,axis=1)
kaggle_Y_pred_median

array([2017057.57178699, 2589229.125     ,  793176.49432358, ...,
       1665254.85928519,  948634.07130285, 7204819.25      ])

In [73]:
pred_df = pd.Series(kaggle_Y_pred_median, name='Predicted') 

# Save the results
pred_df.to_csv('Results/Median6.csv', header=['Predicted'], index_label=['Id'])