# Moscow Housing Prediction: Group 60

### Team members 

* Emil Ekornrød (Student ID: 506828) 
* Torkild Sandnes Grøstad (Student ID: 506595)
* Henrik Gundersen (Student ID: 488635) 



# 0: Loading Data

In [273]:
# !pip install lightgbm
# !pip install xgboost
# !pip install catboost

import numpy as np 
import pandas as pd 
import seaborn as sns 
import geopy.distance
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import matplotlib.pyplot as plt 
from pointpats import centrography
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

np.random.seed(123)
sns.set_style('darkgrid')
pd.set_option('display.max_colwidth', None)

In [5]:
# Load the data
apartments = pd.read_csv('data/apartments_train.csv')
buildings = pd.read_csv('data/buildings_train.csv')
data = pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)
apartments_test = pd.read_csv('data/apartments_test.csv')
buildings_test = pd.read_csv('data/buildings_test.csv')
data_test = pd.merge(apartments_test, buildings_test.set_index('id'), how='left', left_on='building_id', right_index=True)


# 1: Data Cleaning

In [6]:
# Dataframe with train and test for imputing NaN values
cleaning = pd.concat([data, data_test], axis=0)
cleaning.shape

(33222, 34)

In [7]:
# For LightGBM v3 with no preprocessing
raw_data = cleaning.copy()

In [8]:
cleaning.drop_duplicates();

### Outliers, duplicates and inconsistent observations

#### area_total

In [9]:
# drop outliers
cleaning = cleaning[cleaning['area_total'] < 1500]

In [10]:
# Looks like collection error in district 8.0, strategy -> set kitchen and living to NaN to impute later
cleaning_df = cleaning[cleaning['area_total'] < cleaning['area_living']]
cleaning_df = cleaning_df[cleaning_df['district'] == 8.0]
ids = cleaning_df['id'].values

for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'area_living'] = np.nan
    cleaning.loc[(cleaning['id'] == ide), 'area_kitchen'] = np.nan

In [11]:
# Want to check district 8 area_kitchen and area_living
cleaning_df = cleaning[cleaning['district'] == 8.0]
cleaning_df = cleaning_df[cleaning_df['area_living'] == 80]
cleaning_df = cleaning_df[cleaning_df['area_kitchen'] == 20]
ids = cleaning_df['id'].values

# Set all area_kitchen, area_living == 20, 80 in district 8.0 to NaN to impute later
for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'area_living'] = np.nan
    cleaning.loc[(cleaning['id'] == ide), 'area_kitchen'] = np.nan

In [12]:
# Set rest of inconsistent, -> area_living equal area_total
cleaning_df = cleaning[cleaning['area_total'] < cleaning['area_living']]
ids = cleaning_df['id'].values

for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'area_living'] = cleaning.loc[(cleaning['id'] == ide), 'area_total']

In [13]:
cleaning.loc[1107, 'area_kitchen'] = np.nan

In [14]:
cleaning_df = cleaning[cleaning['area_kitchen'] > cleaning['area_living']]
ids = cleaning_df['id'].values

for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'area_living'] = cleaning.loc[(cleaning['id'] == ide), 'area_total']

In [15]:
# Examine floor higher than stories
cleaning_df = cleaning[cleaning['floor'] > cleaning['stories']]
cleaning_df[['floor', 'stories', 'building_id']]
ids = cleaning_df['id'].values

# Set floor values that are above stories to be stories / 2
for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'floor'] = cleaning.loc[(cleaning['id'] == ide), 'stories'] / 2    

In [16]:
# Correcting for high number of rooms on small area
cleaning.loc[10831, 'rooms'] = 2
cleaning.loc[10830, 'rooms'] = 2
cleaning.loc[(cleaning['id'] == 25049), 'rooms'] = 3
cleaning.loc[(cleaning['id'] == 25083), 'rooms'] = 3

In [17]:
# Check apartments in same building and change if out of the ordinary
cleaning.loc[2358, 'bathrooms_private'] = 1
cleaning.loc[8664, 'bathrooms_private'] = 1
cleaning.loc[17413, 'bathrooms_private'] = 1
cleaning.loc[(cleaning['id'] == 26821), 'bathrooms_private'] = 1

In [18]:
# Check ceiling feature
cleaning.loc[2853, 'ceiling'] = 3.5
cleaning_df = cleaning[cleaning['ceiling'] >= 100]
ids = cleaning_df['id'].values

for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'ceiling'] = cleaning.loc[(cleaning['id'] == ide), 'ceiling'] / 100

cleaning_df = cleaning[cleaning['ceiling'] > 10]
ids = cleaning_df['id'].values

for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'ceiling'] = cleaning.loc[(cleaning['id'] == ide), 'ceiling'] / 10

### Missing Data



In [19]:
def fast_mode(df, key_cols, value_col):
    return (df.groupby(key_cols + [value_col]).size() 
              .to_frame('counts').reset_index() 
              .sort_values('counts', ascending=False) 
              .drop_duplicates(subset=key_cols)).drop(columns='counts')

### Missing data: layout, ceiling

In [20]:
# Too many missing values, drop columns
cleaning = cleaning.drop(columns=['layout'])

### Missing data: District

In [21]:
data_district = cleaning[['longitude', 'latitude', 'district', 'building_id']]
data_district = data_district[data_district['district'].isnull()]
coordinates_missing = []

for building in data_district.building_id.unique():
    df_building = cleaning[cleaning['building_id'] == building]    
    coordinates_missing.append((df_building['latitude'].unique()[0], df_building['longitude'].unique()[0], building))


Manually see where these coordinates are on the map to determine district:

In [22]:
building_to_district = {
    4162: 7.0,
    2265: 7.0,
    1647: 7.0,
    6403: 7.0,
    183: 7.0,
    926: 3.0,
    4636: "check further, VC bird international airport",
    4412: "check further, VC bird international airport",
    4202: "check further, outskirts of city in kirgistan",
    7317: 6.0,
    8811: "check further, all the way towards japan",
    5667: "check further, all the way towards japan",
    6879: 3.0
}

In [23]:
# Impute districts found
cleaning.loc[(cleaning['building_id'] == 4162), 'district'] = 7.0
cleaning.loc[(cleaning['building_id'] == 2265), 'district'] = 7.0
cleaning.loc[(cleaning['building_id'] == 1647), 'district'] = 7.0
cleaning.loc[(cleaning['building_id'] == 6403), 'district'] = 7.0
cleaning.loc[(cleaning['building_id'] == 183), 'district'] = 7.0
cleaning.loc[(cleaning['building_id'] == 926), 'district'] = 3.0
cleaning.loc[(cleaning['building_id'] == 7317), 'district'] = 6.0
cleaning.loc[(cleaning['building_id'] == 6879), 'district'] = 3.0

data_district = cleaning[['longitude', 'latitude', 'district', 'building_id']]
data_district = data_district[data_district['district'].isnull()]

In [24]:
# Check weird if weird coordinates is in train or test, if train drop
building_4636 = data[data['building_id'] == 4636]
building_4412 = data[data['building_id'] == 4412]
building_4202 = data[data['building_id'] == 4202]
building_8811 = data[data['building_id'] == 8811]
building_5667 = data[data['building_id'] == 5667]
building_3803 = data[data['building_id'] == 3803]

In [25]:
# Examine building with weird coordinates
weird_building = [4636, 4412, 4202, 8811, 5667, 3803]

addresses_missing = []

for building in weird_building:
    df_building = cleaning[cleaning['building_id'] == building]
    addresses_missing.append((df_building['street'].unique()[0], 
                              df_building['address'].unique()[0], 
                              building))

In [26]:
# Manually check address for district
weird_building_to_district = {
    3803: 11.0,
    4636: 11.0,
    4412: 11.0,
    4202: 11.0,
    8811: 11.0,
    5667: 11.0,
}

In [27]:
# Fixing values latitude, longitude
cleaning.loc[(cleaning['building_id'] == 3803), 'latitude'] = 55.56729707552548
cleaning.loc[(cleaning['building_id'] == 3803), 'longitude'] = 37.48142051265652

cleaning.loc[(cleaning['building_id'] == 4636), 'latitude'] = 55.554940129860185
cleaning.loc[(cleaning['building_id'] == 4636), 'longitude'] = 37.53089613751531
cleaning.loc[(cleaning['building_id'] == 4412), 'latitude'] = 55.554940129860185
cleaning.loc[(cleaning['building_id'] == 4412), 'longitude'] = 37.53089613751531

cleaning.loc[(cleaning['building_id'] == 4202), 'latitude'] = 55.48214820099864
cleaning.loc[(cleaning['building_id'] == 4202), 'longitude'] = 36.990398748020795

cleaning.loc[(cleaning['building_id'] == 8811), 'latitude'] = 55.48214820099864
cleaning.loc[(cleaning['building_id'] == 8811), 'longitude'] = 36.990398748020795
cleaning.loc[(cleaning['building_id'] == 5667), 'latitude'] = 55.48214820099864
cleaning.loc[(cleaning['building_id'] == 5667), 'longitude'] = 36.990398748020795


In [28]:
for building in weird_building:
    cleaning.loc[(cleaning['building_id'] == building), 'district'] = 11.0

### Missing data: impute with mode by district

In [29]:
def impute_missing_by_mode_in_district(cleaning, column):
    mode_by_district = fast_mode(cleaning, ['district'], column)
    mode_by_district.set_index('district', inplace=True)

    districts = cleaning.district.unique()
    for district in districts:
        cleaning.loc[(cleaning['district'] == district) & (cleaning[column].isna()), column] = mode_by_district.loc[district,column]

    return cleaning

In [30]:
cleaning = impute_missing_by_mode_in_district(cleaning, 'seller')
cleaning = impute_missing_by_mode_in_district(cleaning, 'bathrooms_shared')
cleaning = impute_missing_by_mode_in_district(cleaning, 'bathrooms_private')
cleaning = impute_missing_by_mode_in_district(cleaning, 'balconies')
cleaning = impute_missing_by_mode_in_district(cleaning, 'constructed')
cleaning = impute_missing_by_mode_in_district(cleaning, 'loggias')
cleaning = impute_missing_by_mode_in_district(cleaning, 'phones')
cleaning = impute_missing_by_mode_in_district(cleaning, 'new')
cleaning = impute_missing_by_mode_in_district(cleaning, 'ceiling')
cleaning = impute_missing_by_mode_in_district(cleaning, 'condition')
cleaning = impute_missing_by_mode_in_district(cleaning, 'material')
cleaning = impute_missing_by_mode_in_district(cleaning, 'elevator_without')
cleaning = impute_missing_by_mode_in_district(cleaning, 'elevator_passenger')
cleaning = impute_missing_by_mode_in_district(cleaning, 'elevator_service')

### Missing data: area_living

In [31]:
missing_living = cleaning[cleaning['area_living'].isnull()]

# Impute missing value by mean percent of total is living by district
cleaning['living_to_total'] = cleaning['area_living'] / cleaning['area_total']

mean_living_to_total_by_district = cleaning[['living_to_total', 'district']].groupby('district').mean()

cleaning['mean_living_to_total_by_district'] = 0

districts = cleaning.district.unique()

for district in districts:
    cleaning.loc[(cleaning['district'] == district), 'mean_living_to_total_by_district'] = mean_living_to_total_by_district.loc[district,'living_to_total']

cleaning['area_living'].fillna(cleaning['area_total']*cleaning['mean_living_to_total_by_district'], inplace=True)

for ide in ids:
    cleaning.loc[(cleaning['area_living'] == 0), 'area_living'] = cleaning.loc[(cleaning['id'] == ide),'area_total']*cleaning.loc[(cleaning['id'] == ide),'mean_living_to_total_by_district']

In [32]:
# area_living == area_total
cleaning_df = cleaning[cleaning['area_living'] == cleaning['area_total']]
ids = cleaning_df['id'].values

for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'area_living'] = cleaning.loc[(cleaning['id'] == ide),'area_total']*cleaning.loc[(cleaning['id'] == ide),'mean_living_to_total_by_district']
    
cleaning = cleaning.drop(columns=['mean_living_to_total_by_district', 'living_to_total'])    

### Missing data: ceiling

In [33]:
cleaning_df = cleaning[cleaning['ceiling'] == 0]
ids = cleaning_df['id'].values

for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'ceiling'] = np.nan
    
cleaning = impute_missing_by_mode_in_district(cleaning, 'ceiling')

### Missing data: area_kitchen

In [34]:
missing_kitchen = cleaning[cleaning['area_kitchen'].isnull()]

In [35]:
# Impute missing value by mean percent of total is kitchen by district
cleaning['kitchen_to_total'] = cleaning['area_kitchen'] / cleaning['area_total']

mean_kitchen_to_total_by_district = cleaning[['kitchen_to_total', 'district']].groupby('district').mean()

cleaning['mean_kitchen_to_total_by_district'] = 0

for district in districts:
    cleaning.loc[(cleaning['district'] == district), 'mean_kitchen_to_total_by_district'] = mean_kitchen_to_total_by_district.loc[district,'kitchen_to_total']

cleaning['area_kitchen'].fillna(cleaning['area_total']*cleaning['mean_kitchen_to_total_by_district'], inplace=True)
cleaning = cleaning.drop(columns=['mean_kitchen_to_total_by_district', 'kitchen_to_total'])

### Missing data: Impute with "missing" category

In [36]:
def impute_missing_by_new_category(df, column):
    df.loc[(df[column].isnull()), column] = 'Missing'
    return df

In [37]:
cleaning = impute_missing_by_new_category(cleaning, 'parking')
cleaning = impute_missing_by_new_category(cleaning, 'garbage_chute')
cleaning = impute_missing_by_new_category(cleaning, 'heating')
#cleaning = impute_missing_by_new_category(cleaning, 'condition')
cleaning = impute_missing_by_new_category(cleaning, 'new')
cleaning = impute_missing_by_new_category(cleaning, 'windows_street')
cleaning = impute_missing_by_new_category(cleaning, 'windows_court')

# 2. Feature Engineering

In [38]:
data_fe = cleaning

In [39]:
data_fe['price_log'] = np.log10(data_fe.price)

In [40]:
data_fe['area_total_log'] = np.log(data_fe['area_total'] + 1)

### Feature Engineering: room_size = area_total / rooms

In [41]:
data_fe['room_size'] = data_fe['area_total'] / data_fe['rooms']

In [42]:
data_fe['room_size_log'] = np.log(data_fe['room_size'] + 1) 

### Feature Engineering: distance_to_mean (distance to "hottest place to be")

In [47]:
value = 5e7
data_price_fltr = cleaning[cleaning['price'] > value]

mean_center = centrography.mean_center(data_price_fltr[['longitude', 'latitude']])
median_center = centrography.euclidean_median(data_price_fltr[['longitude', 'latitude']])

# Create apartment point tuples
fixed_mean_center = (mean_center[1], mean_center[0])

def create_point_tuples_column(data):
    point_tuples_list = []
    for index, row in data.iterrows(): 
        if np.isnan(row['latitude']) or np.isnan(row['longitude']):
            point_tuples_list.append((0.0, 0.0))
        else:  
            this_point = (row['latitude'], row['longitude'])
            point_tuples_list.append(this_point) 
    data['point_tuples'] = point_tuples_list
    return data['point_tuples'], fixed_mean_center

#Calculate the distance from the mean center to all apartments
def calculate_distances(data):
    point_tuples_column, fixed_mean_center = create_point_tuples_column(data)
    data['distance_mean'] = point_tuples_column.apply(lambda x: geopy.distance.geodesic(fixed_mean_center, x).km)
    #if isinstance(x[0], np.nan)) | (isinstance(x[1], np.nan)) else geopy.distance.geodesic(fixed_mean_center, x).km)
    return data

data_fe = calculate_distances(data_fe)
data_fe['distance_mean_log'] = np.log(data_fe['distance_mean']+1) 
data_fe['distance_mean_log_standard'] = (data_fe['distance_mean_log'] - 2.336536) / 0.690107

### Feature Engineering: age_of_building = 2023 - constructed

In [48]:
data_fe['age_of_building'] = 2023 - data_fe['constructed']

In [49]:
data_fe['age_of_building_log'] = np.log(data_fe['age_of_building']+1)

In [50]:
data_fe['age_of_building_log_standard'] = (data_fe['age_of_building_log'] - 2.570196) / 1.011501

### Feature Engineering: cat_constructed

In [51]:
constructed_mean = pd.DataFrame(data_fe.groupby("constructed", as_index=False)["price_log"].mean())
constructed_median = pd.DataFrame(data_fe.groupby("constructed", as_index=False)["price_log"].median())

In [52]:
zero = constructed_mean[constructed_mean['price_log'] < 7.0]
zero = zero['constructed'].values
zero = list(zero)

In [53]:
one = constructed_mean[constructed_mean['price_log'] < 7.25]
one = one[one['price_log'] >= 7.0]
one = one['constructed'].values
one = list(one)

In [54]:
two = constructed_mean[constructed_mean['price_log'] < 7.50]
two = two[two['price_log'] >= 7.25]
two = two['constructed'].values
two = list(two)

In [55]:
three = constructed_mean[constructed_mean['price_log'] < 7.75]
three = three[three['price_log'] >= 7.50]
three = three['constructed'].values
three = list(three)

In [56]:
four = constructed_mean[constructed_mean['price_log'] < 9.0]
four = four[four['price_log'] >= 7.75]
four = four['constructed'].values
four = list(four)

In [57]:
cleaning['cat_constructed'] = 0

In [58]:
ids = cleaning['id'].values

for ide in ids:
    build_year = cleaning.loc[(cleaning['id'] == ide), 'constructed']
    build_year = float(build_year)
    
    if build_year in zero:
        cleaning.loc[(cleaning['id'] == ide), 'cat_constructed'] = 0
    elif build_year in one:
        cleaning.loc[(cleaning['id'] == ide), 'cat_constructed'] = 1
    elif build_year in two:
        cleaning.loc[(cleaning['id'] == ide), 'cat_constructed'] = 2
    elif build_year in three:
        cleaning.loc[(cleaning['id'] == ide), 'cat_constructed'] = 3
    elif build_year in four:
        cleaning.loc[(cleaning['id'] == ide), 'cat_constructed'] = 4
    else:
        cleaning.loc[(cleaning['id'] == ide), 'cat_constructed'] = "Missing"

In [59]:
# Manually assign missing
cleaning.loc[(cleaning['constructed'] == 1875), 'cat_constructed'] = 2
cleaning.loc[(cleaning['constructed'] == 1887), 'cat_constructed'] = 2
cleaning.loc[(cleaning['constructed'] == 1892), 'cat_constructed'] = 3
cleaning.loc[(cleaning['constructed'] == 1923), 'cat_constructed'] = 2
cleaning.loc[(cleaning['constructed'] == 1945), 'cat_constructed'] = 2
cleaning.loc[(cleaning['constructed'] == 1947), 'cat_constructed'] = 2

In [60]:
cleaning['build_year'] = cleaning['cat_constructed']

### Feature Engineering: floor_perc = floor / stories

In [61]:
data_fe['floor_perc'] = data_fe['floor'] / data_fe['stories']

In [62]:
cleaning_df = data_fe[data_fe['floor_perc'] == 1]
#cleaning_df[cleaning_df['price'].isnull()]
#print(cleaning_df['building_id'].value_counts())
#cleaning_df[cleaning_df['building_id'] == 5663]

ids = cleaning_df['id'].values

for ide in ids:
    cleaning.loc[(cleaning['id'] == ide), 'floor_perc'] = 0.5

In [63]:
data_fe['floor_perc_log'] = np.log(data_fe['floor_perc'] + 1) 

### Feature Engineering: ceiling

In [68]:
data_fe.loc[2494, 'ceiling'] = 3.0
data_fe.loc[13256, 'ceiling'] = 3.0
data_fe.loc[6090, 'ceiling'] = 2.65

### Feature Engineering: distance_to_metro

In [69]:
data_metros = pd.read_csv('data/moscow_metro_stations.csv', encoding='cp1252')

def create_metro_tuples_column(data_metros):
    metro_tuples_list = []
    for row in data_metros.itertuples(): 
        metro_tuples_list.append((row.latitude, row.longitude))
    data_metros['metro_tuple'] = metro_tuples_list
    return data_metros['metro_tuple'] 

create_metro_tuples_column(data_metros)

0        (55.8148, 37.7342)
1        (55.8038, 37.7448)
2        (55.7963, 37.7151)
3        (55.7888, 37.6802)
4        (55.7801, 37.6673)
               ...         
237    (55.70497, 37.81901)
238      (55.7033, 37.8511)
239        (55.71, 37.8792)
240      (55.7085, 37.9004)
241      (55.7036, 37.9264)
Name: metro_tuple, Length: 242, dtype: object

In [70]:
#data_metros = data_metros.drop(242)
#data_metros

In [71]:
def calculate_distance_metro(data_metros, data):
    metro_tuples = data_metros['metro_tuple']
    distance = 999
    distances_to_metro = []
    for point in data['point_tuples']:
        list_of_distances = metro_tuples.apply(lambda x: geopy.distance.geodesic(point, x).km)
        distances_to_metro.append(min(list_of_distances))
    data['distance_to_metro'] = distances_to_metro
    return data['distance_to_metro']

In [72]:
#calculate_distance_metro(data_metros, data_fe)

In [73]:
calculated_distances_metro = pd.read_csv('data/distance_to_metro.csv')
data_fe = pd.merge(data_fe, calculated_distances_metro.set_index('id'), how='left', left_on='id', right_index=True)

In [74]:
data_fe['distance_to_metro']

0        1.923224
1        0.914736
2        1.644991
3        0.914736
4        1.221645
          ...    
9932     2.152614
9933     0.632797
9934     2.582979
9935     0.722539
9936    13.446798
Name: distance_to_metro, Length: 33219, dtype: float64

In [75]:
data_fe['distance_to_metro'] = np.log(data_fe['distance_to_metro'])

### Feature Cleaning

In [76]:
data_fe['new'] = pd.to_numeric(data_fe['new'])
data_fe['cat_constructed'] = pd.to_numeric(data_fe['cat_constructed'])

In [77]:
data_fe['material'] = data_fe['material'].astype(int)

In [78]:
data_fe['condition'] = data_fe['condition'].astype(int)

In [79]:
data_fe['latitude_squared'] = data_fe['latitude']**2
data_fe['longitude_squared'] = data_fe['longitude']**2

In [80]:
data_fe['room_size_log_squared'] = data_fe['room_size_log']**2

### Feature Engineering: distances to universities

In [85]:
data_uni = pd.read_csv('data/universities_moscow.csv', encoding='cp1252')

def create_uni_tuples_column(data_uni):
    uni_tuples_list = []
    for row in data_uni.itertuples(): 
        uni_tuples_list.append((row.latitude, row.longitude))
    data_uni['lat_long_tuple'] = uni_tuples_list
    return data_uni['lat_long_tuple'] 

create_uni_tuples_column(data_uni)

0    (55.703819, 37.528401)
1    (55.766022, 37.684533)
2    (55.929809, 37.520234)
3    (55.761435, 37.633197)
4     (55.64983, 37.664436)
5    (55.672896, 37.484679)
6    (55.741182, 37.619337)
7    (55.727818, 37.627432)
Name: lat_long_tuple, dtype: object

In [86]:
def calculate_distance_uni(data_uni, data):
    uni_tuples = data_uni['lat_long_tuple']
    distances_to_uni = []
    for point in data['point_tuples']:
        list_of_distances = uni_tuples.apply(lambda x: geopy.distance.geodesic(point, x).km)
        distances_to_uni.append(min(list_of_distances))
    data['distance_to_uni'] = distances_to_uni
    return data['distance_to_uni']

In [87]:
calculate_distance_uni(data_uni, data_fe);

In [88]:
data_fe['distance_to_uni'] = np.log(data_fe['distance_to_uni'])

In [89]:
data_fe['distance_to_uni'].isnull().sum()

0

### Feature Engineering: Bearing to center

In [94]:
from numpy import arctan2, random, sin, cos, degrees

def calc_bearing(data):
  center = (mean_center[1], mean_center[0]) # (55, 37) lat, long
  bearing_list = []
  for point in data.point_tuples:
    # delta_L is the difference between the Longitudal values of the two points
    dLong = center[1] - point[1]
    x = cos(center[0])*sin(dLong)
    y = cos(point[0])*sin(center[0]) - sin(point[0])*cos(center[0])*cos(dLong)
    bearing = degrees(arctan2(x, y))
    bearing_list.append(bearing)
  data['bearing_to_center'] = bearing_list
  return data['bearing_to_center']

calc_bearing(data_fe);

# 3. Prediction

### Train/test split

In [148]:
# categorical lgbm: 'material', 'condition', 'cat_constructed' => must be floats
# categorical cb: 'floor', 'rooms', 'condition', 'district', 'stories', 'material', 'cat_constructed' => must be strings
data_fe.cat_constructed = data_fe.cat_constructed.astype(np.float64)
convert_types=['material', 'condition', 'cat_constructed', 'floor', 'rooms',  'district', 'stories']
for col in convert_types:
    data_fe[col + '_string'] = data_fe[col].astype(str)

In [149]:
train = data_fe[data_fe['price'].notnull()]
test = data_fe[data_fe['price'].isnull()]
train['price_log'] = np.log(train['price'])
group = train.building_id

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['price_log'] = np.log(train['price'])


### Features

In [96]:
categorical_features = [0, 1, 2]

cb_cat_features = ['floor', 'rooms', 'condition', 'district', 'stories', 'material']

xgb_features = ['longitude', 'latitude', 'floor_perc', 'condition', 'material', 
                 'ceiling',  'distance_to_uni', 'area_total', 'distance_mean', 'room_size', 'cat_constructed']


catb_features = [ 'age_of_building', 'area_total', 'longitude', 'latitude',
                 'ceiling', 'room_size', 'distance_to_uni', 'distance_mean', 'distance_to_metro',
                    'floor', 'rooms', 'condition', 'district', 'stories', 'material', 'cat_constructed']

lgbm_features = ['material', 'condition', 'constructed', 'floor_perc',
            'area_total',  'ceiling', 'room_size', 'distance_to_uni',
            'distance_mean', 'longitude', 'latitude']

rf_features = ['material', 'condition', 'cat_constructed', 'floor_perc',  'age_of_building_log',
            'area_total_log',  'ceiling', 'room_size_log', 'distance_to_uni',
            'distance_mean_log', 'longitude', 'latitude']

### XGBoost

In [97]:
convert_types = ['seller', 'floor', 'rooms', 'bathrooms', 'balc_loggias', 'condition', 
                 'phones', 'new', 'district', 'stories', 'elevator_without', 'material', 'cat_constructed']
for col in convert_types:
    train[col] = train[col].astype(float) 
    test[col] = test[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = train[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(float)


In [100]:
xgb = xgb.XGBRegressor(    
    metric= 'rmsle',
    #tree_method= 'gpu_hist',
    n_estimators= 6000,
    reg_lambda= 3.3970862089233713,
    reg_alpha= 0.05,
    colsample_bytree= 0.9,
    subsample= 0.7,
    learning_rate= 0.0086,
    max_depth= 11,
    random_state= 48,
    min_child_weight= 1,
    seed= 42
)

xgb.fit(train[xgb_features], train['price_log'])

Parameters: { "metric" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.0086, max_delta_step=0,
             max_depth=11, metric='rmsle', min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=6000, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=48,
             reg_alpha=0.05, reg_lambda=3.3970862089233713, scale_pos_weight=1,
             seed=42, subsample=0.7, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [101]:
xgb_pred = xgb.predict(test[xgb_features])

### CatBoost

In [102]:
convert_types = ['seller', 'floor', 'rooms', 'bathrooms', 'balc_loggias', 'condition', 'constructed', 
                 'phones', 'new', 'district', 'stories', 'elevator_without', 'material', 'cat_constructed']
for col in convert_types:
    train[col] = train[col].astype(str) 
    test[col] = test[col].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = train[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(str)


In [103]:
best_trial_cb={
    'l2_leaf_reg': 0.0029675173022904867,
    'max_bin': 389,
    'bagging_temperature': 0.7448407730182606,
    'learning_rate': 0.05226579708274029,
    'depth': 11,
    'min_data_in_leaf': 88,
    #'rsm': 1
}

model_cb = cb.CatBoostRegressor(**best_trial_cb)

model_cb.fit(train[catb_features], train['price_log'], cat_features=cb_cat_features )

0:	learn: 0.8235188	total: 292ms	remaining: 4m 51s
1:	learn: 0.7876871	total: 430ms	remaining: 3m 34s
2:	learn: 0.7530108	total: 562ms	remaining: 3m 6s
3:	learn: 0.7207778	total: 660ms	remaining: 2m 44s
4:	learn: 0.6901308	total: 797ms	remaining: 2m 38s
5:	learn: 0.6608522	total: 930ms	remaining: 2m 34s
6:	learn: 0.6332870	total: 1.06s	remaining: 2m 30s
7:	learn: 0.6070752	total: 1.19s	remaining: 2m 27s
8:	learn: 0.5821676	total: 1.32s	remaining: 2m 25s
9:	learn: 0.5588884	total: 1.45s	remaining: 2m 23s
10:	learn: 0.5368302	total: 1.58s	remaining: 2m 22s
11:	learn: 0.5162398	total: 1.72s	remaining: 2m 21s
12:	learn: 0.4967104	total: 1.85s	remaining: 2m 20s
13:	learn: 0.4782592	total: 1.98s	remaining: 2m 19s
14:	learn: 0.4603992	total: 2.12s	remaining: 2m 19s
15:	learn: 0.4440629	total: 2.24s	remaining: 2m 17s
16:	learn: 0.4284932	total: 2.37s	remaining: 2m 17s
17:	learn: 0.4136223	total: 2.5s	remaining: 2m 16s
18:	learn: 0.3996060	total: 2.64s	remaining: 2m 16s
19:	learn: 0.3860516	tot

<catboost.core.CatBoostRegressor at 0x1cb091a55e0>

In [104]:
cb_pred = model_cb.predict(test[catb_features])

### LightGBM V1

In [106]:
convert_types = ['seller', 'floor', 'rooms', 'bathrooms', 'balc_loggias', 'condition', 'constructed',
                 'phones', 'new', 'district', 'stories', 'elevator_without', 'material', 'cat_constructed']
for col in convert_types:
    train[col] = train[col].astype(float) 
    test[col] = test[col].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[col] = train[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[col] = test[col].astype(float)


In [107]:
RS = 20170501
np.random.seed(RS)
ROUNDS = 450

model = lgb.LGBMRegressor(
    objective ='regression',
    metric = 'rmsle',
    boosting = 'gbdt',
    num_leaves = 60, 
    max_depth = 12, 
    min_child_samples = 4, 
    max_bin = 1000,
    learning_rate = 0.032543242172229245, 
    verbose = -1,
    bagging_fraction = 0.7559447209589375, 
    bagging_freq = 3, 
    bagging_seed = 42,
    feature_fraction = 0.6697712968829559, 
    feature_fraction_seed = 42,
    num_iterations = 2400, 
    n_estimators = 10000,
    lambda_l1 = 1.5680852745891572e-06, 
    lambda_l2 = 5.5199200274889796e-05,
    categorical_features = categorical_features
)

model.fit(train[lgbm_features], train['price_log'])



LGBMRegressor(bagging_fraction=0.7559447209589375, bagging_freq=3,
              bagging_seed=42, boosting='gbdt', categorical_features=[0, 1, 2],
              feature_fraction=0.6697712968829559, feature_fraction_seed=42,
              lambda_l1=1.5680852745891572e-06,
              lambda_l2=5.5199200274889796e-05,
              learning_rate=0.032543242172229245, max_bin=1000, max_depth=12,
              metric='rmsle', min_child_samples=4, n_estimators=10000,
              num_iterations=2400, num_leaves=60, objective='regression',
              verbose=-1)

In [108]:
lgbm_preds = model.predict(test[lgbm_features])

### META model (RF, XGB, LGBM, LGBM_V2 & CB)  

In [201]:
# LightGBM 
lgb_features = ['material', 'condition', 'constructed', 'longitude', 'latitude', 'floor_perc',
            'area_total', 'ceiling', 'room_size', 'distance_to_uni', 'distance_mean']

lgb_cat_features = [0, 1, 2]

# LightGBM V2 
lgb_v2_features = ['material', 'condition',  'constructed', 'stories', 'floor', 'rooms',
             'ceiling',  'distance_to_uni', 
            'distance_mean_log', 'area_total_log', 'bearing_to_center']

lgb_v2_cat_features = [0, 1, 2, 4, 5]

# XGBoost 
xgb_features = ['longitude', 'latitude', 'floor_perc', 'condition',  'material', 'ceiling', 'distance_to_uni',
                  'area_total', 'distance_mean', 'room_size', 'cat_constructed']

# CatBoost 
cb_features = ['longitude', 'latitude', 'age_of_building', 'area_total', 'ceiling', 'room_size',
                'distance_to_uni', 'distance_mean', 'distance_to_metro', 'floor_string', 'rooms_string',
                   'condition_string', 'district_string', 'stories_string', 'material_string', 'cat_constructed_string']

cb_cat_features = [9, 10, 11, 12, 13, 14, 15]

# Random Forest
rf_features = ['material', 'condition', 'cat_constructed', 'floor_perc', 'age_of_building_log',
            'area_total_log',  'ceiling', 'room_size_log', 'distance_to_uni', 'distance_mean_log', 'longitude', 'latitude']

In [253]:
model_lgb = lgb.LGBMRegressor(
    objective ='regression',
    metric = 'rmsle',
    boosting = 'gbdt',
    num_leaves = 60, 
    max_depth = 12, 
    min_child_samples = 4, 
    max_bin = 1000,
    learning_rate = 0.032543242172229245, 
    verbose = -1,
    bagging_fraction = 0.7559447209589375, 
    bagging_freq = 3, 
    bagging_seed = 42,
    feature_fraction = 0.6697712968829559, 
    feature_fraction_seed = 42,
    num_iterations = 2400, 
    n_estimators = 10000,
    lambda_l1 = 1.5680852745891572e-06, 
    lambda_l2 = 5.5199200274889796e-05,
    categorical_features = lgb_cat_features
)

model_xgb = xgb.XGBRegressor(    
    metric = 'rmsle',
    n_estimators = 6000,
    reg_lambda = 3.3970862089233713,
    reg_alpha = 0.05,
    colsample_bytree = 0.9,
    subsample = 0.7,
    learning_rate = 0.0086,
    max_depth = 11,
    random_state = 48,
    min_child_weight = 1,
    seed = 42
)

model_cb = cb.CatBoostRegressor(
    l2_leaf_reg = 0.0029675173022904867,
    max_bin = 389,
    bagging_temperature = 0.7448407730182606,
    learning_rate = 0.05226579708274029,
    depth = 11,
    min_data_in_leaf = 88,
    #'rsm': 1
    cat_features = cb_cat_features
)

model_rf = RandomForestRegressor(
    n_estimators=2000,
    random_state=42,
    bootstrap=False,
    max_features='sqrt'
)

model_lgb_v2 = lgb.LGBMRegressor(
    objective='regression',
    metric= 'rmsle',
    boosting= 'gbdt',
    num_leaves= 60, 
    max_depth= 10, 
    min_child_samples= 9, 
    max_bin= 1000,
    learning_rate= 0.032543242172229245, 
    verbose= -1,
    bagging_fraction= 0.6244885179807813, 
    bagging_freq= 6, 
    bagging_seed= 42,
    feature_fraction= 0.6293142647827907, 
    feature_fraction_seed= 42,
    num_iterations= 2000, 
    n_estimators= 10000,
    lambda_l1= 2.4308837323197616e-05, 
    lambda_l2= 0.0005, 
    categorical_features = lgb_v2_cat_features
)

In [186]:
n_train = train.shape[0]
n_test = test.shape[0]

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42) 

def get_oof(reg_model, x_train, y_train, x_test):
    
    oof_train = np.zeros((n_train,))
    oof_test = np.zeros((n_test,))
    oof_test_skf = np.empty((n_folds, n_test))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):  
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        reg_model.fit(x_tr, y_tr)

        oof_train[test_index] = reg_model.predict(x_te)
        oof_test_skf[i, :] = reg_model.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [187]:
y_train = train.price_log.ravel()

cb_oof_train, cb_oof_test = get_oof(model_cb, train[cb_features].values, y_train, test[cb_features].values)

0:	learn: 0.8247155	total: 488ms	remaining: 8m 7s
1:	learn: 0.7879733	total: 916ms	remaining: 7m 37s
2:	learn: 0.7544579	total: 1.05s	remaining: 5m 50s
3:	learn: 0.7218925	total: 1.57s	remaining: 6m 30s
4:	learn: 0.6903126	total: 2.04s	remaining: 6m 45s
5:	learn: 0.6610090	total: 2.59s	remaining: 7m 9s
6:	learn: 0.6334319	total: 3.16s	remaining: 7m 28s
7:	learn: 0.6068222	total: 3.58s	remaining: 7m 24s
8:	learn: 0.5821205	total: 4.17s	remaining: 7m 38s
9:	learn: 0.5589043	total: 4.67s	remaining: 7m 42s
10:	learn: 0.5369820	total: 5.16s	remaining: 7m 43s
11:	learn: 0.5160401	total: 5.76s	remaining: 7m 54s
12:	learn: 0.4965700	total: 6.45s	remaining: 8m 9s
13:	learn: 0.4778517	total: 7.08s	remaining: 8m 18s
14:	learn: 0.4602245	total: 7.72s	remaining: 8m 26s
15:	learn: 0.4430312	total: 8.24s	remaining: 8m 26s
16:	learn: 0.4274248	total: 8.78s	remaining: 8m 27s
17:	learn: 0.4127542	total: 9.47s	remaining: 8m 36s
18:	learn: 0.3993058	total: 10s	remaining: 8m 36s
19:	learn: 0.3862022	total:

In [188]:
lgb_oof_train, lgb_oof_test = get_oof(model_lgb, train[lgb_features].values, y_train, test[lgb_features].values)









































In [189]:
xgb_oof_train, xgb_oof_test = get_oof(model_xgb, train[xgb_features].values, y_train, test[xgb_features].values)

Parameters: { "metric" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "metric" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "metric" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "metric" } might not be used.

  This could be 

In [192]:
rf_oof_train, rf_oof_test = get_oof(model_rf, train[rf_features].values, y_train, test[rf_features].values)

In [203]:
lgb_v2_oof_train, lgb_v2_oof_test = get_oof(model_lgb_v2, train[lgb_v2_features].values, y_train, test[lgb_v2_features].values)









































In [204]:
x_train = np.concatenate((
    lgb_oof_train,
    xgb_oof_train,
    cb_oof_train,
    rf_oof_train,
    lgb_v2_oof_train
), axis=1)

x_test = np.concatenate((
    lgb_oof_test,
    xgb_oof_test,
    cb_oof_test,
    rf_oof_test,
    lgb_v2_oof_test
), axis=1)

In [207]:
meta_model = lgb.LGBMRegressor( 
    num_leaves=5,
    max_depth=7, 
    random_state=42, 
    silent=True, 
    metric='rmsle',
    n_jobs=4, 
    n_estimators=4000,
    colsample_bytree=1,
    subsample=0.9,
    learning_rate=0.05
)

meta_model.fit(x_train, y_train)
meta_model_train_pred = np.exp(meta_model.predict(x_train))
meta_model_pred = np.exp(meta_model.predict(x_test))



In [271]:
sub4 = pd.DataFrame()
sub4['id'] = test['id']
sub4['price_prediction'] =  meta_model_pred*0.5 + np.exp(xgb_pred)*0.2 + np.exp(cb_pred)*0.2 + np.exp(lgbm_preds)*0.1
sub4

Unnamed: 0,id,price_prediction
0,23285,2.997069e+07
1,23286,9.910036e+06
2,23287,6.151253e+06
3,23288,8.572743e+06
4,23289,5.297835e+06
...,...,...
9932,33217,2.878670e+07
9933,33218,2.073239e+07
9934,33219,9.484344e+06
9935,33220,9.516094e+06


In [272]:
sub4.to_csv('submission.csv', index=False)