## 0. Libraries

In [146]:
import json
import warnings
from functools import reduce

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Show all columns when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

## 1. Process data

In [147]:
class ProcessData:
    """Apply the same preprocessing steps to a train and a test DataFrame.

    Every statistic that a step needs (imputation value, encoder categories,
    scaler parameters) is learned from ``train`` only and then applied to
    both frames, so both frames are transformed identically.

    Attributes are documented next to their assignment in ``__init__``.
    """

    def __init__(self, train: pd.DataFrame, test: pd.DataFrame):
        """Store the two frames; column lists and target start as ``None``."""
        # The frames are stored by reference; the cleaning methods assign
        # columns on these very objects.
        self.train = train
        self.test = test
        # Filled by column_types().
        self.categorical = None
        self.numerical = None
        self.target = None

    def handle_na(self, columns: list, type_list: list):
        """Fill missing values in ``columns`` of both frames.

        Args:
            columns: Names of the columns to fill.
            type_list: One entry per column, paired positionally with
                ``columns``. ``'cat'`` fills with the string ``'missing'``;
                any other value fills with the most frequent train value,
                strips a trailing ``.0`` from string values and casts the
                column to ``int64``.

        Raises:
            ValueError: If a non-``'cat'`` column has no non-missing value
                in ``train``, so no fill value can be derived.
        """
        for col, tp in zip(columns, type_list):
            if tp == 'cat':
                for df in (self.train, self.test):
                    df[col] = df[col].fillna('missing')
                continue

            # The fill value comes from train only and is reused for test,
            # so both frames are imputed with the same statistic.
            modes = self.train[col].mode()
            if modes.empty:
                raise ValueError(
                    f"Column {col!r} has no non-missing values in train; "
                    "cannot derive a fill value."
                )
            fill_value = modes.values[0]

            for df in (self.train, self.test):
                df[col] = df[col].fillna(fill_value)
                # Values stored as text such as '5.0' must lose the suffix
                # before they can be cast to an integer.
                df[col] = df[col].replace(r'\.0$', '', regex=True)
                df[col] = df[col].astype(np.int64)

    def replace_cat(self, columns: list, type_list: list, replace_cat_dict: dict):
        """Replace values in ``columns`` of both frames using a mapping.

        Args:
            columns: Names of the columns to process.
            type_list: One entry per column, paired positionally with
                ``columns``. For entries other than ``'cat'`` a trailing
                ``.0`` is additionally stripped from string values.
            replace_cat_dict: Mapping passed to ``Series.replace``.
        """
        for col, tp in zip(columns, type_list):
            for df in (self.train, self.test):
                df[col] = df[col].replace(replace_cat_dict)
                if tp != 'cat':
                    df[col] = df[col].replace(r'\.0$', '', regex=True)

    def parse_date(self, date_column):
        """Convert ``date_column`` to datetime and derive calendar features.

        Adds the columns ``month``, ``dayofmonth``, ``dayofweek`` and
        ``dayofyear`` to both frames.
        """
        for df in (self.train, self.test):
            df[date_column] = pd.to_datetime(df[date_column])
            dates = df[date_column].dt
            df['month'] = dates.month
            df['dayofmonth'] = dates.day
            df['dayofweek'] = dates.dayofweek
            df['dayofyear'] = dates.dayofyear

    def encode_and_scale(self,
                         ordinal_columns: list = None,
                         scale_columns: list = None,
                         log_columns: list = None,
                         one_hot_columns: list = None,
                         scale='standard'):
        """Replace ``train`` and ``test`` with their transformed columns.

        The resulting frames contain only the columns produced here, in the
        order ordinal, one-hot, scaled, log. Columns not listed in any
        argument (including the target) are dropped, so call
        ``column_types`` first if the target is still needed.

        Args:
            ordinal_columns: Columns encoded with ``OrdinalEncoder``;
                categories unseen in train are encoded as ``-1``.
            scale_columns: Columns rescaled with ``StandardScaler`` when
                ``scale == 'standard'``, otherwise with ``MinMaxScaler``.
            log_columns: Columns transformed with ``np.log``.
            one_hot_columns: Columns encoded with ``OneHotEncoder``; the
                resulting columns are labelled ``0..k-1``.
            scale: ``'standard'`` or any other value for min-max scaling.

        Raises:
            ValueError: If no column group is given.
        """
        if not (ordinal_columns or one_hot_columns
                or scale_columns or log_columns):
            raise ValueError(
                "encode_and_scale needs at least one non-empty column list."
            )

        # Transformers return bare arrays; re-attach the original row labels
        # so the column-wise concat below (and any later concat with
        # self.target) lines rows up correctly.
        train_idx, test_idx = self.train.index, self.test.index
        train_parts = []
        test_parts = []

        if ordinal_columns:
            ordinal = OrdinalEncoder(handle_unknown='use_encoded_value',
                                     unknown_value=-1)
            ordinal.fit(self.train[ordinal_columns])
            train_parts.append(pd.DataFrame(
                ordinal.transform(self.train[ordinal_columns]),
                columns=ordinal_columns, index=train_idx))
            test_parts.append(pd.DataFrame(
                ordinal.transform(self.test[ordinal_columns]),
                columns=ordinal_columns, index=test_idx))

        if one_hot_columns:
            # Categories that occur only in test become all-zero rows
            # instead of raising, matching the ordinal encoder's tolerance.
            one_hot = OneHotEncoder(handle_unknown='ignore')
            one_hot.fit(self.train[one_hot_columns])
            train_parts.append(pd.DataFrame(
                one_hot.transform(self.train[one_hot_columns]).toarray(),
                index=train_idx))
            test_parts.append(pd.DataFrame(
                one_hot.transform(self.test[one_hot_columns]).toarray(),
                index=test_idx))

        if scale_columns:
            if scale == 'standard':
                scaler = StandardScaler()
            else:
                scaler = MinMaxScaler(feature_range=(0, 1))
            # Fit on train only; re-fitting on test would put the two
            # frames on different scales.
            scaler.fit(self.train[scale_columns])
            train_parts.append(pd.DataFrame(
                scaler.transform(self.train[scale_columns]),
                columns=scale_columns, index=train_idx))
            test_parts.append(pd.DataFrame(
                scaler.transform(self.test[scale_columns]),
                columns=scale_columns, index=test_idx))

        if log_columns:
            # Only the requested columns are log-transformed.
            train_parts.append(np.log(self.train[log_columns]))
            test_parts.append(np.log(self.test[log_columns]))

        self.train = pd.concat(train_parts, axis=1)
        self.test = pd.concat(test_parts, axis=1)

    def column_types(self, target=None,
                     extend_cat=None, extend_num=None,
                     remove_cat=None, remove_num=None):
        """Split the train columns into numerical and categorical lists.

        A column is categorical when its dtype is ``object`` and numerical
        otherwise. The lists are stored in ``self.numerical`` and
        ``self.categorical``.

        Args:
            target: Name of the target column. When given, the column is
                stored as a one-column frame in ``self.target`` and removed
                from both lists. When ``None``, ``self.target`` is left
                untouched.
            extend_cat: Names appended to the categorical list.
            extend_num: Names appended to the numerical list.
            remove_cat: Names removed from the categorical list.
            remove_num: Names removed from the numerical list.
        """
        dtypes = self.train.dtypes
        self.numerical = [name for name, dtype in dtypes.items()
                          if dtype != 'object']
        self.categorical = [name for name, dtype in dtypes.items()
                            if dtype == 'object']

        if target is not None:
            self.target = self.train[[target]]
            # The target may be of either kind; keep it out of both lists.
            self.numerical = [f for f in self.numerical if f != target]
            self.categorical = [f for f in self.categorical if f != target]

        # Extensions are applied before removals, so a name that is both
        # added and removed ends up removed.
        if extend_cat:
            self.categorical.extend(extend_cat)
        if extend_num:
            self.numerical.extend(extend_num)

        if remove_cat:
            self.categorical = [f for f in self.categorical
                                if f not in remove_cat]
        if remove_num:
            self.numerical = [f for f in self.numerical
                              if f not in remove_num]


Load data

In [148]:
# Load the dictionary for the 'floor' column.
# JSON text is UTF-8; pin the encoding so that decoding does not depend on
# the platform's default locale.
with open('data/data.json', encoding='utf-8') as json_file:
    floor_dict = json.load(json_file)
# Load the train and test tables.
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

In [149]:
# Names of the train columns that contain at least one missing value.
na_cols = train.columns[train.isnull().any().to_numpy()].to_list()
# Kind of each column in na_cols, listed in the same order.
type_list = ['num'] * 8 + ['cat']

Apply data processing class

In [150]:
# Wrap the train and test frames in a single processing object.
data = ProcessData(train=train, test=test)

In [151]:
# Clean the 'floor' column by mapping its values through floor_dict.
data.replace_cat(
    columns=['floor'],
    type_list=['cat'],
    replace_cat_dict=floor_dict,
)

In [152]:
# Fill in the missing values of every column listed in na_cols.
data.handle_na(columns=na_cols, type_list=type_list)

In [153]:
# Derive the calendar feature columns from the 'date' column.
data.parse_date(date_column='date')

In [154]:
# Name of the column to predict.
target = 'per_square_meter_price'

# Manual corrections to the dtype-based split: 'realty_type' is added to the
# categorical list and dropped from the numerical one; 'id' and 'date' are
# excluded from the lists they are named in.
extend_cat = ['realty_type']
remove_num = ['date', 'realty_type']
remove_cat = ['id', 'date']

# Build the numerical / categorical column lists.
data.column_types(
    target=target,
    extend_cat=extend_cat,
    remove_cat=remove_cat,
    remove_num=remove_num,
)

In [155]:
# Display the column labels of the train frame.
data.train.columns

Index(['city', 'floor', 'id', 'lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_name', 'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in

`encode_and_scale` applies one transformation per column group:

1. ordinal_columns - `OrdinalEncoder`,
2. scale_columns - `StandardScaler` if `scale='standard'`, otherwise `MinMaxScaler`,
3. log_columns - `np.log()`,
4. one_hot_columns - `OneHotEncoder`.

In [156]:
# Encode and rescale the data: one-hot encode 'region', ordinal-encode the
# remaining categorical columns, and rescale the numerical columns.
data.encode_and_scale(
    ordinal_columns=[col for col in data.categorical if col != 'region'],
    scale_columns=data.numerical,
    one_hot_columns=['region'],
)

In [157]:
# Put the target column back next to the processed train features.
train_to_save = pd.concat(objs=[data.train, data.target], axis='columns')

In [161]:
# Write the processed train data (with target) to disk without the index.
train_to_save.to_csv(path_or_buf='data_cleaned_3.csv', index=False)

In [162]:
# Write the same processed train data to a second file, without the index.
train_to_save.to_csv(path_or_buf='train_cleaned_2.csv', index=False)

In [163]:
# Write the processed test data to disk without the index.
data.test.to_csv(path_or_buf='test_cleaned_2.csv', index=False)