### Outline

This Ipython notebook is focused on data preparation : fill missing values and multicollinearity in dataset

   * Missing Value 
   * Outlier 
   * Multicollinearity 
   * Simple Submission 

### Setup Imports and Variables

In [1]:
import pandas as pd
import numpy as np
import pylab as plt
import matplotlib.pyplot as plt
##import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np
import seaborn as sns
from scipy import stats
color = sns.color_palette()
%matplotlib inline
# Set the global default size of matplotlib figures
plt.rc('figure', figsize=(12, 5))

# Size of matplotlib figures that contain subplots
fizsize_with_subplots = (12,12)

# Size of matplotlib histogram bins
bin_size = 10

### Load the Data

In [2]:
df_train = pd.read_csv('./data/input/train.csv')
df_train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [16]:
df_train.shape

(30471, 292)

In [3]:
df_test = pd.read_csv('./data/input/test.csv')
df_test.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,30474,2015-07-01,39.0,20.7,2,9,1,1998.0,1,8.9,...,8,0,0,0,1,10,1,0,14,1
1,30475,2015-07-01,79.2,,8,17,1,0.0,3,1.0,...,4,1,1,0,2,11,0,1,12,1
2,30476,2015-07-01,40.5,25.1,3,5,2,1960.0,2,4.8,...,42,11,4,0,10,21,0,10,71,11
3,30477,2015-07-01,62.8,36.0,17,17,1,2016.0,2,62.8,...,1,1,2,0,0,10,0,0,2,0
4,30478,2015-07-01,40.0,40.0,17,17,1,0.0,1,1.0,...,5,1,1,0,2,12,0,1,11,1


In [4]:
#basic variables
basic_feature = ['timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor', 'material', 'build_year',
                 'num_room', 'kich_sq', 'state','product_type', 'sub_area']

In [10]:
df_train['build_year'].describe(include='all')

count    1.686600e+04
mean     3.068057e+03
std      1.543878e+05
min      0.000000e+00
25%      1.967000e+03
50%      1.979000e+03
75%      2.005000e+03
max      2.005201e+07
Name: build_year, dtype: float64

### Missing Value

In [13]:
#missing value
missing_build_count = df_train[df_train['build_count_1921-1945'].isnull()]
missing_build_count.shape

(4991, 292)

In [14]:
missing_df = missing_build_count.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df.ix[missing_df['missing_count']>0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  This is separate from the ipykernel package so we can avoid doing imports until


In [52]:
missing_df.head()

Unnamed: 0,column_name,missing_count
3,life_sq,2698
4,floor,37
5,max_floor,1721
6,material,1721
7,build_year,3424


In [17]:
df_train['cafe_sum_500_max_price_avg'].describe()

count    17190.000000
mean      1247.023497
std        526.539159
min        500.000000
25%       1000.000000
50%       1166.670000
75%       1500.000000
max       6000.000000
Name: cafe_sum_500_max_price_avg, dtype: float64

In [37]:
df_train['state'].describe()

count    16912.000000
mean         2.107025
std          0.880148
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         33.000000
Name: state, dtype: float64

In [35]:
#Summary clean missing value methods
##method 1
### drop missing value >0.35 of total observations, keep state and build_year for training data
def drop_high_volumn_missing(df):
    not_move = ["build_year",'state']
    high_missing__feaure= df.columns[df.isnull().sum()>0.35*len(df)].tolist()
    for e in high_missing_train_feature:
        for g in not_move:
            if e==g:
                high_missing_train_feature.remove(e)
    return high_missing_train_feature

In [36]:
drop_high_volumn_missing(df_train)

['hospital_beds_raion',
 'cafe_sum_500_min_price_avg',
 'cafe_sum_500_max_price_avg',
 'cafe_avg_price_500']

In [48]:
## method 2 fill cafe_sum missing with 0; fill state with 0
def fill_missing_zero(df):
    df.filter(regex=r'^cafe_', axis=1).fillna(0, inplace=True)
    df['state'].fillna(0,inplace=True)
    return df

In [43]:
df_train.filter(regex=r'^cafe_', axis=1).head()

Unnamed: 0,cafe_count_500,cafe_sum_500_min_price_avg,cafe_sum_500_max_price_avg,cafe_avg_price_500,cafe_count_500_na_price,cafe_count_500_price_500,cafe_count_500_price_1000,cafe_count_500_price_1500,cafe_count_500_price_2500,cafe_count_500_price_4000,...,cafe_sum_5000_min_price_avg,cafe_sum_5000_max_price_avg,cafe_avg_price_5000,cafe_count_5000_na_price,cafe_count_5000_price_500,cafe_count_5000_price_1000,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high
0,0,,,,0,0,0,0,0,0,...,708.57,1185.71,947.14,12,39,48,40,9,4,0
1,5,860.0,1500.0,1180.0,0,1,3,0,0,1,...,673.81,1148.81,911.31,9,49,65,36,15,3,0
2,3,666.67,1166.67,916.67,0,0,2,1,0,0,...,702.68,1196.43,949.55,10,29,45,25,10,3,0
3,2,1000.0,1500.0,1250.0,0,0,0,2,0,0,...,931.58,1552.63,1242.11,4,7,21,15,11,2,1
4,48,702.22,1166.67,934.44,3,17,10,11,7,0,...,853.88,1411.45,1132.66,143,566,578,552,319,108,17


Cafe_count = 0 then cafe_*_price is missing

In [80]:
## drop build_ columns: drop na obs or drop columns
def drop_build_function(df):
    not_move = ["build_year"]
    build_feature = df.filter(regex=r'^build_count_', axis=1).columns.tolist()
    build_feature2 = df.filter(regex=r'^raion_build_count_', axis=1).columns.tolist()
    for e in build_feature:
        for g in not_move:
            if e==g:
                high_missing_train_feature.remove(e)
    build_feature.extend(build_feature2)
    return build_feature

In [81]:
drop_build_function(df_train)

['build_count_block',
 'build_count_wood',
 'build_count_frame',
 'build_count_brick',
 'build_count_monolith',
 'build_count_panel',
 'build_count_foam',
 'build_count_slag',
 'build_count_mix',
 'build_count_before_1920',
 'build_count_1921-1945',
 'build_count_1946-1970',
 'build_count_1971-1995',
 'build_count_after_1995',
 'raion_build_count_with_material_info',
 'raion_build_count_with_builddate_info']

### Multicollinearity

In [82]:
## clean multicolinearty using remove the minimum number of predictors to oensure that all pairwise correlations are
## below a certain threshold

def corr_df(x, corr_val):
    '''
    Obj: Drops features that are strongly correlated to other features.
          This lowers model complexity, and aids in generalizing the model.
    Inputs:
          df: features df (x)
          corr_val: Columns are dropped relative to the corr_val input (e.g. 0.8)
    Output: df that only includes uncorrelated features
    '''

    # Creates Correlation Matrix and Instantiates
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterates through Correlation Matrix Table to find correlated columns
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = item.values
            if val >= corr_val:
                # Prints the correlated feature set and the corr val
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(i)

    drops = sorted(set(drop_cols))[::-1]

    # Drops the correlated columns
    for i in drops:
        col = x.iloc[:, (i+1):(i+2)].columns.values
        df = x.drop(col, axis=1)

    return df

In [83]:
train_n_corr = corr_df(df_train,0.75)

('children_preschool', '|', 'raion_popul', '|', 0.95)
('preschool_education_centers_raion', '|', 'raion_popul', '|', 0.84)
('preschool_education_centers_raion', '|', 'children_preschool', '|', 0.85)
('children_school', '|', 'raion_popul', '|', 0.96)
('children_school', '|', 'children_preschool', '|', 0.99)
('children_school', '|', 'preschool_quota', '|', 0.81)
('school_quota', '|', 'raion_popul', '|', 0.76)
('school_quota', '|', 'children_preschool', '|', 0.85)
('school_quota', '|', 'preschool_quota', '|', 0.93)
('school_quota', '|', 'preschool_education_centers_raion', '|', 0.85)
('school_education_centers_raion', '|', 'raion_popul', '|', 0.83)
('school_education_centers_raion', '|', 'children_preschool', '|', 0.83)
('school_education_centers_raion', '|', 'preschool_education_centers_raion', '|', 0.96)
('school_education_centers_raion', '|', 'children_school', '|', 0.81)
('office_raion', '|', 'university_top_20_raion', '|', 0.76)
('office_raion', '|', 'culture_objects_top_25_raion', '

('0_13_female', '|', '0_17_female', '|', 1.0)
('0_13_female', '|', '0_13_all', '|', 1.0)
('build_count_wood', '|', 'raion_build_count_with_material_info', '|', 0.8)
('build_count_brick', '|', 'office_raion', '|', 0.76)
('build_count_panel', '|', 'raion_popul', '|', 0.85)
('build_count_panel', '|', 'children_preschool', '|', 0.84)
('build_count_panel', '|', 'preschool_quota', '|', 0.85)
('build_count_panel', '|', 'children_school', '|', 0.84)
('build_count_panel', '|', 'school_quota', '|', 0.83)
('build_count_panel', '|', 'young_all', '|', 0.84)
('build_count_panel', '|', 'young_male', '|', 0.85)
('build_count_panel', '|', 'young_female', '|', 0.84)
('build_count_panel', '|', 'work_all', '|', 0.87)
('build_count_panel', '|', 'work_male', '|', 0.86)
('build_count_panel', '|', 'work_female', '|', 0.87)
('build_count_panel', '|', '0_6_all', '|', 0.84)
('build_count_panel', '|', '0_6_male', '|', 0.84)
('build_count_panel', '|', '0_6_female', '|', 0.84)
('build_count_panel', '|', '7_14_all',

('basketball_km', '|', 'metro_min_avto', '|', 0.83)
('basketball_km', '|', 'metro_km_avto', '|', 0.87)
('basketball_km', '|', 'metro_min_walk', '|', 0.87)
('basketball_km', '|', 'metro_km_walk', '|', 0.87)
('basketball_km', '|', 'school_km', '|', 0.8)
('basketball_km', '|', 'park_km', '|', 0.86)
('basketball_km', '|', 'ttk_km', '|', 0.92)
('basketball_km', '|', 'sadovoe_km', '|', 0.9)
('basketball_km', '|', 'bulvar_ring_km', '|', 0.9)
('basketball_km', '|', 'kremlin_km', '|', 0.89)
('basketball_km', '|', 'zd_vokzaly_avto_km', '|', 0.91)
('basketball_km', '|', 'bus_terminal_avto_km', '|', 0.76)
('basketball_km', '|', 'oil_chemistry_km', '|', 0.78)
('basketball_km', '|', 'nuclear_reactor_km', '|', 0.84)
('basketball_km', '|', 'radiation_km', '|', 0.93)
('basketball_km', '|', 'power_transmission_line_km', '|', 0.86)
('basketball_km', '|', 'thermal_power_plant_km', '|', 0.92)
('basketball_km', '|', 'ts_km', '|', 0.86)
('basketball_km', '|', 'swim_pool_km', '|', 0.81)
('hospice_morgue_km', 

('church_count_500', '|', 'cafe_count_500_price_2500', '|', 0.9)
('church_count_500', '|', 'cafe_count_500_price_4000', '|', 0.81)
('green_part_1000', '|', 'green_part_500', '|', 0.77)
('office_count_1000', '|', 'culture_objects_top_25_raion', '|', 0.78)
('office_count_1000', '|', 'office_raion', '|', 0.9)
('office_count_1000', '|', 'office_count_500', '|', 0.92)
('office_count_1000', '|', 'cafe_count_500', '|', 0.89)
('office_count_1000', '|', 'cafe_count_500_na_price', '|', 0.78)
('office_count_1000', '|', 'cafe_count_500_price_500', '|', 0.86)
('office_count_1000', '|', 'cafe_count_500_price_1000', '|', 0.82)
('office_count_1000', '|', 'cafe_count_500_price_1500', '|', 0.87)
('office_count_1000', '|', 'cafe_count_500_price_2500', '|', 0.85)
('office_count_1000', '|', 'cafe_count_500_price_4000', '|', 0.79)
('office_count_1000', '|', 'big_church_count_500', '|', 0.78)
('office_count_1000', '|', 'church_count_500', '|', 0.78)
('office_sqm_1000', '|', 'office_raion', '|', 0.78)
('offic

('big_church_count_1000', '|', 'cafe_count_500_price_4000', '|', 0.83)
('big_church_count_1000', '|', 'big_church_count_500', '|', 0.93)
('big_church_count_1000', '|', 'church_count_500', '|', 0.91)
('big_church_count_1000', '|', 'office_count_1000', '|', 0.86)
('big_church_count_1000', '|', 'trc_count_1000', '|', 0.78)
('big_church_count_1000', '|', 'cafe_count_1000', '|', 0.93)
('big_church_count_1000', '|', 'cafe_count_1000_na_price', '|', 0.9)
('big_church_count_1000', '|', 'cafe_count_1000_price_500', '|', 0.93)
('big_church_count_1000', '|', 'cafe_count_1000_price_1000', '|', 0.9)
('big_church_count_1000', '|', 'cafe_count_1000_price_1500', '|', 0.93)
('big_church_count_1000', '|', 'cafe_count_1000_price_2500', '|', 0.92)
('big_church_count_1000', '|', 'cafe_count_1000_price_4000', '|', 0.91)
('church_count_1000', '|', 'culture_objects_top_25_raion', '|', 0.89)
('church_count_1000', '|', 'office_raion', '|', 0.88)
('church_count_1000', '|', 'cafe_count_500', '|', 0.91)
('church_c

('cafe_count_1500_na_price', '|', 'cafe_count_1000_na_price', '|', 0.97)
('cafe_count_1500_na_price', '|', 'cafe_count_1000_price_500', '|', 0.96)
('cafe_count_1500_na_price', '|', 'cafe_count_1000_price_1000', '|', 0.94)
('cafe_count_1500_na_price', '|', 'cafe_count_1000_price_1500', '|', 0.96)
('cafe_count_1500_na_price', '|', 'cafe_count_1000_price_2500', '|', 0.96)
('cafe_count_1500_na_price', '|', 'cafe_count_1000_price_4000', '|', 0.93)
('cafe_count_1500_na_price', '|', 'big_church_count_1000', '|', 0.92)
('cafe_count_1500_na_price', '|', 'church_count_1000', '|', 0.93)
('cafe_count_1500_na_price', '|', 'leisure_count_1000', '|', 0.9)
('cafe_count_1500_na_price', '|', 'office_count_1500', '|', 0.94)
('cafe_count_1500_na_price', '|', 'cafe_count_1500', '|', 0.98)
('cafe_count_1500_price_500', '|', 'culture_objects_top_25_raion', '|', 0.91)
('cafe_count_1500_price_500', '|', 'office_raion', '|', 0.91)
('cafe_count_1500_price_500', '|', 'office_count_500', '|', 0.81)
('cafe_count_15

('cafe_count_1500_price_4000', '|', 'cafe_count_1000_price_4000', '|', 0.99)
('cafe_count_1500_price_4000', '|', 'big_church_count_1000', '|', 0.91)
('cafe_count_1500_price_4000', '|', 'church_count_1000', '|', 0.94)
('cafe_count_1500_price_4000', '|', 'leisure_count_1000', '|', 0.91)
('cafe_count_1500_price_4000', '|', 'office_count_1500', '|', 0.9)
('cafe_count_1500_price_4000', '|', 'cafe_count_1500', '|', 0.98)
('cafe_count_1500_price_4000', '|', 'cafe_count_1500_na_price', '|', 0.95)
('cafe_count_1500_price_4000', '|', 'cafe_count_1500_price_500', '|', 0.97)
('cafe_count_1500_price_4000', '|', 'cafe_count_1500_price_1000', '|', 0.95)
('cafe_count_1500_price_4000', '|', 'cafe_count_1500_price_1500', '|', 0.97)
('cafe_count_1500_price_high', '|', 'culture_objects_top_25_raion', '|', 0.81)
('cafe_count_1500_price_high', '|', 'office_raion', '|', 0.83)
('cafe_count_1500_price_high', '|', 'office_count_500', '|', 0.77)
('cafe_count_1500_price_high', '|', 'cafe_count_500', '|', 0.87)
('

('office_count_2000', '|', 'office_count_500', '|', 0.87)
('office_count_2000', '|', 'cafe_count_500', '|', 0.93)
('office_count_2000', '|', 'cafe_count_500_na_price', '|', 0.85)
('office_count_2000', '|', 'cafe_count_500_price_500', '|', 0.9)
('office_count_2000', '|', 'cafe_count_500_price_1000', '|', 0.82)
('office_count_2000', '|', 'cafe_count_500_price_1500', '|', 0.89)
('office_count_2000', '|', 'cafe_count_500_price_2500', '|', 0.91)
('office_count_2000', '|', 'cafe_count_500_price_4000', '|', 0.84)
('office_count_2000', '|', 'big_church_count_500', '|', 0.85)
('office_count_2000', '|', 'church_count_500', '|', 0.86)
('office_count_2000', '|', 'office_count_1000', '|', 0.96)
('office_count_2000', '|', 'office_sqm_1000', '|', 0.82)
('office_count_2000', '|', 'cafe_count_1000', '|', 0.95)
('office_count_2000', '|', 'cafe_count_1000_na_price', '|', 0.92)
('office_count_2000', '|', 'cafe_count_1000_price_500', '|', 0.94)
('office_count_2000', '|', 'cafe_count_1000_price_1000', '|', 

('cafe_count_2000_price_2500', '|', 'office_count_500', '|', 0.82)
('cafe_count_2000_price_2500', '|', 'cafe_count_500', '|', 0.95)
('cafe_count_2000_price_2500', '|', 'cafe_count_500_na_price', '|', 0.9)
('cafe_count_2000_price_2500', '|', 'cafe_count_500_price_500', '|', 0.92)
('cafe_count_2000_price_2500', '|', 'cafe_count_500_price_1000', '|', 0.81)
('cafe_count_2000_price_2500', '|', 'cafe_count_500_price_1500', '|', 0.9)
('cafe_count_2000_price_2500', '|', 'cafe_count_500_price_2500', '|', 0.95)
('cafe_count_2000_price_2500', '|', 'cafe_count_500_price_4000', '|', 0.89)
('cafe_count_2000_price_2500', '|', 'cafe_count_500_price_high', '|', 0.8)
('cafe_count_2000_price_2500', '|', 'big_church_count_500', '|', 0.9)
('cafe_count_2000_price_2500', '|', 'church_count_500', '|', 0.91)
('cafe_count_2000_price_2500', '|', 'office_count_1000', '|', 0.92)
('cafe_count_2000_price_2500', '|', 'office_sqm_1000', '|', 0.76)
('cafe_count_2000_price_2500', '|', 'trc_count_1000', '|', 0.76)
('cafe

('church_count_2000', '|', 'culture_objects_top_25_raion', '|', 0.88)
('church_count_2000', '|', 'office_raion', '|', 0.9)
('church_count_2000', '|', 'office_count_500', '|', 0.79)
('church_count_2000', '|', 'cafe_count_500', '|', 0.94)
('church_count_2000', '|', 'cafe_count_500_na_price', '|', 0.91)
('church_count_2000', '|', 'cafe_count_500_price_500', '|', 0.91)
('church_count_2000', '|', 'cafe_count_500_price_1000', '|', 0.79)
('church_count_2000', '|', 'cafe_count_500_price_1500', '|', 0.88)
('church_count_2000', '|', 'cafe_count_500_price_2500', '|', 0.93)
('church_count_2000', '|', 'cafe_count_500_price_4000', '|', 0.85)
('church_count_2000', '|', 'cafe_count_500_price_high', '|', 0.75)
('church_count_2000', '|', 'big_church_count_500', '|', 0.92)
('church_count_2000', '|', 'church_count_500', '|', 0.94)
('church_count_2000', '|', 'office_count_1000', '|', 0.89)
('church_count_2000', '|', 'trc_count_1000', '|', 0.79)
('church_count_2000', '|', 'cafe_count_1000', '|', 0.97)
('chu

('office_sqm_3000', '|', 'cafe_count_1000_na_price', '|', 0.78)
('office_sqm_3000', '|', 'cafe_count_1000_price_500', '|', 0.77)
('office_sqm_3000', '|', 'cafe_count_1000_price_1000', '|', 0.81)
('office_sqm_3000', '|', 'cafe_count_1000_price_1500', '|', 0.79)
('office_sqm_3000', '|', 'cafe_count_1000_price_2500', '|', 0.77)
('office_sqm_3000', '|', 'big_church_count_1000', '|', 0.77)
('office_sqm_3000', '|', 'office_count_1500', '|', 0.89)
('office_sqm_3000', '|', 'office_sqm_1500', '|', 0.89)
('office_sqm_3000', '|', 'cafe_count_1500', '|', 0.82)
('office_sqm_3000', '|', 'cafe_count_1500_na_price', '|', 0.82)
('office_sqm_3000', '|', 'cafe_count_1500_price_500', '|', 0.81)
('office_sqm_3000', '|', 'cafe_count_1500_price_1000', '|', 0.82)
('office_sqm_3000', '|', 'cafe_count_1500_price_1500', '|', 0.81)
('office_sqm_3000', '|', 'cafe_count_1500_price_2500', '|', 0.81)
('office_sqm_3000', '|', 'cafe_count_1500_price_4000', '|', 0.76)
('office_sqm_3000', '|', 'cafe_count_1500_price_high

('cafe_count_3000_price_1500', '|', 'office_count_500', '|', 0.84)
('cafe_count_3000_price_1500', '|', 'cafe_count_500', '|', 0.94)
('cafe_count_3000_price_1500', '|', 'cafe_count_500_na_price', '|', 0.88)
('cafe_count_3000_price_1500', '|', 'cafe_count_500_price_500', '|', 0.91)
('cafe_count_3000_price_1500', '|', 'cafe_count_500_price_1000', '|', 0.82)
('cafe_count_3000_price_1500', '|', 'cafe_count_500_price_1500', '|', 0.9)
('cafe_count_3000_price_1500', '|', 'cafe_count_500_price_2500', '|', 0.93)
('cafe_count_3000_price_1500', '|', 'cafe_count_500_price_4000', '|', 0.86)
('cafe_count_3000_price_1500', '|', 'cafe_count_500_price_high', '|', 0.78)
('cafe_count_3000_price_1500', '|', 'big_church_count_500', '|', 0.88)
('cafe_count_3000_price_1500', '|', 'church_count_500', '|', 0.89)
('cafe_count_3000_price_1500', '|', 'office_count_1000', '|', 0.94)
('cafe_count_3000_price_1500', '|', 'office_sqm_1000', '|', 0.78)
('cafe_count_3000_price_1500', '|', 'trc_count_1000', '|', 0.76)
('c

('cafe_count_3000_price_high', '|', 'church_count_1000', '|', 0.82)
('cafe_count_3000_price_high', '|', 'leisure_count_1000', '|', 0.82)
('cafe_count_3000_price_high', '|', 'office_count_1500', '|', 0.91)
('cafe_count_3000_price_high', '|', 'office_sqm_1500', '|', 0.79)
('cafe_count_3000_price_high', '|', 'cafe_count_1500', '|', 0.91)
('cafe_count_3000_price_high', '|', 'cafe_count_1500_na_price', '|', 0.89)
('cafe_count_3000_price_high', '|', 'cafe_count_1500_price_500', '|', 0.9)
('cafe_count_3000_price_high', '|', 'cafe_count_1500_price_1000', '|', 0.9)
('cafe_count_3000_price_high', '|', 'cafe_count_1500_price_1500', '|', 0.9)
('cafe_count_3000_price_high', '|', 'cafe_count_1500_price_2500', '|', 0.92)
('cafe_count_3000_price_high', '|', 'cafe_count_1500_price_4000', '|', 0.89)
('cafe_count_3000_price_high', '|', 'cafe_count_1500_price_high', '|', 0.94)
('cafe_count_3000_price_high', '|', 'big_church_count_1500', '|', 0.83)
('cafe_count_3000_price_high', '|', 'church_count_1500', '

('leisure_count_3000', '|', 'culture_objects_top_25_raion', '|', 0.89)
('leisure_count_3000', '|', 'office_raion', '|', 0.92)
('leisure_count_3000', '|', 'office_count_500', '|', 0.82)
('leisure_count_3000', '|', 'cafe_count_500', '|', 0.94)
('leisure_count_3000', '|', 'cafe_count_500_na_price', '|', 0.89)
('leisure_count_3000', '|', 'cafe_count_500_price_500', '|', 0.91)
('leisure_count_3000', '|', 'cafe_count_500_price_1000', '|', 0.81)
('leisure_count_3000', '|', 'cafe_count_500_price_1500', '|', 0.89)
('leisure_count_3000', '|', 'cafe_count_500_price_2500', '|', 0.94)
('leisure_count_3000', '|', 'cafe_count_500_price_4000', '|', 0.87)
('leisure_count_3000', '|', 'cafe_count_500_price_high', '|', 0.79)
('leisure_count_3000', '|', 'big_church_count_500', '|', 0.9)
('leisure_count_3000', '|', 'church_count_500', '|', 0.91)
('leisure_count_3000', '|', 'office_count_1000', '|', 0.92)
('leisure_count_3000', '|', 'trc_count_1000', '|', 0.76)
('leisure_count_3000', '|', 'cafe_count_1000', 

('trc_sqm_5000', '|', 'sport_count_2000', '|', 0.78)
('trc_sqm_5000', '|', 'trc_count_3000', '|', 0.79)
('trc_sqm_5000', '|', 'trc_sqm_3000', '|', 0.83)
('trc_sqm_5000', '|', 'sport_count_3000', '|', 0.81)
('cafe_count_5000', '|', 'office_raion', '|', 0.9)
('cafe_count_5000', '|', 'office_count_500', '|', 0.83)
('cafe_count_5000', '|', 'cafe_count_500', '|', 0.84)
('cafe_count_5000', '|', 'cafe_count_500_price_500', '|', 0.8)
('cafe_count_5000', '|', 'cafe_count_500_price_1000', '|', 0.77)
('cafe_count_5000', '|', 'cafe_count_500_price_1500', '|', 0.82)
('cafe_count_5000', '|', 'cafe_count_500_price_2500', '|', 0.8)
('cafe_count_5000', '|', 'office_count_1000', '|', 0.91)
('cafe_count_5000', '|', 'office_sqm_1000', '|', 0.81)
('cafe_count_5000', '|', 'cafe_count_1000', '|', 0.85)
('cafe_count_5000', '|', 'cafe_count_1000_na_price', '|', 0.83)
('cafe_count_5000', '|', 'cafe_count_1000_price_500', '|', 0.82)
('cafe_count_5000', '|', 'cafe_count_1000_price_1000', '|', 0.87)
('cafe_count_5

('cafe_count_5000_price_1000', '|', 'leisure_count_3000', '|', 0.88)
('cafe_count_5000_price_1000', '|', 'sport_count_3000', '|', 0.87)
('cafe_count_5000_price_1000', '|', 'office_count_5000', '|', 0.99)
('cafe_count_5000_price_1000', '|', 'office_sqm_5000', '|', 0.96)
('cafe_count_5000_price_1000', '|', 'trc_count_5000', '|', 0.83)
('cafe_count_5000_price_1000', '|', 'cafe_count_5000', '|', 1.0)
('cafe_count_5000_price_1000', '|', 'cafe_count_5000_na_price', '|', 0.99)
('cafe_count_5000_price_1500', '|', 'office_raion', '|', 0.9)
('cafe_count_5000_price_1500', '|', 'office_count_500', '|', 0.83)
('cafe_count_5000_price_1500', '|', 'cafe_count_500', '|', 0.84)
('cafe_count_5000_price_1500', '|', 'cafe_count_500_price_500', '|', 0.8)
('cafe_count_5000_price_1500', '|', 'cafe_count_500_price_1000', '|', 0.78)
('cafe_count_5000_price_1500', '|', 'cafe_count_500_price_1500', '|', 0.83)
('cafe_count_5000_price_1500', '|', 'cafe_count_500_price_2500', '|', 0.8)
('cafe_count_5000_price_1500',

('cafe_count_5000_price_4000', '|', 'cafe_count_5000_price_1500', '|', 0.98)
('cafe_count_5000_price_high', '|', 'office_raion', '|', 0.87)
('cafe_count_5000_price_high', '|', 'office_count_500', '|', 0.81)
('cafe_count_5000_price_high', '|', 'cafe_count_500', '|', 0.81)
('cafe_count_5000_price_high', '|', 'cafe_count_500_price_500', '|', 0.77)
('cafe_count_5000_price_high', '|', 'cafe_count_500_price_1500', '|', 0.78)
('cafe_count_5000_price_high', '|', 'cafe_count_500_price_2500', '|', 0.78)
('cafe_count_5000_price_high', '|', 'office_count_1000', '|', 0.88)
('cafe_count_5000_price_high', '|', 'office_sqm_1000', '|', 0.79)
('cafe_count_5000_price_high', '|', 'cafe_count_1000', '|', 0.82)
('cafe_count_5000_price_high', '|', 'cafe_count_1000_na_price', '|', 0.8)
('cafe_count_5000_price_high', '|', 'cafe_count_1000_price_500', '|', 0.8)
('cafe_count_5000_price_high', '|', 'cafe_count_1000_price_1000', '|', 0.83)
('cafe_count_5000_price_high', '|', 'cafe_count_1000_price_1500', '|', 0.82

('leisure_count_5000', '|', 'office_raion', '|', 0.9)
('leisure_count_5000', '|', 'office_count_500', '|', 0.83)
('leisure_count_5000', '|', 'cafe_count_500', '|', 0.84)
('leisure_count_5000', '|', 'cafe_count_500_price_500', '|', 0.8)
('leisure_count_5000', '|', 'cafe_count_500_price_1000', '|', 0.77)
('leisure_count_5000', '|', 'cafe_count_500_price_1500', '|', 0.82)
('leisure_count_5000', '|', 'cafe_count_500_price_2500', '|', 0.81)
('leisure_count_5000', '|', 'office_count_1000', '|', 0.91)
('leisure_count_5000', '|', 'office_sqm_1000', '|', 0.8)
('leisure_count_5000', '|', 'cafe_count_1000', '|', 0.85)
('leisure_count_5000', '|', 'cafe_count_1000_na_price', '|', 0.83)
('leisure_count_5000', '|', 'cafe_count_1000_price_500', '|', 0.83)
('leisure_count_5000', '|', 'cafe_count_1000_price_1000', '|', 0.86)
('leisure_count_5000', '|', 'cafe_count_1000_price_1500', '|', 0.85)
('leisure_count_5000', '|', 'cafe_count_1000_price_2500', '|', 0.83)
('leisure_count_5000', '|', 'cafe_count_100

In [85]:
train_n_corr.shape

(30471, 291)