<h1 style='text-align: center'>Raifhack-DS-2021-Fall 👽</h1>

<p  style='text-align: center'>
This notebook is in <span style='color: green; font-weight: 700'>Active</span> state of development! Check back for updates, as I add new material as often as I learn it!
<a style='font-weight:700' href='https://github.com/LilDataScientist'> Code on GitHub! </a></p>

<div style='text-align: center'>
    <img src='https://i.postimg.cc/HLF8SsJK/B041-Bank-of-the-West.jpg' width='700' />
</div>

In [692]:
import numpy as np 
import pandas as pd


# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Pipelines
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Helpers
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, make_scorer, mean_absolute_error


sns.set_theme()

In [693]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/raifhackds2021fall/data/train.csv')

<h1 style='background-color: #dae8fc; border: 1px solid #94add0; padding: 10px; font-weight: 400; text-align:center'>Quick view</h1>

In [694]:
# Show the loaded training data.
df

<h1 style='background-color: #dae8fc; border: 1px solid #94add0; padding: 10px; font-weight: 400; text-align:center'>Describe data</h1>

In [695]:
# Summary statistics of the loaded frame (pandas' default `describe` output).
df.describe()

<h1 style='background-color: #dae8fc; border: 1px solid #94add0; padding: 10px; font-weight: 400; text-align:center'>Missing values</h1>

In [696]:
def print_missing_values(df):
    """Summarise the columns of ``df`` that contain missing values.

    Despite its name, this function prints nothing: it returns the summary so
    the notebook can render it as a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one null, with the
        columns ``'dtype'``, ``'Feature'``, ``'Number Of Missing Values'`` and
        ``'Percentage of Missing values'`` (percentage of all rows, rounded to
        two decimals). Empty, but with the same columns, when nothing is missing.
    """
    summary_columns = ['dtype', 'Feature', 'Number Of Missing Values', 'Percentage of Missing values']
    n_rows = len(df.index)

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for col in df:
        n_missing = df[col].isnull().sum()
        if n_missing == 0:
            continue
        records.append({
            'dtype': df[col].dtype,
            'Feature': col,
            'Number Of Missing Values': n_missing,
            'Percentage of Missing values': round(n_missing / n_rows * 100, 2),
        })

    return pd.DataFrame(records, columns=summary_columns)

In [697]:
# List the columns of the loaded data that contain missing values.
print_missing_values(df)

<h1 style='background-color: #dae8fc; border: 1px solid #94add0; padding: 10px; font-weight: 400; text-align:center'>Unique values</h1>

In [698]:
def print_unique_values(df):
    """Summarise the distinct values of every column of ``df``.

    Despite its name, this function prints nothing: it returns the summary so
    the notebook can render it as a table.

    Side effect: sets the global pandas option ``display.max_rows`` to the
    number of rows of the summary so that the whole table is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``'Feature'``,
        ``'Number of values'`` (count of distinct values, nulls included) and
        ``'Values'`` (the array returned by ``Series.unique()``).
    """
    summary_columns = ['Feature', 'Number of values', 'Values']

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for col in df:
        distinct = df[col].unique()  # computed once and reused for count and listing
        records.append({
            'Feature': col,
            'Number of values': distinct.shape[0],
            'Values': distinct,
        })

    unique_values_df = pd.DataFrame(records, columns=summary_columns)

    pd.set_option('display.max_rows', len(unique_values_df))
    return unique_values_df

In [699]:
# List every column with its number of distinct values and the values themselves.
print_unique_values(df)

In [700]:
# Remove the identifier and raw coordinate columns from the working frame (in place).
df.drop(columns=['id', 'lat', 'lng'], inplace=True)

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Location [Feature engineering]</h1>

In [701]:
# Encode `region` as three mutually exclusive 0/1 indicator columns.
# Any other (or missing) region falls into `is_region`.
in_moscow = df['region'] == 'Москва'
in_moscow_oblast = df['region'] == 'Московская область'

is_moscow = in_moscow.astype(int)
is_moscow_oblast = in_moscow_oblast.astype(int)
is_region = (~(in_moscow | in_moscow_oblast)).astype(int)

df['is_moscow'] = is_moscow
df['is_moscow_oblast'] = is_moscow_oblast
df['is_region'] = is_region

# The raw column is replaced by the indicators above.
df = df.drop(columns=['region'])

In [702]:
# Number of rows carrying each location indicator.
for label, column in [('Moscow', 'is_moscow'), ('Moscow oblast', 'is_moscow_oblast'), ('Region', 'is_region')]:
    print(f"{label}: {int((df[column] == 1).sum())}")

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Cities with &gt; 1M people [Feature engineering]</h1>

In [703]:
# Cities treated as having more than one million inhabitants.
cities_1m_people = ['Москва', 'Санкт-Петербург', 'Новосибирск', 'Екатеринбург', 'Казань', 'Нижний Новгород', 'Челябинск', 'Самара', 'Омск',
                   'Ростов-на-Дону', 'Уфа', 'Красноярск', 'Воронеж', 'Пермь', 'Волгоград']

# 1 when the row's city is in the list above, else 0 (a missing city counts as "not million").
is_million = df['city'].isin(cities_1m_people).astype(int)
# Exact complement of `is_million`, kept as its own column.
is_not_million = 1 - is_million

df['is_million'] = is_million
df['is_not_million'] = is_not_million

df = df.drop(columns=['city'])

In [704]:
# Number of rows in each of the two city-size groups.
for column in ['is_million', 'is_not_million']:
    print(f"{column}: {int((df[column] == 1).sum())}")

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>District rate [Feature engineering]</h1>

In [705]:
# Map each `realty_type` code to its district rate.
# NOTE(review): the rates are not monotonic in the code (110 -> 1, 100 -> 10, 10 -> 0);
# confirm this is intentional.
realty_type_to_rate = {110: 1, 100: 10, 10: 0}

district_rate = df['realty_type'].map(realty_type_to_rate)

# A code outside the mapping used to surface as an opaque length-mismatch error;
# fail with a message that names the offending values instead.
unmapped = district_rate.isna()
if unmapped.any():
    raise ValueError(
        'Unexpected realty_type values: '
        + str(df.loc[unmapped, 'realty_type'].unique().tolist())
    )

district_rate = district_rate.astype(int)
df['district_rate'] = district_rate

df = df.drop(columns=['realty_type'])

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Offices [Feature engineering]</h1>

In [706]:
# Per-column thresholds: a row is flagged as having "many offices" when the
# count in at least one of these columns exceeds its threshold.
office_thresholds = {
    'osm_offices_points_in_0.001': 2,
    'osm_offices_points_in_0.005': 10,
    'osm_offices_points_in_0.0075': 15,
    'osm_offices_points_in_0.01': 20,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_offices = (
    df[list(office_thresholds)]
    .gt(pd.Series(office_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_offices'] = many_offices

df = df.drop(columns=list(office_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Food [Feature engineering]</h1>

In [707]:
# Per-column thresholds: a row is flagged as having "many food places" when the
# catering count in at least one of these columns exceeds its threshold.
catering_thresholds = {
    'osm_catering_points_in_0.001': 2,
    'osm_catering_points_in_0.005': 10,
    'osm_catering_points_in_0.0075': 15,
    'osm_catering_points_in_0.01': 20,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_food = (
    df[list(catering_thresholds)]
    .gt(pd.Series(catering_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_food'] = many_food

df = df.drop(columns=list(catering_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Shops [Feature engineering]</h1>

In [708]:
# Per-column thresholds: a row is flagged as having "many shops" when the
# count in at least one of these columns exceeds its threshold.
shop_thresholds = {
    'osm_shops_points_in_0.001': 2,
    'osm_shops_points_in_0.005': 10,
    'osm_shops_points_in_0.0075': 15,
    'osm_shops_points_in_0.01': 20,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_shops = (
    df[list(shop_thresholds)]
    .gt(pd.Series(shop_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_shops'] = many_shops

df = df.drop(columns=list(shop_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Financial Organizations [Feature engineering]</h1>

In [709]:
# Per-column thresholds: a row is flagged as having "many financial organizations"
# when the count in at least one of these columns exceeds its threshold.
finance_thresholds = {
    'osm_finance_points_in_0.001': 2,
    'osm_finance_points_in_0.005': 10,
    'osm_finance_points_in_0.0075': 15,
    'osm_finance_points_in_0.01': 20,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_financial_organizations = (
    df[list(finance_thresholds)]
    .gt(pd.Series(finance_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_financial_organizations'] = many_financial_organizations

df = df.drop(columns=list(finance_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Medicine [Feature engineering]</h1>

In [710]:
# Per-column thresholds: a row is flagged as having "many medicine" objects when
# the healthcare count in at least one of these columns exceeds its threshold.
healthcare_thresholds = {
    'osm_healthcare_points_in_0.005': 1,
    'osm_healthcare_points_in_0.0075': 1,
    'osm_healthcare_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_medicine = (
    df[list(healthcare_thresholds)]
    .gt(pd.Series(healthcare_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_medicine'] = many_medicine

df = df.drop(columns=list(healthcare_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Entertainment [Feature engineering]</h1>

In [711]:
# Per-column thresholds: a row is flagged as having "many entertainment" objects
# when the leisure count in at least one of these columns exceeds its threshold.
# NOTE(review): the 0.0075 threshold (3) is higher than the 0.01 threshold (1);
# confirm this is intentional.
leisure_thresholds = {
    'osm_leisure_points_in_0.005': 1,
    'osm_leisure_points_in_0.0075': 3,
    'osm_leisure_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_entertainment = (
    df[list(leisure_thresholds)]
    .gt(pd.Series(leisure_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_entertainment'] = many_entertainment

df = df.drop(columns=list(leisure_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Historical objects [Feature engineering]</h1>

In [712]:
# Per-column thresholds: a row is flagged as having "many historical objects"
# when the count in at least one of these columns exceeds its threshold.
# NOTE(review): the 0.0075 threshold (3) is higher than the 0.01 threshold (1);
# confirm this is intentional.
historic_thresholds = {
    'osm_historic_points_in_0.005': 1,
    'osm_historic_points_in_0.0075': 3,
    'osm_historic_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_historical_objects = (
    df[list(historic_thresholds)]
    .gt(pd.Series(historic_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_historical_objects'] = many_historical_objects

df = df.drop(columns=list(historic_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Buildings [Feature engineering]</h1>

In [713]:
# Per-column thresholds: a row is flagged as having "many buildings" when the
# count in at least one of these columns exceeds its threshold.
building_thresholds = {
    'osm_building_points_in_0.001': 2,
    'osm_building_points_in_0.005': 5,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_buildings = (
    df[list(building_thresholds)]
    .gt(pd.Series(building_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_buildings'] = many_buildings

df = df.drop(columns=list(building_thresholds))

In [714]:
# Re-inspect the distinct values of the columns currently in the frame.
print_unique_values(df)

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Hotels [Feature engineering]</h1>

In [715]:
# Per-column thresholds: a row is flagged as having "many hotels" when the
# count in at least one of these columns exceeds its threshold.
hotel_thresholds = {
    'osm_hotels_points_in_0.005': 1,
    'osm_hotels_points_in_0.0075': 1,
    'osm_hotels_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_hotels = (
    df[list(hotel_thresholds)]
    .gt(pd.Series(hotel_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_hotels'] = many_hotels

df = df.drop(columns=list(hotel_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Station score [Feature engineering]</h1>

In [716]:
# Bucket the distance to the closest train stop into a score:
#   distance > 4       -> 0
#   1 < distance <= 4  -> 5
#   otherwise          -> 1   (distance <= 1, or missing)
# NOTE(review): the closest bucket scores lower (1) than the middle one (5);
# confirm this is intentional.
closest_stop_dist = df['osm_train_stop_closest_dist']

# np.select picks the first matching condition, so the second test only
# applies to rows that are not already "> 4".
station_rate = np.select(
    [closest_stop_dist > 4, closest_stop_dist > 1],
    [0, 5],
    default=1,
)
df['station_rate'] = station_rate

df = df.drop(columns=['osm_train_stop_closest_dist'])

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Stations [Feature engineering]</h1>

In [717]:
# Per-column thresholds: a row is flagged as having "many stations" when the
# train-stop count in at least one of these columns exceeds its threshold.
train_stop_thresholds = {
    'osm_train_stop_points_in_0.005': 1,
    'osm_train_stop_points_in_0.0075': 1,
    'osm_train_stop_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_stations = (
    df[list(train_stop_thresholds)]
    .gt(pd.Series(train_stop_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_stations'] = many_stations

df = df.drop(columns=list(train_stop_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Land transport [Feature engineering]</h1>

In [718]:
# Per-column thresholds: a row is flagged as having "many land transport" stops
# when the count in at least one of these columns exceeds its threshold.
transport_stop_thresholds = {
    'osm_transport_stop_points_in_0.005': 1,
    'osm_transport_stop_points_in_0.0075': 1,
    'osm_transport_stop_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_land_transport = (
    df[list(transport_stop_thresholds)]
    .gt(pd.Series(transport_stop_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_land_transport'] = many_land_transport

df = df.drop(columns=list(transport_stop_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Pedestrian crossings [Feature engineering]</h1>

In [719]:
# Per-column thresholds: a row is flagged as having "many pedestrian crossings"
# when the count in at least one of these columns exceeds its threshold.
# The 0.01 column previously had no comparison at all, so any non-zero (or
# missing) count set the flag; it now uses the same "> 1" test as its siblings.
crossing_thresholds = {
    'osm_crossing_points_in_0.001': 1,
    'osm_crossing_points_in_0.005': 1,
    'osm_crossing_points_in_0.0075': 1,
    'osm_crossing_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_pedestrian_crossings = (
    df[list(crossing_thresholds)]
    .gt(pd.Series(crossing_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_pedestrian_crossings'] = many_pedestrian_crossings

df = df.drop(columns=list(crossing_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Culture objects [Feature engineering]</h1>

In [720]:
# Per-column thresholds: a row is flagged as having "many culture objects" when
# the count in at least one of these columns exceeds its threshold.
culture_thresholds = {
    'osm_culture_points_in_0.001': 1,
    'osm_culture_points_in_0.005': 1,
    'osm_culture_points_in_0.0075': 1,
    'osm_culture_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_culture_objects = (
    df[list(culture_thresholds)]
    .gt(pd.Series(culture_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_culture_objects'] = many_culture_objects

df = df.drop(columns=list(culture_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Comfort objects [Feature engineering]</h1>

In [721]:
# Per-column thresholds: a row is flagged as having "many comfort objects" when
# the amenity count in at least one of these columns exceeds its threshold.
amenity_thresholds = {
    'osm_amenity_points_in_0.001': 1,
    'osm_amenity_points_in_0.005': 1,
    'osm_amenity_points_in_0.0075': 1,
    'osm_amenity_points_in_0.01': 1,
}

# Column-wise comparison, OR-ed across columns; missing counts compare as False.
many_comfort_objects = (
    df[list(amenity_thresholds)]
    .gt(pd.Series(amenity_thresholds))
    .any(axis=1)
    .astype(int)
)
df['many_comfort_objects'] = many_comfort_objects

df = df.drop(columns=list(amenity_thresholds))

<h1 style='background-color: #defcdc; border: 1px solid #a1d194; padding: 10px; font-weight: 400; text-align:center'>Quick Summary</h1>

In [722]:
# Overview of the distinct values of every column currently in the frame.
print_unique_values(df)

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>Analysis</h1>

In [723]:
# Columns of the current frame that contain missing values.
print_missing_values(df)

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>Filling NA's</h1>

In [724]:
# Population features: replace missing values with the column median.
for col in ['reform_house_population_500', 'reform_house_population_1000']:
    df[col] = df[col].fillna(df[col].median())

# Mean floor count: apply log(1 + x) first (vectorized), then fill the gaps
# with the mean of the transformed values.
df['reform_mean_floor_count_1000'] = np.log1p(df['reform_mean_floor_count_1000'])
df['reform_mean_floor_count_1000'] = df['reform_mean_floor_count_1000'].fillna(df['reform_mean_floor_count_1000'].mean())

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>'osm_city_nearest_population' missing values = 100%</h1>

In [725]:
# Missing values within the rows where 'osm_city_nearest_population' is null.
print_missing_values(df[df['osm_city_nearest_population'].isnull()])

#### Summary
In these rows the other features also contain many missing values, so we will delete all of these rows.

In [726]:
# Keep only the rows that have a value for 'osm_city_nearest_population'.
df = df.dropna(subset=['osm_city_nearest_population'])

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>'street' missing values = 100%</h1>

In [727]:
# Missing values within the rows where 'street' is null.
print_missing_values(df[df['street'].isnull()])

#### Summary
In these rows the other features also contain many missing values, so we will delete all of these rows.

In [728]:
# Keep only the rows that have a value for 'street'. The explicit copy makes
# the result an independent frame, so later column assignments do not raise
# SettingWithCopyWarning. The original row labels are kept (index not reset).
df = df[df['street'].notna()].copy()

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>'floor' missing values = 100%</h1>

In [729]:
# Missing values within the rows where 'floor' is null.
print_missing_values(df[df['floor'].isnull()])

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>'reform_mean_floor_count_500' missing values = 100%</h1>

In [730]:
# Missing values within the rows where 'reform_mean_floor_count_500' is null.
print_missing_values(df[df['reform_mean_floor_count_500'].isnull()])

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>'reform_mean_year_building_1000' missing values = 100%</h1>

In [731]:
# Missing values within the rows where 'reform_mean_year_building_1000' is null.
print_missing_values(df[df['reform_mean_year_building_1000'].isnull()])

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>'reform_mean_year_building_500' missing values = 100%</h1>

In [732]:
# Missing values within the rows where 'reform_mean_year_building_500' is null.
print_missing_values(df[df['reform_mean_year_building_500'].isnull()])

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>Summary</h1>

In [733]:
# Columns that still contain missing values after the row filtering above.
print_missing_values(df)

<h1 style='background-color: #fecfffcd; border: 1px solid #d194cf; padding: 10px; font-weight: 400; text-align:center'>Fill 'reform_mean_floor_count_500' and 'reform_mean_year_building_*' NA's with medians</h1>

In [734]:
# Replace the missing values of these columns with each column's median.
# A single fillna with a column -> value mapping returns a new frame, which
# avoids assigning into a filtered slice (SettingWithCopyWarning).
median_filled_cols = [
    'reform_mean_floor_count_500',
    'reform_mean_year_building_1000',
    'reform_mean_year_building_500',
]
df = df.fillna({col: df[col].median() for col in median_filled_cols})

In [735]:
# Check which columns, if any, still contain missing values after the fills.
print_missing_values(df)

# Pipeline

In [736]:
# numerical_cols = ['osm_city_nearest_population', 'reform_house_population_1000', 'reform_house_population_500',
#                  'reform_mean_floor_count_1000', 'reform_mean_floor_count_500', 'reform_mean_year_building_1000',
#                  'reform_mean_year_building_500']

# categorial_cols = ['floor'] # street

# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median'))
# ])

# categorial_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(sparse=False, handle_unknown = 'ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('numerical', numerical_transformer, numerical_cols),
#         ('categorial', categorial_transformer, categorial_cols)
#     ])

# pipeline = Pipeline(steps=[
#     ('preprocess', preprocessor),
# ])

# Split data

In [737]:
# X_cols = numerical_cols + categorial_cols

# y_cols = ['per_square_meter_price']

# X, y = df[X_cols][:100], df[y_cols][:100]

# pipeline.fit(X);

In [738]:
# X_transformed = pipeline.transform(X)

# X_transformed

# Search for the best params

In [739]:
# lg = LinearRegression()

# lg.fit(X_transformed, y)

# lg.score(X_transformed, y)

In [740]:
# forest = RandomForestRegressor()

# forest.fit(X_transformed, y)

In [741]:
# importance = forest.feature_importances_
# # summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# plt.bar([x for x in range(len(importance))], importance)
# plt.show()