import flats as flats
<h1>Flat prices analysis in St. Petersburg</h1>

<div style="background:#abd5f5; border:1px solid #b3deff; padding: 20px">
    <h2 style="color:#002b63">Table of contents</h2>
<ul>
    <li>Introduction</li>
    <li>Data Preprocessing</li>
    <li>Visualization</li>
    <li>Model building</li>
    <li>Results</li>
    <li>Conclusion</li>
</ul>
    </div>

<h2>Introduction</h2>

This analysis focuses on apartment prices in St. Petersburg.

<h2>Data Description</h2>

I am going to use my own CSV file, which I created using data from https://spb.cian.ru/kupit-kvartiru/. This file contains a list of St. Petersburg apartments for sale.

In [1]:
import pandas as pd
import numpy as np
import re #regexp
from ipywidgets import IntProgress #for progress bar
from IPython.display import display

Next, I replace 'Студия' (studio) with 0.5 and 'Многокомнатная' (multi-room) with 6. The values 'Апартаменты' and 'Квартира' don't contain information about the number of rooms, so I replace them with the most frequent value. I also convert the column to float.

In [None]:
# Normalise the 'rooms' column to a numeric room count.
# A studio counts as half a room; multi-room listings are recorded as 6.
rooms = flats['rooms'].replace({'Студия': 0.5, 'Многокомнатная': 6, 'Многокомнатные': 6})

# These labels say nothing about the number of rooms.
unknown_labels = ['Апартаменты', 'Квартира']

# Pick the most frequent room count among rows that actually carry one.
# The placeholders are excluded first so that a placeholder can never be
# chosen as the replacement value (which would make the float conversion fail),
# and the values are cast to float so that e.g. '1' and 1 are counted together.
known_rooms = rooms[~rooms.isin(unknown_labels)].astype(float)
most_frequent_rooms = known_rooms.value_counts().idxmax()

flats['rooms'] = rooms.replace(unknown_labels, most_frequent_rooms).astype(float)
flats[['rooms']].head()

Next, I will translate all categorical values into English.

In [None]:
# Russian category label -> English label, grouped by the column it belongs to.
translations = {
    'type': {'Вторичка': 'secondary', 'Новостройка': 'new'},
    'view': {'На улицу и двор': 'both', 'Во двор': 'view_courtyard', 'На улицу': 'view_street'},
    'finishing': {'Чистовая': 'fine', 'Нет': 'none', 'Черновая': 'rough'},
    'layout': {'Изолированная': 'isolated', 'Смежно-изолированная': 'mixed', 'Смежная': 'adjoining'},
    'renovation': {
        'Евроремонт': 'euro',
        'Без ремонта': 'none',
        'Косметический': 'redecorating',
        'Дизайнерский': 'designer',
    },
}

# Apply each mapping to its column; values not listed are left untouched.
for column, mapping in translations.items():
    flats[column] = flats[column].replace(mapping)

The next step is finding and removing missing data. Firstly, let's identify NaNs and zeros.

In [None]:
# Number of missing (NaN) entries in every column.
flats.isna().sum()

Now I will compute the proportions of living_area and kitchen_area relative to total_area and fill in the missing areas using the average proportion.

In [None]:
# Average share of the total area taken by the living space and by the kitchen.
# mean() skips rows where either area is missing.
total_area = flats['total_area']
mean_living_share = (flats['living_area'] / total_area).mean()
mean_kitchen_share = (flats['kitchen_area'] / total_area).mean()

# Keep the recorded areas; where they are missing, estimate them from the
# total area and the average share.
estimated_living = total_area * mean_living_share
estimated_kitchen = total_area * mean_kitchen_share
flats['living_area'] = flats['living_area'].combine_first(estimated_living)
flats['kitchen_area'] = flats['kitchen_area'].combine_first(estimated_kitchen)

I replace missing data in columns 'view', 'finishing', 'layout', 'renovation','year_of_construction' with the most popular value.

In [None]:
# Columns whose missing entries are filled with the column's most frequent value.
mode_filled_columns = ['view', 'finishing', 'layout', 'renovation', 'year_of_construction']

for column in mode_filled_columns:
    # value_counts() ignores NaN, so idxmax() is the most frequent real value.
    most_frequent = flats[column].value_counts().idxmax()
    flats[column] = flats[column].replace(np.nan, most_frequent)

I replace missing height values with the mean height.

In [None]:
# Fill missing heights with the mean of the recorded heights.
mean_height = flats['height'].mean()
flats['height'] = flats['height'].replace(np.nan, mean_height)

I replace missing deadlines with the year of construction.

In [None]:
# Where 'deadline' is missing, fall back to the value of 'year_of_construction'.
fallback_year = flats['year_of_construction']
flats['deadline'] = flats['deadline'].combine_first(fallback_year)


<h2>Visualization</h2>

In [None]:
import googlemaps

# Read the Google Maps API key from a local file instead of hard-coding it.
# The context manager closes the file even if reading raises, and strip()
# drops surrounding whitespace such as a trailing newline in the key file.
with open('keys/api_google_maps_key') as f:
    mykey = f.read().strip()

gmaps = googlemaps.Client(key=mykey)

In [None]:
# Geocode every flat: look up "<city>, <street>" and store the coordinates
# of the first match in the 'lat' / 'lng' columns.
flats['lat'] = 0.0
flats['lng'] = 0.0

progress = IntProgress(min=0, max=len(flats), value=0)  # progress bar
display(progress)

# Iterate over the actual index labels: .loc is label-based, so positions
# from range(len(flats)) are only valid for a default 0..n-1 index.
for i in flats.index:
    address = flats.loc[i, 'city'] + ', ' + flats.loc[i, 'street']
    geocode_result = gmaps.geocode(address)
    location = geocode_result[0]['geometry']['location']
    flats.loc[i, 'lat'] = location['lat']
    flats.loc[i, 'lng'] = location['lng']
    # Count the rows completed so the bar reaches its maximum on the last row.
    progress.value += 1

In [None]:
# Save the data frame as a semicolon-separated CSV, without the index column.
flats.to_csv(path_or_buf='flats_cian_1.csv', sep=';', index=False)

In [None]:
# Load the semicolon-separated CSV back into the 'flats' data frame.
flats = pd.read_csv(filepath_or_buffer='flats_cian_1.csv', sep=';')

Next, I create a map with one marker per flat, coloured by price.

In [None]:
import folium

# Base map that every flat marker is added to.
map_piter = folium.Map(location=[59.9810199, 30.3540484], zoom_start=9)

# (exclusive upper price bound, marker colour), cheapest band first.
price_bands = (
    (5000000, '#00ffff'),
    (10000000, '#91e2da'),
    (15000000, '#d9a694'),
    (20000000, '#eb8473'),
    (25000000, '#f75b53'),
)
# Colour for prices that fall in none of the bands above.
top_band_colour = '#ff0035'

# Add one circle marker per flat; the popup shows the rounded price.
for lat, lng, price, name in zip(flats['lat'], flats['lng'], flats['price'], flats['link']):
    label = folium.Popup('{}'.format(round(price)), parse_html=True)
    # First band whose upper bound is above the price decides the colour.
    clr = next((colour for limit, colour in price_bands if price < limit), top_band_colour)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=clr,
        fill=True,
        fill_color=clr,
        fill_opacity=0.8,
        parse_html=False,
    )
    marker.add_to(map_piter)

map_piter

<h2>Model building</h2>

Here I'm going to build the predictive model. Price is the dependent variable; the other features are the independent variables.

Firstly, I need to prepare the dataframe. I create a copy of 'flats' and delete the unnecessary fields.

In [None]:
# Columns left out of the modelling frame: identifiers, location fields, and
# the categorical columns (those are re-added later as dummy variables).
excluded_columns = [
    'link', 'city', 'neighborhood', 'street', 'lat', 'lng',
    'type', 'view', 'finishing', 'layout', 'renovation',
    'area', 'metro_name',
]

# drop() without inplace returns a new frame, so 'flats' itself is not modified.
flats_model = flats.drop(columns=excluded_columns)

Next I need to turn all categorical values into numerical. I use 'get_dummies' function from pandas.

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the generated dummy columns.
categorical_columns = ['type', 'view', 'finishing', 'layout', 'renovation']
cat_df = pd.get_dummies(data=flats[categorical_columns], drop_first=True)

# Not encoded for now (kept for reference):
#area_df=pd.get_dummies(flats_model['area'])
#metro_name_df=pd.get_dummies(flats_model['metro_name'])

# Append the dummy columns to the modelling frame.
flats_model = pd.concat([flats_model, cat_df], axis=1)

# Check the result
flats_model.head()

The next step is to examine the correlations among all features.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Figure and axes the heatmap is drawn on.
fig, ax = plt.subplots(figsize=(15, 10))

# Pairwise correlation of the columns in the modelling frame, drawn as a
# heatmap with the colour scale centred on zero.
corr = flats_model.corr()
heatmap = sns.heatmap(data=corr, center=0, ax=ax)

In [None]:
# Features whose correlation with price is stronger than 0.2 in absolute value.
price_corr = corr['price']
price_corr[price_corr.abs() > 0.2]

In [None]:
# Features whose correlation with price lies in [-0.2, 0.2] are removed
# from the modelling frame.
weakly_correlated = corr['price'].between(-0.2, 0.2)
drop_columns = corr.index[weakly_correlated]
flats_model.drop(columns=drop_columns, inplace=True)

In [None]:
# Correlation among all columns except the first one of the modelling frame.
corr = flats_model.iloc[:, 1:].corr()

# Show only coefficients above 0.7; every other cell is displayed as NaN.
corr.where(corr > 0.7)

In [None]:
# Remove the 'living_area' feature from the modelling frame.
flats_model.drop(columns=['living_area'], inplace=True)

In [None]:
# Preview the first five rows of the modelling frame.
flats_model.head(n=5)

In [None]:
# Pearson correlation matrix of the remaining features.
flats_model.corr(method='pearson')

In [None]:
# TODO: Build models for r=0.1 and for all parameters
# TODO: Build models for the price per square meter
# TODO: Split the file into web scraping and analysis