![Rampart Logo](../images/logo.png)

Interactive visualization and data analysis notebook. There's no model training here; instead, this tool is meant to be run against the parsed database, especially its **flats** table.

In [None]:
%matplotlib inline

In [None]:
from pandas import read_sql
from tabulate import tabulate
from sqlalchemy import create_engine
from os import environ
from scipy.stats import zscore
from numpy import abs, exp
from ppscore import matrix
from plotly.graph_objs import Histogram, Scatter, Figure, Bar, Heatmap
from plotly.subplots import make_subplots

In [None]:
# Database engine built from the DSN stored in the RAMPART_DSN environment
# variable; the lookup raises KeyError when the variable is not set.
engine = create_engine(environ['RAMPART_DSN'])

In [None]:
def aggregate():
    """Return the ten cities with the most rows in the ``flats`` table.

    Returns:
        A DataFrame indexed by ``city`` with a ``count`` column, ordered by
        ``count`` descending and limited to ten rows.
    """
    query = 'select city, count(*) as count from flats group by city order by count desc limit 10'
    with engine.connect() as connection:
        cities = read_sql(query, con=connection, index_col=['city'])
    return cities

In [None]:
# Bar chart of the per-city row counts returned by aggregate().
cities = aggregate()
figure = Figure(data=[Bar(x=cities.index, y=cities['count'])])
figure.update_layout(height=400, margin={'t': 30, 'r': 30, 'b': 30, 'l': 30})
figure.show()

In [None]:
def read():
    """Load the flats of the city 'Київ' from the database.

    The ``housing`` column is encoded numerically by the query: 0 when the
    stored value is 'primary', 1 for anything else.

    Returns:
        A DataFrame indexed by ``id`` with the columns selected below.
    """
    query = '''
            select id,
                   price,
                   total_area,
                   living_area,
                   kitchen_area,
                   room_number,
                   floor,
                   total_floor,
                   case
                       when housing = 'primary' then 0
                       else 1
                       end     as housing,
                   ssf,
                   izf,
                   gzf
            from flats
            where city = 'Київ'
            '''
    with engine.connect() as connection:
        frame = read_sql(query, con=connection, index_col=['id'])
    return frame

In [None]:
# Load the frame once; the cells below all read from this `flats` variable.
flats = read()

In [None]:
# Subset of columns shown in the preview and summary tables below.
interests = ['price', 'total_area', 'room_number', 'floor', 'total_floor', 'ssf', 'izf', 'gzf']

In [None]:
def render(frame):
    """Print ``frame`` as a 'psql'-format text table.

    Column names are used as headers and numbers are right-aligned.

    Args:
        frame: Tabular data accepted by ``tabulate`` (a DataFrame in this notebook).
    """
    table = tabulate(frame, tablefmt='psql', headers='keys', numalign='right')
    print(table)

In [None]:
# Preview the first ten rows of the columns of interest.
render(flats[interests].head(10))

In [None]:
# Print the frame summary (column dtypes and non-null counts).
flats.info()

In [None]:
# Descriptive statistics for the columns of interest.
render(flats[interests].describe())

In [None]:
# Count the flats with price <= 100000 whose ssf and gzf are both >= 2,
# and print that count next to the total number of flats.
affordable = (flats['price'] <= 100000) & (flats['ssf'] >= 2) & (flats['gzf'] >= 2)
print('Quite affordable flat amount:', len(flats[affordable]), '/', len(flats), '.')

In [None]:
# Keep only the rows in which no column lies 2 or more standard deviations
# away from that column's mean.
#
# nan_policy='omit' makes zscore ignore missing values when computing each
# column's mean and deviation. With the default ('propagate') a single NaN
# turns the whole column's scores into NaN, every comparison becomes False
# and the filter silently returns an empty frame.
#
# The test is written as "no score >= 2" rather than "all scores < 2" because
# a NaN score compares False either way: a missing value on its own must not
# mark the row as an outlier. For data without NaN both forms are identical.
majority = flats[~(abs(zscore(flats, nan_policy='omit')) >= 2).any(axis=1)]

In [None]:
# One histogram per column listed in `floats`, stacked vertically (one subplot
# row each), drawn from the outlier-filtered `majority` frame.
floats = ['price', 'total_area', 'living_area', 'kitchen_area', 'ssf', 'izf', 'gzf']
figure = make_subplots(rows=len(floats))
for row, column in enumerate(floats, start=1):
    histogram = Histogram(x=majority[column], name=column, nbinsx=100)
    figure.add_trace(histogram, row=row, col=1)
# 300 units of height per subplot row.
figure.update_layout(height=300 * len(floats), margin={'t': 30, 'r': 30, 'b': 30, 'l': 30})
figure.show()

In [None]:
def countplot(column):
    """Show a stacked bar chart of the value counts of ``column`` in ``flats``.

    One bar series is drawn per housing code: rows with ``housing == 0`` are
    labelled 'primary' and rows with ``housing == 1`` are labelled 'secondary'.

    Args:
        column: Name of the ``flats`` column whose values are counted.
    """
    bars = []
    for code, label in enumerate(('primary', 'secondary')):
        values = flats.loc[flats['housing'] == code, column]
        # sort=False: do not order the counts by frequency.
        tally = values.value_counts(sort=False)
        bars.append(Bar(x=tally.index, y=tally.values, name=label))
    figure = Figure(data=bars)
    figure.update_layout(
        height=400,
        margin={'t': 30, 'r': 30, 'b': 30, 'l': 30},
        barmode='stack',
        title=column
    )
    figure.show()

In [None]:
# Value counts of room_number, stacked by housing type.
countplot('room_number')

In [None]:
# Value counts of floor, stacked by housing type.
countplot('floor')

In [None]:
# Value counts of total_floor, stacked by housing type.
countplot('total_floor')

In [None]:
# Two heatmaps side by side: the pairwise correlation of `flats` columns
# (left, colour scale -1..1) and their ppscore matrix (right, colour scale 0..1).
correlation = flats.corr()
# ppscore's matrix() output is reshaped so that x becomes the columns and y the rows.
pairs = matrix(flats)[['x', 'y', 'ppscore']]
ppscore = pairs.pivot(index='y', columns='x', values='ppscore')

left = Heatmap(
    z=correlation,
    x=correlation.columns,
    y=correlation.index,
    zmin=-1,
    zmax=1,
    colorbar={'x': 0.41},  # colour bar placed between the two heatmaps
    name=''
)
right = Heatmap(
    z=ppscore,
    x=ppscore.columns,
    y=ppscore.index,
    zmin=0,
    zmax=1,
    colorbar={'x': 1},  # colour bar placed at the right edge
    name=''
)

figure = make_subplots(cols=2, horizontal_spacing=0.18)
figure.add_trace(left, row=1, col=1)
figure.add_trace(right, row=1, col=2)
figure.update_layout(
    title='correlation vs ppscore',
    height=420,
    margin={'t': 30, 'r': 5, 'b': 30, 'l': 5}
)
figure.show()

In [None]:
def scatterplot(column1, column2='price'):
    """Show a scatter plot of two ``flats`` columns, one marker series per housing code.

    Rows with ``housing == 0`` are labelled 'primary' and rows with
    ``housing == 1`` are labelled 'secondary'.

    Args:
        column1: Name of the column plotted on the x axis.
        column2: Name of the column plotted on the y axis; defaults to 'price'.
    """
    series = []
    for code, label in enumerate(('primary', 'secondary')):
        subset = flats[flats['housing'] == code]
        points = Scatter(x=subset[column1], y=subset[column2], mode='markers', name=label)
        series.append(points)
    figure = Figure(data=series)
    figure.update_layout(
        height=400,
        margin={'t': 30, 'r': 30, 'b': 30, 'l': 30},
        title=f'{column1} vs {column2}'
    )
    figure.show()

In [None]:
# total_area on x against price (the default y column).
scatterplot('total_area')

In [None]:
# living_area on x against price (the default y column).
scatterplot('living_area')

In [None]:
# kitchen_area on x against price (the default y column).
scatterplot('kitchen_area')

In [None]:
# ssf on x against price (the default y column).
scatterplot('ssf')

In [None]:
# izf on x against price (the default y column).
scatterplot('izf')

In [None]:
# gzf on x against price (the default y column).
scatterplot('gzf')

In [None]:
# gzf on x against izf on y, instead of the default price axis.
scatterplot('gzf', 'izf')