![Rampart Logo](../images/logo.png)

Twinkle is a flat relevance binary classifier. It leverages [LightGBM](https://lightgbm.readthedocs.io/en/latest/)
to separate acceptable flat publications from inappropriate ones via boosting methods. The model takes into account multiple
geographical, user-specific & image-based features to decide whether an ordinary flat can be sent to a particular
customer. Picture-related data is calculated beforehand by another module called Auge.

## Features
- `actual_price` - flat's actual price (in USD).
- `utmost_price` - query's price limit (in USD); search results shouldn't exceed this limit by much.
- `total_area` - overall apartment's area (in square meters).
- `living_area` - flat's living room area (in square meters).
- `kitchen_area` - flat's kitchen area (in square meters).
- `actual_room_number` - flat's actual number of living rooms.
- `desired_room_number` - target room count.
- `actual_floor` - apartment's floor (the ground floor is the floor #1).
- `total_floor` - house's floor count.
- `desired_floor` - target flat's floor.
- `housing` - either a new build or a used apartment.
- `ssf` - Subway Station Factor, a score reflecting subway stations nearby.
- `izf` - Industrial Zone Factor, a score reflecting factories & plants nearby.
- `gzf` - Green Zone Factor, a score reflecting parks nearby.
- `abandoned_count` - number of unavailable/not found photos.
- `luxury_count` - number of elite housing photos.
- `comfort_count` - number of ordinary flat pictures.
- `junk_count` - number of obsolete apartment interior photos.
- `construction_count` - number of raw building images.
- `excess_count` - number of trash photos.
- `panorama_count` - number of panorama (360 deg) images.

## Categorical data
All feature categories are sorted according to their ranks from the lowest to the highest:
- `desired_room_number`
    * `any` - no matter how many rooms.
    * `one` - 1 room.
    * `two` - 2 rooms.
    * `three` - 3 rooms.
    * `many` - huge (4+) luxurious apartments with many rooms.
- `desired_floor`
    * `any` - no matter what floor.
    * `low` - low floors are preferred.
    * `high` - top floors are preferred.
- `housing`
    * `primary` - newbuilds & houses under construction.
    * `secondary` - old & already used apartments.

In [None]:
from uuid import uuid4

from pandas import read_csv
from plotly.graph_objs import Pie, Figure, Scatter
from plotly.subplots import make_subplots
from plotly.figure_factory import create_annotated_heatmap
from lightgbm import train, Dataset, Booster, early_stopping
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score

In [None]:
# Category names of the encoded columns. `extract` replaces every name with its
# position in the matching list, so the order of the entries defines the codes.
room_numbers = ['any', 'one', 'two', 'three', 'many']
floors = ['any', 'low', 'high']
housings = ['primary', 'secondary']
# Target classes: 'bad' is encoded as 0 and 'good' as 1.
labels = ['bad', 'good']
# Values the table's `group` column must consist of (checked by `extract`).
groups = ['training', 'validation', 'testing']

In [None]:
def extract(tag='latest'):
    """Load the flat table for `tag` and integer-encode its categorical columns.

    The table is read from `../scientific/tables/{tag}.csv`. Every name in the
    `desired_room_number`, `desired_floor`, `housing` and `label` columns is
    replaced with its index in the matching module-level list.

    Args:
        tag: name of the CSV file (without extension) to load.

    Returns:
        The loaded frame with the four columns above encoded as integers.

    Raises:
        RuntimeError: the set of values in the `group` column differs from
            the module-level `groups`.
        KeyError: a categorical column holds a name missing from its list.
    """
    frame = read_csv(f'../scientific/tables/{tag}.csv')
    samples = set(frame['group'])
    if samples != set(groups):
        raise RuntimeError(f'Got invalid groups, {samples}')
    # Column name paired with the ordered list of its category names.
    encodings = (
        ('desired_room_number', room_numbers),
        ('desired_floor', floors),
        ('housing', housings),
        ('label', labels),
    )
    for column, categories in encodings:
        codes = {name: position for position, name in enumerate(categories)}
        # The lambda is consumed right away, so capturing `codes` here is safe.
        frame[column] = frame[column].map(lambda value: codes[value])
    return frame

In [None]:
# Load the default ('latest') table; `flats` is the frame the cells below read.
flats = extract()
# Print a summary of the loaded frame as a quick sanity check.
flats.info()

In [None]:
# Pie chart showing how many rows belong to each data group.
figure = Figure()
counts = flats['group'].value_counts()
figure.add_trace(Pie(labels=counts.index, values=counts.values, name=''))
# Place the legend at x = 0.7 instead of its default position.
figure.update_layout(legend={'x': 0.7})
figure.show()

In [None]:
def pies(column, enum):
    """Show one pie chart per data group with the value distribution of `column`.

    Args:
        column: name of an integer-encoded column of the global `flats` frame.
        enum: list of category names; an encoded value is used as an index
            into it to produce the slice label.
    """
    figure = make_subplots(
        cols=len(groups),
        specs=[[{'type': 'domain'}] * len(groups)],
        subplot_titles=groups
    )
    # Subplot columns are 1-based, hence the enumeration starting at 1.
    for position, group in enumerate(groups, start=1):
        values = flats.loc[flats['group'] == group, column]
        # Sort by encoded value so the slices follow the order of `enum`.
        counts = values.value_counts().sort_index()
        names = [enum[code] for code in counts.index]
        figure.add_trace(Pie(labels=names, values=counts.values, name=''), row=1, col=position)
    figure.show()

In [None]:
# Distribution of the requested room count in every data group.
pies('desired_room_number', room_numbers)

In [None]:
# Distribution of the requested floor preference in every data group.
pies('desired_floor', floors)

In [None]:
# Distribution of the housing kind in every data group.
pies('housing', housings)

In [None]:
# Distribution of the target classes in every data group.
pies('label', labels)

In [None]:
def serialize(group, reference=None):
    """Build a LightGBM `Dataset` from the rows of `flats` that belong to `group`.

    Args:
        group: value of the `group` column selecting the rows to use.
        reference: optional `Dataset` passed through to LightGBM as the
            reference dataset (used here for the validation set).

    Returns:
        A `Dataset` whose features are all columns except `label` and
        `group`, and whose target is the `label` column.
    """
    subset = flats[flats['group'] == group]
    features = subset.drop(columns=['label', 'group'])
    return Dataset(features, label=subset['label'], reference=reference)

In [None]:
def fit():
    """Train the binary classifier and save it under a unique and a `latest` name.

    The booster is trained on the `training` group for at most 30 rounds and
    evaluated on the `validation` group after each round; training stops
    early once the validation metrics have not improved for 10 rounds.

    The model is written twice to `../scientific/models/`: once as
    `twinkle.<random hex>.txt` to keep a history of runs, and once as
    `twinkle.latest.txt`, which is overwritten on every run.
    """
    training_dataset = serialize('training')
    validation_dataset = serialize('validation', training_dataset)
    booster = train(
        {'objective': 'binary', 'metric': ['binary_logloss', 'auc'], 'force_row_wise': True},
        training_dataset,
        num_boost_round=30,
        valid_sets=[validation_dataset],
        # Newer LightGBM releases no longer accept `early_stopping_rounds` as
        # a keyword of `train`; the callback works with old and new versions.
        callbacks=[early_stopping(stopping_rounds=10)]
    )
    booster.save_model(f'../scientific/models/twinkle.{uuid4().hex}.txt')
    booster.save_model('../scientific/models/twinkle.latest.txt')

In [None]:
%%time
# Train and save the model; the cell magic above reports how long it took.
fit()

In [None]:
def infer(tag='latest'):
    """Evaluate a saved model on the `testing` group and plot its confusion matrix.

    Prints accuracy, ROC AUC and F1, then shows an annotated heatmap of the
    confusion matrix with actual classes as rows and predicted as columns.

    Args:
        tag: middle part of the model file name,
            `../scientific/models/twinkle.{tag}.txt`.
    """
    frame = flats[flats['group'] == 'testing']
    booster = Booster(model_file=f'../scientific/models/twinkle.{tag}.txt')
    scores = booster.predict(frame.drop(columns=['label', 'group']))
    # Hard class decisions; a score of exactly 0.5 stays in class 0, which is
    # what rounding half to even produced before.
    predicted = (scores > 0.5).astype(int)
    print(
        f'Accuracy: {accuracy_score(frame["label"], predicted):.4f}  ' +
        # AUC measures ranking quality, so it needs the raw scores rather
        # than the thresholded 0/1 predictions.
        f'AUC: {roc_auc_score(frame["label"], scores):.4f}  ' +
        f'F1: {f1_score(frame["label"], predicted):.4f}'
    )
    figure = create_annotated_heatmap(
        # Pin both classes so the matrix always matches the axis labels, even
        # when one class is missing from the data or from the predictions.
        z=confusion_matrix(frame['label'], predicted, labels=list(range(len(labels)))),
        x=labels,
        y=labels,
        hoverinfo='skip'
    )
    figure.update_xaxes(title_text='Predicted')
    figure.update_yaxes(title_text='Actual', autorange='reversed')
    figure.show()

In [None]:
# Evaluate the most recently saved model on the testing group.
infer()