![Rampart Logo](../images/logo.png)

Twinkle is a model that ranks flats, based on the learning to rank (LTR) technique. [Lambdarank](https://www.microsoft.com/en-us/research/uploads/prod/2016/02/MSR-TR-2010-82.pdf) is used as its core algorithm. The final ranker is expected to order housing publications by relevance for the end Telegram user.

## I/O
All datasets are located in `../scientific/binaries`. The final model must be stored in `../scientific/models/twinkle.latest.txt`.

## Metadata
Dataset files have self-explanatory names:
```
<tag>.<group>.bin
```
For instance:
```
70e36b1e47754064ad14b57a5c79d5bc.validation.bin
```
Placeholders in angle brackets:
- `tag` - a UUID4-generated hash.
- `group` - the dataset's purpose, one of `training` or `validation`.

## Features
- `actual_price` - the flat's actual price (in USD).
- `utmost_price` - the query's price limit (in USD); search results shouldn't exceed this limit by much.
- `total_area` - overall apartment's area (in square meters).
- `living_area` - flat's living room area (in square meters).
- `kitchen_area` - flat's kitchen area (in square meters).
- `actual_room_number` - the flat's actual number of living rooms.
- `desired_room_number` - target room count.
- `actual_floor` - apartment's floor (the ground floor is the floor #1).
- `total_floor` - house's floor count.
- `desired_floor` - target flat's floor.
- `housing` - either a newbuild or a used apartment.
- `ssf` - Subway Station Factor, a score reflecting subway stations nearby.
- `izf` - Industrial Zone Factor, a score reflecting factories & plants nearby.
- `gzf` - Green Zone Factor, a score reflecting parks nearby.
- `abandoned_count` - number of unavailable/not found photos.
- `luxury_count` - number of elite housing photos.
- `comfort_count` - number of ordinary flat pictures.
- `junk_count` - number of obsolete apartment interior photos.
- `construction_count` - number of raw building images.
- `excess_count` - number of trash photos.
- `panorama_count` - number of panorama (360 deg) images.

## Categorical data
All feature categories are sorted according to their ranks from the lowest to the highest:
- `desired_room_number`
    * `any` - no matter how many rooms;
    * `one` - 1 room;
    * `two` - 2 rooms;
    * `three` - 3 rooms;
    * `many` - huge (4+) luxurious apartments with many rooms;
- `desired_floor`
    * `any` - no matter what floor;
    * `low` - low floors are preferred;
    * `high` - top floors are preferred;
- `housing`
    * `primary` - newbuilds & houses under construction;
    * `secondary` - old & already used apartments;
- `relevance`
    * `terrible` - don't show this thing again!
    * `bad` - poor quality;
    * `so-so` - average result;
    * `good` - quite smart search;
    * `excellent` - the best matches.

In [None]:
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from lightgbm import train, Dataset, Booster
from uuid import uuid4
from sqlalchemy import create_engine
from os import environ
from pandas import read_sql
from IPython.core.display import display, HTML
from jinja2 import Template

In [None]:
def fit():
    """Train the LambdaRank booster on the latest datasets and save it.

    Reads the training and validation sets from
    ``../scientific/binaries/latest.{training,validation}.bin``, trains for at
    most 15 boosting rounds with NDCG as the metric, and stops early when the
    validation metric has not improved for 10 consecutive rounds.

    The resulting model is written twice: to ``twinkle.latest.txt`` and to a
    copy whose tag is a freshly generated UUID4 hex string.
    """
    # Imported locally so this cell works without touching the import cell.
    from lightgbm import early_stopping

    training_dataset = Dataset('../scientific/binaries/latest.training.bin')
    # create_valid ties the validation set to the training set it is built from.
    validation_dataset = training_dataset.create_valid('../scientific/binaries/latest.validation.bin')
    booster = train(
        {'objective': 'lambdarank', 'metric': 'ndcg', 'force_row_wise': True},
        training_dataset,
        num_boost_round=15,
        valid_sets=[validation_dataset],
        # The `early_stopping_rounds` keyword was deprecated in LightGBM 3.3
        # and removed in 4.0; the callback is the supported way to request
        # the same behaviour and is available in both.
        callbacks=[early_stopping(10)]
    )
    booster.save_model('../scientific/models/twinkle.latest.txt')
    booster.save_model(f'../scientific/models/twinkle.{uuid4().hex}.txt')

In [None]:
%%time
# Train the ranker and save it to disk; the %%time cell magic reports how long the run took.
fit()

In [None]:
# Database engine used by the queries in this notebook. The DSN is taken from the
# RAMPART_DATABASE_DSN environment variable; a KeyError is raised if it is unset.
engine = create_engine(environ['RAMPART_DATABASE_DSN'])

In [None]:
def read(city, price, room_number, floor):
    """Load the candidate flats for a search query together with their features.

    Every value supplied by the caller is sent to the database as a bound
    parameter; none of them is formatted into the SQL text.

    Parameters:
        city: value the ``city`` column must equal.
        price: the query's price limit. When it is greater than 0, only flats
            priced at most ``1.65 * price`` are returned; otherwise no price
            filter is applied.
        room_number: ``0`` applies no room filter, ``4`` selects flats with
            four or more rooms, any other value must match exactly.
        floor: desired floor; it is only echoed back as ``desired_floor`` and
            does not filter the rows.

    Returns:
        A data frame with one row per flat (rows are grouped by ``flats.id``).
        ``price``, ``room_number`` and ``floor`` are echoed back as the
        ``utmost_price``, ``desired_room_number`` and ``desired_floor``
        columns, ``housing`` is encoded as 0 for ``'primary'`` and 1 otherwise,
        and the ``*_count`` columns count the flat's images per label. Flats
        without any image row are dropped by the inner join, and so are flats
        that have at least one photo labelled ``'unknown'``.
    """
    # Placeholders are bound in textual order: the three echoed query values in
    # the select list, the city filter, then whichever optional filters follow.
    params = [price, room_number, floor, city]
    price_clause = ''
    if price > 0:
        price_clause = 'and price <= %s'
        params.append(1.65 * price)
    room_number_clause = ''
    if room_number == 4:
        room_number_clause = 'and room_number >= %s'
        params.append(room_number)
    elif room_number != 0:
        room_number_clause = 'and room_number = %s'
        params.append(room_number)
    with engine.connect() as connection:
        return read_sql(
            f'''
            select flats.id,
                   flats.url,
                   price as actual_price,
                   %s as utmost_price,
                   total_area,
                   living_area,
                   kitchen_area,
                   room_number as actual_room_number,
                   %s as desired_room_number,
                   floor as actual_floor,
                   total_floor,
                   %s as desired_floor,
                   case when housing = 'primary' then 0 else 1 end as housing,
                   st_x(point) as longitude,
                   st_y(point) as latitude,
                   street,
                   house_number,
                   ssf,
                   izf,
                   gzf,
                   sum(case when kind = 'photo' and label = 'abandoned' then 1 else 0 end) as abandoned_count,
                   sum(case when kind = 'photo' and label = 'luxury' then 1 else 0 end) as luxury_count,
                   sum(case when kind = 'photo' and label = 'comfort' then 1 else 0 end) as comfort_count,
                   sum(case when kind = 'photo' and label = 'junk' then 1 else 0 end) as junk_count,
                   sum(
                       case
                           when kind = 'photo'
                                and label = 'construction' then 1
                           else 0
                           end) as construction_count,
                   sum(case when kind = 'photo' and label = 'excess' then 1 else 0 end) as excess_count,
                   sum(case when kind = 'panorama' then 1 else 0 end) as panorama_count
            from flats
                 join images on flats.id = flat_id
            where city = %s
            {price_clause}
            {room_number_clause}
            group by flats.id
            having sum(case when kind = 'photo' and label = 'unknown' then 1 else 0 end) = 0
            ''',
            connection,
            params=params
        )

In [None]:
class Flat:
    """A flat unpacked from one result row into plain attributes.

    The constructor accepts any object that supports ``series[key]`` lookups
    and copies the listed keys onto the instance. Three attributes are named
    differently from their row key: ``price``, ``room_number`` and ``floor``
    come from ``actual_price``, ``actual_room_number`` and ``actual_floor``.
    A missing key raises ``KeyError``.
    """

    # (attribute name, row key) pairs, read in this order by the constructor.
    _COLUMNS = (
        ('id', 'id'),
        ('url', 'url'),
        ('price', 'actual_price'),
        ('total_area', 'total_area'),
        ('living_area', 'living_area'),
        ('kitchen_area', 'kitchen_area'),
        ('room_number', 'actual_room_number'),
        ('floor', 'actual_floor'),
        ('total_floor', 'total_floor'),
        ('housing', 'housing'),
        ('longitude', 'longitude'),
        ('latitude', 'latitude'),
        ('city', 'city'),
        ('street', 'street'),
        ('house_number', 'house_number'),
        ('ssf', 'ssf'),
        ('izf', 'izf'),
        ('gzf', 'gzf'),
        ('abandoned_count', 'abandoned_count'),
        ('luxury_count', 'luxury_count'),
        ('comfort_count', 'comfort_count'),
        ('junk_count', 'junk_count'),
        ('construction_count', 'construction_count'),
        ('excess_count', 'excess_count'),
        ('panorama_count', 'panorama_count'),
        ('score', 'score'),
    )

    def __init__(self, series):
        for attribute, key in self._COLUMNS:
            setattr(self, attribute, series[key])

    @property
    def address(self):
        """City, street and house number joined with ``', '``, skipping parts equal to ``''``."""
        components = (self.city, self.street, self.house_number)
        return ', '.join(component for component in components if component != '')

In [None]:
def rank(tag='latest', city='Київ', price=0, room_number=0, floor=0, limit=5, offset=0):
    """Score the flats matching a query and display one page of the best ones.

    Parameters:
        tag: selects the model file ``../scientific/models/twinkle.<tag>.txt``.
        city, price, room_number, floor: the query, forwarded to ``read``.
        limit: number of flats per page.
        offset: zero-based page index; the page covers the sorted rows
            ``limit * offset`` up to ``limit * (offset + 1)``.

    Returns nothing. The page is rendered through ``../templates/flats.html``
    and displayed; when the query matches no flats, nothing is displayed.
    """
    frame = read(city, price, room_number, floor)
    if frame.shape[0] == 0:
        # No candidates, so there is nothing to score or render.
        return

    booster = Booster(model_file=f'../scientific/models/twinkle.{tag}.txt')
    # These columns are dropped before prediction; everything else is passed to the booster.
    features = frame.drop(columns=['id', 'url', 'longitude', 'latitude', 'street', 'house_number'])
    frame['score'] = booster.predict(features, num_iteration=booster.best_iteration)

    first = limit * offset
    flats = frame.sort_values('score', ascending=False).iloc[first:first + limit]
    flats['city'] = city

    def label(code):
        # Turn the numeric housing code back into its category name.
        if code == 0:
            return 'primary'
        return 'secondary'

    flats['housing'] = flats['housing'].map(label)
    with open('../templates/flats.html') as stream:
        template = Template(stream.read())
        cards = [Flat(row) for _, row in flats.iterrows()]
        display(HTML(template.render(flats=cards)))

In [None]:
%%time
# Rank with the default arguments of rank and display the first page; %%time reports the elapsed time.
rank()