In [1]:
import requests
import pandas as pd
import time
import sqlite3

# Seminar - APIs, DBs and Live coding

## Task 1: Requesting API
### 1a. Create a function requesting data from sreality


```python
base_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb=1&category_type_cb=1&locality_region_id=10&per_page60&page={}'.format(i)

r = requests.get(base_url)
d = r.json()
```

* function should parametrize: 
    * `category_main_cb` - `{'flat':1, 'house':2, 'land':3 }`
    * `category_type_cb` - `{'sell':1,'rent':2}`
    * `locality_region_id` - use 10 as default value
    * `page` parameter
* use string inputs for `category_main_cb` and `category_type_cb`
* include try/except clause to handle errors
* function should return JSON data in python types
* do not forget to sleep each request at least 0.5s

In [5]:
def request_sreality(page, category_main_str, category_type_str, locality_region_id=10):
    category_mains = {'flat':1, 'house':2, 'land':3 }
    category_types = {'sell':1,'rent':2}
    template_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb={category_main}&category_type_cb={category_type}&locality_region_id={locality_region_id}&per_page60&page={page}'
    
    request_url = template_url.format(
        category_main=category_mains[category_main_str],
        category_type=category_types[category_type_str],
        locality_region_id=locality_region_id,
        page=page
    )
    r = requests.get(request_url)
    return r.json()
d = request_sreality(0, 'flat', 'sell', 10)

### 1b. Create a function converting sreality json data into pandas dataframe

In [19]:
def convert_sreality_data_to_df(sreality_data):
    return pd.DataFrame(sreality_data['_embedded']['estates'])

raw = convert_sreality_data_to_df(d)

### 1c. link function `1b` into function `1a`

In [24]:
def request_sreality(page, category_main_str, category_type_str, locality_region_id=10):
    category_mains = {'flat':1, 'house':2, 'land':3 }
    category_types = {'sell':1,'rent':2}
    template_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb={category_main}&category_type_cb={category_type}&locality_region_id={locality_region_id}&per_page60&page={page}'
    
    request_url = template_url.format(
        category_main=category_mains[category_main_str],
        category_type=category_types[category_type_str],
        locality_region_id=locality_region_id,
        page=page
    )
    
    try: 
        r = requests.get(request_url)
        
        return convert_sreality_data_to_df(r.json())
    except Exception as e:
        print(f'error requesting url {request_url}. Reason: {e.message}')
    
df = request_sreality(0, 'flat', 'sell', 10)
df.head()

Unnamed: 0,labelsReleased,has_panorama,labels,is_auction,labelsAll,seo,exclusively_at_rk,category,has_floor_plan,_embedded,...,hash_id,attractive_offer,price,price_czk,_links,rus,name,region_tip,gps,has_matterport_url
0,"[[new_building, terrace, garage], [metro]]",0,"[Novostavba, Terasa, Garáž, Metro 8 min. pěšky]",False,"[[new_building, personal, terrace, brick, gara...","{'category_main_cb': 1, 'category_sub_cb': 8, ...",0,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,4061265500,0,14708000,"{'value_raw': 14708000, 'unit': '', 'name': 'C...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 4+kk 147 m²,2118035,"{'lat': 50.094820779133876, 'lon': 14.57653422...",False
1,"[[not_furnished], [medic]]",0,"[Nevybavený, Lékař 6 min. pěšky]",False,"[[personal, brick, not_furnished], [small_shop...","{'category_main_cb': 1, 'category_sub_cb': 5, ...",1,1,1,"{'favourite': {'is_favourite': False, '_links'...",...,1408231772,0,8608000,"{'value_raw': 8608000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+1 80 m²,0,"{'lat': 50.05636877913388, 'lon': 14.461053220...",False
2,"[[after_reconstruction], [shop]]",0,"[Po rekonstrukci, Obchod 6 min. pěšky]",False,"[[personal, after_reconstruction, brick], [pla...","{'category_main_cb': 1, 'category_sub_cb': 4, ...",0,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,1474811468,0,4750000,"{'value_raw': 4750000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 2+kk 38 m²,0,"{'lat': 50.04431377913388, 'lon': 14.526074220...",False
3,"[[], []]",0,[],False,"[[personal, brick, elevator], [theater, candy_...","{'category_main_cb': 1, 'category_sub_cb': 12,...",1,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,1979790924,0,0,"{'value_raw': 0, 'unit': '', 'name': 'Celková ...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 6 pokojů a více 235 m²,0,"{'lat': 50.064558779133876, 'lon': 14.44895722...",False
4,"[[new_building, furnished], []]",0,"[Novostavba, Vybavený]",False,"[[new_building, personal, brick, cellar, eleva...","{'category_main_cb': 1, 'category_sub_cb': 2, ...",0,1,0,"{'favourite': {'is_favourite': False, '_links'...",...,1608513356,0,6374000,"{'value_raw': 6374000, 'unit': '', 'name': 'Ce...",{'dynamicDown': [{'href': 'https://d18-a.sdn.c...,False,Prodej bytu 1+kk 43 m²,0,"{'lat': 50.072252779133876, 'lon': 14.47782722...",False


### 1c. Combining multiple requests into single df

* Function should parametrize:
    * `start_page` and `end_page`
    * request parameters
* construct a list of individual request dfs
* then feed it into `pd.concat` function

In [26]:
raw.shape

(21, 27)

In [30]:
def request_multiply_sreality(start_page, end_page, category_main_str, category_type_str, locality_region_id=10):
    pages = range(start_page, end_page + 1)
    list_of_dfs = [request_sreality(page, category_main_str, category_type_str, locality_region_id) for page in pages]
    return pd.concat(list_of_dfs)
df = request_multiply_sreality(1, 5, 'flat', 'sell',10)
df.shape

(105, 27)

## Task 2: Cleaning data

### 2a. Filter columns
* filter only columns: `['locality', 'price', 'name', 'gps','hash_id','exclusively_at_rk']`
* use `.copy()` to avoid `SettingWithCopyWarning` later


In [49]:
clean = df[['locality', 'price', 'name', 'gps','hash_id','exclusively_at_rk']].copy()
clean

Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk
0,Praha 3 - Žižkov,12507000,Prodej bytu 2+kk 80 m²,"{'lat': 50.07360277913388, 'lon': 14.456816220...",2129008460,0
1,Praha 6 - Dejvice,8390000,Prodej bytu 1+kk 43 m²,"{'lat': 50.08987277913388, 'lon': 14.400153220...",1920355164,0
2,Praha 2 - Vinohrady,6673000,Prodej bytu 1+kk 76 m²,"{'lat': 50.058195779133875, 'lon': 14.45000922...",1022909276,0
3,Praha 4 - Chodov,4903000,Prodej bytu 2+kk 46 m²,"{'lat': 50.01318577913388, 'lon': 14.527232220...",51414604,0
4,Praha 10 - Strašnice,8165000,Prodej bytu 3+kk 56 m²,"{'lat': 50.06486377913388, 'lon': 14.502033220...",3834866012,0
...,...,...,...,...,...,...
16,Praha 4 - Nusle,4226000,Prodej bytu 1+kk 41 m²,"{'lat': 50.048776779133874, 'lon': 14.44859422...",1534785356,1
17,Praha 9 - Letňany,4799000,Prodej bytu 1+kk 35 m²,"{'lat': 50.131330779133876, 'lon': 14.51879522...",1549387340,0
18,Praha 3 - Žižkov,5711000,Prodej bytu 2+1 56 m²,"{'lat': 50.07895177913388, 'lon': 14.506957220...",2233935436,1
19,Praha 10 - Strašnice,9597000,Prodej bytu 3+1 81 m²,"{'lat': 50.066801779133876, 'lon': 14.52814522...",2689935180,1


In [63]:

clean['flat_type'] = clean.name.apply(lambda nm:nm.split()[2])


Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk,flat_type
0,Praha 3 - Žižkov,12507000,Prodej bytu 2+kk 80 m²,"{'lat': 50.07360277913388, 'lon': 14.456816220...",2129008460,0,2+kk
1,Praha 6 - Dejvice,8390000,Prodej bytu 1+kk 43 m²,"{'lat': 50.08987277913388, 'lon': 14.400153220...",1920355164,0,1+kk
2,Praha 2 - Vinohrady,6673000,Prodej bytu 1+kk 76 m²,"{'lat': 50.058195779133875, 'lon': 14.45000922...",1022909276,0,1+kk
3,Praha 4 - Chodov,4903000,Prodej bytu 2+kk 46 m²,"{'lat': 50.01318577913388, 'lon': 14.527232220...",51414604,0,2+kk
4,Praha 10 - Strašnice,8165000,Prodej bytu 3+kk 56 m²,"{'lat': 50.06486377913388, 'lon': 14.502033220...",3834866012,0,3+kk
...,...,...,...,...,...,...,...
16,Praha 4 - Nusle,4226000,Prodej bytu 1+kk 41 m²,"{'lat': 50.048776779133874, 'lon': 14.44859422...",1534785356,1,1+kk
17,Praha 9 - Letňany,4799000,Prodej bytu 1+kk 35 m²,"{'lat': 50.131330779133876, 'lon': 14.51879522...",1549387340,0,1+kk
18,Praha 3 - Žižkov,5711000,Prodej bytu 2+1 56 m²,"{'lat': 50.07895177913388, 'lon': 14.506957220...",2233935436,1,2+1
19,Praha 10 - Strašnice,9597000,Prodej bytu 3+1 81 m²,"{'lat': 50.066801779133876, 'lon': 14.52814522...",2689935180,1,3+1


In [67]:
clean.name

0     Prodej bytu 2+kk 80 m²
1     Prodej bytu 1+kk 43 m²
2     Prodej bytu 1+kk 76 m²
3     Prodej bytu 2+kk 46 m²
4     Prodej bytu 3+kk 56 m²
               ...          
16    Prodej bytu 1+kk 41 m²
17    Prodej bytu 1+kk 35 m²
18     Prodej bytu 2+1 56 m²
19     Prodej bytu 3+1 81 m²
20    Prodej bytu 2+kk 44 m²
Name: name, Length: 105, dtype: object

In [69]:
clean.name.apply(lambda nm:nm.split()[3]).

76     6
56     4
57     4
47     4
59     3
      ..
221    1
65     1
96     1
40     1
51     1
Name: name, Length: 66, dtype: int64

### 2b: GPS
* Convert dictionary in `gps` column into two columns - `lat` and `lon`
* use apply function on gps column
* Note apply can return multiple columns

### 2b. Get flat type from name
* Name is always represented by string `Prodej bytu [type of flat] [Area] m^2`
* try picking third word in string
* check meaningfulness using `.value_counts()`

### 2c. Get area from name
* Naive: select the word before last word
* Then try navigating using the index of `'m²'`
* if this also fail, then you will need to use regex

## Task 3: Persist to sqlite

## Task 4 (Homework): Convert `labelsAll` into categorical variables

### Task 4a. Get all possible label names
* deal with nested-list structure
* Hint: try to sum the whole column
* Needed to Iterate through all labels in all rows and 

### 4b. Test existence of label `cellar` for offers
* again deal with nested list of list structure
* write generic function `test_existence_of_label(offer_labels,label)`

### 4c. Test existence of all possible labels
* use apply returning series with all labels