In [1]:
import requests
import pandas as pd
import time

# Seminar - APIs, DBs and Live coding

## Task 1: Requesting API
### 1a. Create a function requesting data from sreality


```python
base_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb=1&category_type_cb=1&locality_region_id=10&per_page60&page={}'.format(i)

r = requests.get(base_url)
d = r.json()
```

* function should parametrize: 
    * `category_main_cb` - `{'flat':1, 'house':2, 'land':3 }`
    * `category_type_cb` - `{'sell':1,'rent':2}`
    * `locality_region_id` - use 10 as default value
    * `page` parameter
* use string inputs for `category_main_cb` and `category_type_cb`
* include try/except clause to handle errors
* function should return JSON data in python types
* do not forget to sleep each request at least 0.5s

In [8]:
def request_sreality(page, category_main_str, category_type_str, locality_region_id=10):
    category_mains = {'flat':1, 'house':2, 'land':3 }
    category_types = {'sell':1,'rent':2}
    template_url = 'https://www.sreality.cz/api/cs/v2/estates?category_main_cb={category_main}&category_type_cb={category_type}&locality_region_id={locality_region_id}&per_page60&page={page}'
    request_url = template_url.format(
        category_main=category_mains[category_main_str],
        category_type=category_types[category_type_str],
        locality_region_id=locality_region_id,
        page=page
    )
    r = requests.get(request_url)
    return r.json()
d = request_sreality(0, 'flat', 'sell', 10)

### 1b. Create a function converting sreality json data into pandas dataframe

In [16]:
def convert_sreality_data_to_df(sreality_data):
    return

raw = convert_sreality_data_to_df(d)

In [None]:
raw.head()

### 1c. link function `1b` into function `1a`

In [None]:
df = request_sreality(0, 'flat', 'sell', 10)
df.head()

### 1c. Combining multiple requests into single df

* Function should parametrize:
    * `start_page` and `end_page`
    * request parameters
* construct a list of individual request dfs
* then feed it into `pd.concat` function

In [26]:
raw.shape

(21, 27)

In [44]:
request_sreality

<function __main__.request_sreality(page, category_main_str, category_type_str, locality_region_id=10)>

In [45]:
def request_multiply_sreality(start_page, end_page, category_main_str, category_type_str, locality_region_id=10):
    
    return pd.concat(list_of_dfs)

df = request_multiply_sreality(1, 5, 'flat', 'sell',10)
df.shape

(103, 27)

## Task 2: Cleaning data

### 2a. Filter columns
* filter only columns: `['locality', 'price', 'name', 'gps','hash_id','exclusively_at_rk']`
* use `.copy()` to avoid `SettingWithCopyWarning` later


### 2b: GPS
* Convert dictionary in `gps` column into two columns - `lat` and `lon`
* use apply function on gps column
* Note apply can return multiple columns

### 2b. Get flat type from name
* Name is always represented by string `Prodej bytu [type of flat] [Area] m^2`
* try picking third word in string
* check meaningfulness using `.value_counts()`

### 2c. Get area from name
* Naive: select the word before last word
* Then try navigating using the index of `'m²'`
* if this also fail, then you will need to use regex

In [115]:
clean[clean['area_1']==clean['area_2']]

Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk,lat1,lon1,lat,lon,flat_type,area,area_1,area_2


In [114]:
clean[clean['area_1'].astype(int)==clean['area_2']]

Unnamed: 0,locality,price,name,gps,hash_id,exclusively_at_rk,lat1,lon1,lat,lon,flat_type,area,area_1,area_2
0,Praha 9 - Kbely,12862000,Prodej bytu 4+kk 128 m²,"{'lat': 50.12603618747833, 'lon': 14.561554812...",58234188,0,50.126036,14.561555,50.126036,14.561555,"[Prodej, bytu, 4+kk, 128, m²]",128,128,128
1,Praha 2 - Vinohrady,21566000,Prodej bytu 2+kk 126 m²,"{'lat': 50.06495918747833, 'lon': 14.454340812...",4107789900,0,50.064959,14.454341,50.064959,14.454341,"[Prodej, bytu, 2+kk, 126, m²]",126,126,126
2,Praha 5 - Sobín,17382000,Prodej bytu 3+kk 97 m²,"{'lat': 50.052054187478326, 'lon': 14.28598081...",1972872524,1,50.052054,14.285981,50.052054,14.285981,"[Prodej, bytu, 3+kk, 97, m²]",97,97,97
3,Praha 5 - Stodůlky,18286000,Prodej bytu 4+kk 122 m²,"{'lat': 50.02775118747833, 'lon': 14.324684812...",866350924,0,50.027751,14.324685,50.027751,14.324685,"[Prodej, bytu, 4+kk, 122, m²]",122,122,122
4,Praha 5 - Stodůlky,14140000,Prodej bytu 3+kk 88 m²,"{'lat': 50.02775118747833, 'lon': 14.324684812...",3735254860,0,50.027751,14.324685,50.027751,14.324685,"[Prodej, bytu, 3+kk, 88, m²]",88,88,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Praha 8 - Karlín,10236000,Prodej bytu 1+kk 60 m²,"{'lat': 50.08081318747833, 'lon': 14.459052812...",1918002252,0,50.080813,14.459053,50.080813,14.459053,"[Prodej, bytu, 1+kk, 60, m²]",60,60,60
99,Praha 4 - Michle,29614000,Prodej bytu 3+kk 272 m²,"{'lat': 50.03685218747833, 'lon': 14.467224812...",2810619212,1,50.036852,14.467225,50.036852,14.467225,"[Prodej, bytu, 3+kk, 272, m²]",272,272,272
100,Praha 4 - Modřany,14018000,Prodej bytu 3+kk 100 m²,"{'lat': 49.989115187478326, 'lon': 14.41775681...",1567020876,0,49.989115,14.417757,49.989115,14.417757,"[Prodej, bytu, 3+kk, 100, m²]",100,100,100
101,Praha 9 - Kbely,11121000,Prodej bytu 3+kk 88 m²,"{'lat': 50.11815518747833, 'lon': 14.550433812...",1684042828,0,50.118155,14.550434,50.118155,14.550434,"[Prodej, bytu, 3+kk, 88, m²]",88,88,88


## Bonus: Convert `labelsAll` into categorical variables

### Task 4a. Get all possible label names
* deal with nested-list structure
* Hint: try to sum the whole column
* Needed to Iterate through all labels in all rows and 

In [141]:
['d'] + ['c']

['d', 'c']

In [152]:
possible_labels = list(set([i for sublist in raw.labelsAll.sum() for i in sublist]))
possible_labels

['natural_attraction',
 'kindergarten',
 'tram',
 'movies',
 'cellar',
 'brick',
 'candy_shop',
 'train',
 'metro',
 'bus_public_transport',
 'playground',
 'personal',
 'tavern',
 'loggia',
 'elevator',
 'school',
 'small_shop',
 'parking_lots',
 'partly_furnished',
 'new_building',
 'vet',
 'theater',
 'balcony',
 'not_furnished',
 'shop',
 'medic',
 'post_office',
 'sightseeing',
 'restaurant',
 'in_construction',
 'atm',
 'sports',
 'garage',
 'drugstore']

### 4b. Test existence of label `cellar` for offers
* again deal with nested list of list structure
* write generic function `test_existence_of_label(offer_labels,label)`

In [163]:
def test_existence_of_label(offer_labels,label):
    return 'cellar' in [item for sublist in offer_labels for item in sublist]

raw.labelsAll.apply(lambda offer_labels: test_existence_of_label(offer_labels, 'cellar'))

0     False
1     False
2      True
3      True
4     False
5      True
6      True
7     False
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
Name: labelsAll, dtype: bool

### 4c. Test existence of all possible labels
* use apply returning series with all labels

In [164]:
def existence_of_all_labels(offer_labels, possible_labels):
    return pd.Series({
        label:test_existence_of_label(offer_labels,label)
        for label in possible_labels
    })

raw.labelsAll.apply(lambda offer_labels: existence_of_all_labels(offer_labels, possible_labels))

Unnamed: 0,natural_attraction,kindergarten,tram,movies,cellar,brick,candy_shop,train,metro,bus_public_transport,...,shop,medic,post_office,sightseeing,restaurant,in_construction,atm,sports,garage,drugstore
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
