In [None]:
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
from numpy import char as npc
import requests
from bs4 import BeautifulSoup
import scipy.stats
import matplotlib.pyplot as plt
import re
import os
import math
from datetime import datetime
%matplotlib inline

import seaborn as sns
sns.set()

import json

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%load_ext autoreload
%autoreload 2

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

This Jupyter notebook illustrates the steps taken to scrape the required real estate offers from [tutti.ch](https://www.tutti.ch/fr/li/vaud/lausanne/immobilier?o=1).

For our work we only need the real estate advertisements in the Lausanne area.

The final DataFrame is saved as a CSV file named 'tutti.csv'.

In [None]:
# Directory holding the raw JSON pages. It is created up front because neither
# open(..., 'w') nor os.listdir() creates a missing directory.
DIR = 'tutti/'
os.makedirs(DIR, exist_ok=True)

### 1. Go through all pages and save it locally

The first step is to go through all the available pages and save each JSON response locally.

In [None]:
def getTutti(page, limit=100, timeout=30):
    """Fetch one page of listings from the tutti.ch list API.

    Parameters
    ----------
    page : int
        Page number, sent as the ``o`` query parameter.
    limit : int, optional
        Number of ads requested per page (``limit`` query parameter).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If no answer arrives within ``timeout`` seconds.
    """
    # NOTE(review): cookies and headers are hard-coded, presumably copied from
    # a browser session; they may expire and need refreshing.
    cookies = {
        'ajs_user_id': 'null',
        'ajs_group_id': 'null',
        'lang': 'fr',
        'ajs_anonymous_id': '%22bf0a3c34-c83f-4764-a48f-c783309a9f4c%22',
        '_gcl_au': '1.1.253718592.1542968730',
        '_ga': 'GA1.2.1357180133.1542968730',
        '_gid': 'GA1.2.2109516043.1542968730',
        'adw': '12a92c4c-f201-44a3-a5ce-26dbbce57a94',
        '_gat_UA-88671020-1': '1',
    }
    headers = {
        'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0.3',
        'Origin': 'https://www.tutti.ch',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'fr',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Accept': 'application/json, text/plain, */*',
        'Referer': 'https://www.tutti.ch/fr/li/vaud/lausanne/immobilier/appartements?o=2',
        'Connection': 'keep-alive',
        'X-Tutti-Hash': '81fd68a0-47a0-4b8b-9401-4c63cb416c5e',
        'X-Tutti-Source': 'web LIVE-181123-163',
        'DNT': '1',
    }
    params = (
        ('category', '1000'),
        ('limit', limit),
        ('m', '131'),
        ('o', page),
        ('region', '20'), # Region 20: lausanne
        ('subcategory', ''),
        ('with_all_regions', 'false'),
    )

    response = requests.get(
        'https://api.tutti.ch/v10/list.json',
        headers=headers,
        params=params,
        cookies=cookies,
        timeout=timeout,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    return json.loads(response.content)

In [None]:
def saveJson(data, filename):
    """Serialise ``data`` as JSON into ``filename``, overwriting any existing file."""
    with open(filename, 'w') as handle:
        json.dump(data, fp=handle)

In [None]:
def loadJson(filename):
    """Read ``filename`` and return the decoded JSON content."""
    with open(filename, 'r') as handle:
        content = json.load(handle)
    return content

Get all tutti.ch advertisements from region 20 (Lausanne).

In [None]:
# Fetch the first page once: it reports the total number of ads, from which
# the number of pages follows, and it is also the first page to store.
limit = 100
first_page = getTutti(page=1, limit=limit)
total_ads = first_page['search_total']
pages = math.ceil(total_ads / limit)

# Iterate through all pages in region=20 (Lausanne) and save each response as
# DIR/page_<n>.json, reusing the page-1 response instead of downloading it twice.
for page in range(1, pages + 1):
    j = first_page if page == 1 else getTutti(page=page, limit=limit)
    filename = DIR + 'page_' + str(page) + '.json'
    saveJson(j, filename)

At this point there are *pages* different JSON files that need to be read, merged and cleaned.

### 2. Open, merge and clean

Iterate over all the *JSON* files in the directory and merge them into a single DataFrame. 

The `json_normalize` function turned out to be very useful to flatten all JSON cells. 

In [None]:
# Collect the saved page files. They are sorted because os.listdir() returns
# entries in arbitrary order, and the row order of the merged frame should be
# the same on every run.
all_json_filename = sorted(
    name for name in os.listdir(DIR) if name.endswith('.json')
)

all_json = []

for j_ in all_json_filename:
    print(j_)
    j_tmp = loadJson(DIR + j_)

    # Read JSON (and flatten nested values into one column per leaf key)
    pd_ = json_normalize(j_tmp['items'])

    # Keep the per-page frame for the final concatenation
    all_json.append(pd_)

# One frame for all pages, re-indexed 0..n-1
df = pd.concat(all_json, ignore_index=True, sort=False)

In [None]:
# Preview the first two rows of the merged frame
df.head(2)

In [None]:
# (rows, columns) of the merged frame
df.shape

The DataFrame `df` still contains a column 'parameters' that is not an easily parsable JSON object. We extract the fields we need from it in a quick-and-dirty way:

In [None]:
def posIdInParameter(id_, p):
    """Locate the entry of ``p`` whose 'id' equals ``id_``.

    Returns the position of the matching entry (the last one if the id
    occurs more than once), or -1 when no entry matches.
    """
    found = -1
    # No early exit: every entry is inspected so a later duplicate wins.
    for index, entry in zip(np.arange(0, len(p)), p):
        if id_ == entry['id']:
            found = index
    return found

In [None]:
def cleanParameter(p):
    """Extract number of rooms, size and type from a 'parameters' cell.

    Parameters
    ----------
    p : list of dict, None or NaN
        Parameter entries, each carrying an 'id' key; the entries with id
        'rooms', 'size' and 'type' are read through their 'value' key.
        A missing cell (None or NaN) is accepted.

    Returns
    -------
    pandas.Series
        Three values in the order rooms, size, type. A field that is not
        present in ``p`` is NaN.
    """
    # A missing cell is not iterable; report every field as missing instead
    # of raising.
    if p is None or (isinstance(p, float) and np.isnan(p)):
        return pd.Series([np.nan] * 3)

    # Index the entries by id in a single pass; a later duplicate id
    # overrides an earlier one.
    by_id = {entry['id']: entry for entry in p}

    rooms, size, type_ = (
        by_id[key]['value'] if key in by_id else np.nan
        for key in ('rooms', 'size', 'type')
    )

    return pd.Series([rooms, size, type_])

In [None]:
# Expand every 'parameters' cell into three columns: rooms, size and type
param_columns = ['rooms', 'size', 'type_param']
parameters = df['parameters'].apply(cleanParameter)
parameters.columns = param_columns

In [None]:
# Columns that are not useful for the analysis
not_useful_columns = [
    'company_ad',
    'image_names',
    'language',
    'parameters',
    'phone_hash',
    'thumb_name',
    'category_info.id',
    'category_info.parent_id',
    'category_info.parent_name',
    'highlight',
    'location_info.area',  # same value ('lausanne') for every row
    'location_info.area_id',
    'location_info.region_name',
    'location_info.region_id',
    'public_account_id',
]

# Attach the extracted parameter columns by index, then discard the columns
# listed above (including the raw 'parameters' column).
df = (
    df.merge(parameters, left_index=True, right_index=True)
      .drop(columns=not_useful_columns)
)

df.head(2)

In [None]:
# Persist the cleaned frame; with to_csv's default index=True the row index
# is written as the first, unnamed column.
df.to_csv('tutti.csv')

In [None]:
# Reload the saved frame. The first CSV column is the row index written by
# to_csv, so restore it as the index rather than as an 'Unnamed: 0' column.
df2 = pd.read_csv('tutti.csv', index_col=0)
df2.shape
df2.head(2)

### 3. Data analysis ....

In [None]:
# Number of rows without an address
df2['location_info.address'].isna().sum()