In [1]:
import boto3
import bson
from bson.json_util import dumps, loads
import json
import os
from PIL import Image
import PIL
from io import BytesIO
import pandas as pd
import geopandas
import numpy as np
import feather
import h5py

# Render up to 999 columns and 999 rows when a DataFrame is displayed.
pd.options.display.max_columns = pd.options.display.max_rows = 999

In [2]:
# Make the project-local `helpers` package importable by adding the project root to sys.path.
# NOTE(review): this is an absolute, machine-specific path - the notebook cannot run elsewhere as-is.
import sys
sys.path.append("/Users/xszpo/Google Drive/DataScience/Projects/201907_xFlat_AWS_Scrapy")
import helpers

# Execute the scraper's settings module inside this notebook's namespace.
# Presumably this is what defines LOCAL_DATA_PATH and LOCAL_DATA_PATH_PREP used below - TODO confirm.
%run /Users/xszpo/Google\ Drive/DataScience/Projects/201907_xFlat_AWS_Scrapy/scraper/settings.py

In [3]:
# Show which directory the raw files are read from.
# LOCAL_DATA_PATH is not assigned in this notebook; presumably it comes from settings.py - TODO confirm.
LOCAL_DATA_PATH

'/Users/xszpo/Google Drive/DataScience/DATA/01_otodom_scrapy'

# Load raw data

In [4]:
# Collect the names of all .bson files in the local data directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order of everything derived from this list reproducible.
file_list = sorted(
    entry for entry in os.listdir(LOCAL_DATA_PATH) if entry.endswith(".bson")
)

In [5]:
# Number of .bson files found in LOCAL_DATA_PATH.
len(file_list)

15821

# Load Data Frame

In [6]:
# load data frame

def load_data(file_name_list, file_path):
    """Load scraped offers from local BSON files into one DataFrame.

    Each file is read with ``helpers.scraper.read_bson_local`` and passed
    through ``helpers.scraper.dict_except`` with the keys
    ``'img_gallery_strimg'`` and ``'gallery'`` (presumably to drop the image
    payload - confirm against the helper). The nested geo dictionaries are
    flattened into ``GC_*`` columns and the textual numeric fields are parsed.

    NOTE: a second function that is also named ``load_data`` is defined later
    in this notebook and shadows this one once its cell has been run.

    Parameters
    ----------
    file_name_list : list of str
        Names of the BSON files to load, one offer per file.
    file_path : str
        Directory that contains the files.

    Returns
    -------
    pandas.DataFrame
        One row per file, restricted to the fixed column list below. The
        ``file_name`` key is set on every record but is not part of that
        column list. An empty ``file_name_list`` yields an empty frame that
        still has all columns.

    Raises
    ------
    KeyError
        If a record lacks one of the keys that are accessed directly
        (e.g. ``geo_coordinates``, ``price_per_square``, ``flat_size``) or if
        no record provides one of the output columns.
    """
    # Output schema. The misspelt names ('GC_llongitude', 'widows_type') are
    # kept as they are because downstream consumers may rely on them.
    columns = ['offer_id','tracking_id','name','location','flat_size', 'rooms','floor','price',
               'price_m2', 'market', 'number_of_floors', 'floor_attic','floor_basement','building_type',
               'building_material', 'widows_type', 'heating_type', 'year_of_building','finishing_stage',
               'rent_price', 'property_form', 'available_from','description','additional_info',
               'GC_latitude','GC_llongitude', 'GC_boundingbox', 'GC_addr_house_number','GC_addr_road',
               'GC_addr_neighbourhood', 'GC_addr_suburb','GC_addr_city', 'GC_addr_county', 'GC_addr_state',
               'GC_addr_postcode','GC_addr_country', 'GC_addr_country_code','url','main_url']

    # Optional address parts; each becomes 'GC_addr_<part>' (None when absent).
    address_parts = ('house_number', 'road', 'neighbourhood', 'suburb', 'city',
                     'county', 'state', 'postcode', 'country', 'country_code')

    # Fields run through digits_from_str unless they are None.
    digit_fields = ('flat_size', 'price', 'price_m2', 'rent_price')

    # Fields converted to float32 only when they arrive as strings.
    float_text_fields = ('number_of_floors', 'year_of_building')

    records = []

    for file in file_name_list:

        # LOAD DATA
        record = helpers.scraper.read_bson_local(file_path, file)
        record = helpers.scraper.dict_except(record, ['img_gallery_strimg', 'gallery'])

        # PUT DATA INTO COLUMNS
        record['file_name'] = file
        record['GC_latitude'] = record['geo_coordinates']['latitude']
        record['GC_llongitude'] = record['geo_coordinates']['longitude']
        record['GC_boundingbox'] = record['geo_address_coordin']['@boundingbox']

        # Every address part is looked up under its own key. The previous code
        # guarded 'county' with a check for 'country', which raised KeyError or
        # silently dropped the county whenever only one of the two was present.
        address = record['geo_address_text']
        for part in address_parts:
            record['GC_addr_' + part] = address.get(part)

        for obsolete_key in ('geo_coordinates', 'geo_address_coordin', 'price_per_square'):
            record.pop(obsolete_key)

        # MODIFY DATA
        for field in digit_fields:
            if record[field] is not None:
                record[field] = helpers.scraper.digits_from_str(record[field])

        if record['rooms'] is not None:
            record['rooms'] = int(helpers.scraper.digits_from_str(record['rooms']))

        # The attic/basement flags must be derived from the raw floor label,
        # i.e. before 'floor' itself is replaced by its numeric value.
        record['floor_attic'] = 1 if record['floor'] == 'poddasze' else 0
        record['floor_basement'] = 1 if record['floor'] == 'suterena' else 0
        if isinstance(record['floor'], str):
            record['floor'] = np.float32(helpers.scraper.convert_floor(record['floor']))
        else:
            record['floor'] = None

        for field in float_text_fields:
            record[field] = np.float32(record[field]) if isinstance(record[field], str) else None

        # SAVE TO LIST
        records.append(record)

    if not records:
        # Nothing to load: return an empty frame with the expected schema.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(records)[columns]


In [7]:
# Build the tabular data set from every .bson file found in LOCAL_DATA_PATH.
data_df = load_data(file_list,LOCAL_DATA_PATH)


In [8]:
# Write the DataFrame to 'train_data.feather' inside LOCAL_DATA_PATH_PREP.
# LOCAL_DATA_PATH_PREP is not assigned in this notebook; presumably it comes from settings.py - TODO confirm.
feather.write_dataframe(data_df, os.path.join(LOCAL_DATA_PATH_PREP,'train_data.feather'))

# Load Images

In [9]:
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K
import numpy as np

Using TensorFlow backend.


In [10]:
%%time

def load_data(file_name_list, file_path, img_size = (224, 224), n_photos = 3):
    """Load the price and the first gallery photos of every scraped offer.

    NOTE: this definition reuses the name of the tabular ``load_data`` defined
    earlier in this notebook and replaces it once this cell has been run.

    Each file is read with ``helpers.scraper.read_bson_local`` and passed
    through ``helpers.scraper.dict_except`` with
    ``include_keys=['price', 'img_gallery_strimg']`` (presumably keeping only
    those keys - confirm against the helper).

    Parameters
    ----------
    file_name_list : list of str
        Names of the BSON files to load, one offer per file.
    file_path : str
        Directory that contains the files.
    img_size : tuple, optional
        Size passed to ``.resize()`` of every opened image.
    n_photos : int, optional
        How many leading gallery entries to decode. They are stored under
        ``'photo_1'`` ... ``'photo_<n_photos>'``; the default of 3 matches the
        keys read by the rest of this notebook.

    Returns
    -------
    list of dict
        One dict per file with ``'file_name'``, ``'price'`` and the photo keys.
        A photo slot is ``None`` when the gallery has no entry at that
        position, or when the gallery itself is missing or ``None``.
    """
    records = []

    for file in file_name_list:

        # LOAD DATA
        record = helpers.scraper.read_bson_local(file_path, file)
        record = helpers.scraper.dict_except(record, include_keys=['price', 'img_gallery_strimg'])

        # PUT DATA INTO COLUMNS
        record['file_name'] = file
        if record['price'] is not None:
            record['price'] = helpers.scraper.digits_from_str(record['price'])

        # Take the raw gallery out of the record; a missing or None gallery is
        # treated as empty instead of crashing on len(None).
        gallery = record.pop('img_gallery_strimg', None) or []

        # The availability test is a plain bounds check. The previous
        # `len(...) > k is not None` was an accidental chained comparison.
        for position in range(n_photos):
            if position < len(gallery):
                photo = helpers.scraper.open_img_from_str(gallery[position]).resize(img_size)
            else:
                photo = None
            record['photo_%d' % (position + 1)] = photo

        # SAVE TO LIST
        records.append(record)

    return records

# Load price and up to three resized photos for every .bson file (uses the image variant of load_data defined above).
photo = load_data(file_list,LOCAL_DATA_PATH)


CPU times: user 2min 13s, sys: 10.7 s, total: 2min 24s
Wall time: 2min 37s


In [11]:
# Pull the file name and the parsed price out of every loaded record;
# both lists follow the order of `photo`.
name = [record['file_name'] for record in photo]
price = [record['price'] for record in photo]
    

In [12]:
# Turn both lists into column arrays of shape (n, 1).
name, price = (np.array(values).reshape(-1, 1) for values in (name, price))

In [13]:
%%time

# Convert every resized photo to an array; a record that lacks a given photo
# gets an all-zero placeholder of the same shape.
# The placeholder is created with the Keras float type so that it matches what
# img_to_array() yields by default. A plain np.zeros() is float64, so a single
# missing photo would upcast the whole batch (and double its memory) when the
# arrays are concatenated later.
blank_photo = np.zeros((224, 224, 3), dtype=K.floatx())

array_photo_1 = []
array_photo_2 = []
array_photo_3 = []

for i in photo:
    array_photo_1.append(image.img_to_array(i['photo_1']) if i['photo_1'] is not None else blank_photo)
    array_photo_2.append(image.img_to_array(i['photo_2']) if i['photo_2'] is not None else blank_photo)
    array_photo_3.append(image.img_to_array(i['photo_3']) if i['photo_3'] is not None else blank_photo)


CPU times: user 28.1 s, sys: 40.6 s, total: 1min 8s
Wall time: 2min 7s


In [14]:
import gc

# The image records are no longer needed once the arrays exist: drop the
# reference and trigger a garbage-collection pass.
del photo
gc.collect()


80

In [15]:
# Join the per-record arrays along the first axis, then reshape the result
# into one batch of shape (n, 224, 224, 3).
array_photo_1 = np.concatenate(array_photo_1, axis=0)
array_photo_1 = array_photo_1.reshape(-1, 224, 224, 3)

In [16]:
# Join the per-record arrays along the first axis, then reshape the result
# into one batch of shape (n, 224, 224, 3).
array_photo_2 = np.concatenate(array_photo_2, axis=0)
array_photo_2 = array_photo_2.reshape(-1, 224, 224, 3)

In [17]:
# Join the per-record arrays along the first axis, then reshape the result
# into one batch of shape (n, 224, 224, 3).
array_photo_3 = np.concatenate(array_photo_3, axis=0)
array_photo_3 = array_photo_3.reshape(-1, 224, 224, 3)

In [18]:
# Run another garbage-collection pass after the three large arrays were rebuilt.
import gc
gc.collect()

80

Background reading on feeding HDF5 inputs to Keras and on h5py vs. npz storage:

- https://keras.io/getting-started/faq/#how-can-i-use-hdf5-inputs-with-keras
- http://tdeboissiere.github.io/h5py-vs-npz.html

%%time

# Persist the three photo batches together with the matching file names and
# prices in a single gzip-compressed HDF5 file.
with h5py.File(os.path.join(LOCAL_DATA_PATH_PREP,'array_photo.h5'), "w") as hf:
    for dataset_name, photo_batch in (("array_photo_1", array_photo_1),
                                      ("array_photo_2", array_photo_2),
                                      ("array_photo_3", array_photo_3)):
        hf.create_dataset(dataset_name, data=photo_batch, compression="gzip")

    # h5py cannot store NumPy unicode ('U') arrays, which is what np.array()
    # builds from a list of str. Encode the names to fixed-width UTF-8 bytes;
    # readers have to decode them again.
    name_bytes = np.char.encode(name, "utf-8") if name.dtype.kind == "U" else name
    hf.create_dataset("name", data=name_bytes, compression="gzip", compression_opts=9)

    # NOTE(review): if any price is None the array presumably has object dtype,
    # which h5py cannot store either - confirm what digits_from_str returns.
    hf.create_dataset("price", data=price, compression="gzip", compression_opts=9)

In [None]:
# Alternative export: store the three photo batches in one compressed NumPy
# archive under the keys a1, a2 and a3.
np.savez_compressed(
    os.path.join(LOCAL_DATA_PATH_PREP,'array_photo'),
    a1=array_photo_1,
    a2=array_photo_2,
    a3=array_photo_3,
)