In [7]:
import os
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from IPython.core.display import display
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager


# PUBLIC TRANSPORTATION DATA SET DOWNLOAD
Running this notebook scrapes the https://www.citylines.co/data website and downloads the latest version of each transport JSON data set. A part that is still a work in progress will merge all the downloaded data sets into a single, more useful .csv file.

In [None]:
##########################################
### MODIFY THIS, to store the DataSet: ###
##########################################

# Folder that receives the downloaded files. The TRANSPORTS_DATA_DIR
# environment variable, when set, overrides the default below so the notebook
# can run on another machine without editing this cell.
localDataUrl = os.environ.get(
    'TRANSPORTS_DATA_DIR',
    '/Volumes/Disk2/Courses MA3/MA3 - ADA/AIRBNB data/Transports',
)

##########################################
##########################################

url = 'https://www.citylines.co/data'

# NOTE: scrapeCityLines and download_data are defined in the next cell of this
# notebook; that cell has to be executed before this one.

# get all the files names and the files urls
files_df = scrapeCityLines(url)
# download the data set
download_data(files_df)


In [3]:
def scrapeCityLines(url):
    """Collect the download links offered for every city on the citylines data page.

    A Chrome browser is driven through Selenium; each entry of the city
    dropdown (class ``c-field``) is clicked in turn and the resulting page is
    parsed for ``<a class="c-link">`` anchors whose ``href`` contains ``/api/``.

    Parameters
    ----------
    url : str
        Address of the page holding the city dropdown.

    Returns
    -------
    pandas.DataFrame
        One row per downloadable file, with the columns ``file_name`` (the
        anchor's ``download`` attribute) and ``link`` (absolute URL).
    """
    # Selenium is able to simulate the browser, and so we can make it wait until
    # the page finished loading before we are getting the data.
    # NOTE(review): newer Selenium 4 releases expect a Service object instead of
    # a driver path here -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(ChromeDriverManager().install())

    links_download = []
    files_names = []

    try:
        driver.get(url)
        # The generic find_element(by, value) call is available in Selenium 3
        # and 4 alike, unlike the find_element_by_* helpers.
        select = Select(driver.find_element('class name', 'c-field'))

        # first element of the dropdown menu is Select City; this is not a city!!!
        for city in select.options[1:]:
            city.click()
            # An explicit parser keeps the result independent of which optional
            # parsers happen to be installed.
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for anchor in soup.find_all('a', class_='c-link'):
                href = anchor.get('href')
                name = anchor.get('download')
                # Both attributes are needed to fetch and store the file; an
                # anchor missing either one is skipped so the two lists always
                # stay the same length.
                if href and '/api/' in href and name:
                    links_download.append('https://www.citylines.co' + href)
                    files_names.append(name)
    finally:
        # Close the browser even when scraping fails half-way.
        driver.quit()

    return pd.DataFrame({'file_name': files_names, 'link': links_download})

def download_data(files_df, target_dir=None):
    """Download every file listed in ``files_df``.

    Parameters
    ----------
    files_df : pandas.DataFrame
        Table with a ``file_name`` column (name to store the file under) and a
        ``link`` column (URL to fetch).
    target_dir : str, optional
        Folder receiving the files. Defaults to the notebook-level
        ``localDataUrl`` setting. The folder is created when missing.
    """
    if target_dir is None:
        target_dir = localDataUrl
    os.makedirs(target_dir, exist_ok=True)

    total = files_df.shape[0]
    # Count positions explicitly so the progress display does not depend on the
    # frame having a 0-based integer index.
    for position, (_, row) in enumerate(files_df.iterrows(), start=1):
        # File names come from a scraped web page: keep only the last path
        # component so a name such as "../x" cannot leave target_dir.
        file_name = os.path.join(target_dir, os.path.basename(row['file_name']))
        print('Downloading... ' + str(position) + '/' + str(total) + ' : ' + file_name)

        urllib.request.urlretrieve(row['link'], file_name)

In [None]:
# Merge every downloaded file of one kind (e.g. all "*stations.geojson" files)
# into a single table.
import json

try:  # pandas >= 1.0 exposes json_normalize at the top level
    from pandas import json_normalize
except ImportError:  # older pandas only ships it under pandas.io.json
    from pandas.io.json import json_normalize


def _first_dict(cell):
    """Return the first element of a non-empty list/tuple cell when it is a dict, else None."""
    if isinstance(cell, (list, tuple)) and len(cell) > 0 and isinstance(cell[0], dict):
        return cell[0]
    return None


def _short_unique_names(columns):
    """Strip the dotted prefix of each column name, numbering repeats ("type", "type.1", ...)."""
    seen = {}
    names = []
    for column in columns:
        short = str(column).split('.')[-1]
        count = seen.get(short, 0)
        seen[short] = count + 1
        names.append(short if count == 0 else short + '.' + str(count))
    return names


def _flatten_records(records):
    """Turn a list of nested dicts into a flat DataFrame, one row per record."""
    # A single json_normalize call replaces growing the frame one record at a time.
    table = json_normalize(records).sort_index(axis=1)

    # further expand all left dict in columns: a cell holding a list of dicts
    # is replaced by the columns of its first dict.
    for column in list(table.columns):
        heads = [_first_dict(cell) for cell in table[column]]
        if any(head is not None for head in heads):
            expanded = json_normalize([head or {} for head in heads])
            table = pd.concat([table.drop(column, axis=1), expanded], axis=1)

    return table


def mergeFiles(file_name, files_folder_path, main_node):
    """Merge all files of ``files_folder_path`` whose name contains ``file_name``.

    Parameters
    ----------
    file_name : str
        Substring selecting the files to merge (e.g. ``'stations.geojson'``).
    files_folder_path : str
        Folder holding the downloaded JSON files.
    main_node : str
        Top-level JSON key whose value is the list of records to flatten.

    Returns
    -------
    pandas.DataFrame
        One row per record of every selected file, with an added ``city``
        column taken from the part of the file name before the first ``_``.
        Empty when no file matches.
    """
    # Sorted so the row order does not depend on the directory listing order.
    selected_files = sorted(f for f in os.listdir(files_folder_path) if file_name in f)

    frames = []
    for fl in selected_files:
        print('Merging file... ' + fl)
        with open(os.path.join(files_folder_path, fl), encoding='utf-8') as data_file:
            data = json.load(data_file)

        df = _flatten_records(data[main_node])
        df['city'] = fl.split('_')[0]
        # Column names must be unique for the final concat to align them.
        df.columns = _short_unique_names(df.columns)
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True, sort=True)


# Example (run once the data sets have been downloaded):
# df_stations = mergeFiles('stations.geojson', files_folder_path = localDataUrl, main_node = 'features')
# df_stations

# Nested json files handling
Here, the nested Amsterdam stations JSON file is converted to a pandas DataFrame. The goal is to automate this and do it for all the downloaded data sets. The code above is intended to do that but still needs debugging. The end goal is a single .csv file containing all the stations of all the cities.

In [8]:
#### TESTS
# Flatten the nested Amsterdam stations GeoJSON into a flat table and export it as CSV.
import json

try:  # pandas >= 1.0 exposes json_normalize at the top level
    from pandas import json_normalize
except ImportError:  # older pandas only ships it under pandas.io.json
    from pandas.io.json import json_normalize

AMSTERDAM_STATIONS_PATH = '/Volumes/Disk2/Courses MA3/MA3 - ADA/AIRBNB data/Transports/amsterdam_stations.geojson'

with open(AMSTERDAM_STATIONS_PATH, encoding='utf-8') as data_file:
    data = json.load(data_file)

# A single json_normalize call replaces growing the frame one feature at a
# time; the columns are sorted to keep the alphabetical layout that the former
# per-feature concat(sort = True) produced.
df = json_normalize(data['features']).sort_index(axis=1)


def _first_dict(cell):
    """Return the first element of a non-empty list/tuple cell when it is a dict, else None."""
    if isinstance(cell, (list, tuple)) and len(cell) > 0 and isinstance(cell[0], dict):
        return cell[0]
    return None


# Replace every column whose cells hold a list of dicts by the columns of the
# first dict of each cell (cells without one contribute an empty row).
for column in list(df.columns):
    heads = [_first_dict(cell) for cell in df[column]]
    if any(head is not None for head in heads):
        expanded = json_normalize([head or {} for head in heads])
        df = pd.concat([df, expanded], axis = 1, sort = True).drop(column, axis = 1)

# Keep only the last segment of the dotted column names produced by json_normalize.
df.columns = [s.split('.')[-1] for s in df.columns.values]

df.to_csv('Amsterdam_stations.csv')

Unnamed: 0,coordinates,type,buildstart,closure,id,klass,name,opening,osm_id,osm_metadata,osm_tags,type.1,line,line_url_name,system
0,"[4.83461, 52.34597]",Point,1992,999999,17907,Station,Henk Sneevlietweg,1997,,,,Feature,51.0,2074-51,GVB Metro
1,"[4.85758, 52.3384]",Point,1992,999999,17908,Station,Amstelveenseweg,1997,,,,Feature,51.0,2074-51,GVB Metro
2,"[4.87374, 52.33918]",Point,1992,999999,17909,Station,Zuid,1997,,,,Feature,52.0,2075-52,GVB Metro
3,"[4.85033, 52.39516]",Point,1992,999999,17901,Station,Isorlatorweg,1997,,,,Feature,51.0,2074-51,GVB Metro
4,"[4.83892, 52.38905]",Point,1992,999999,17900,Station,Sloterdijk,1997,,,,Feature,51.0,2074-51,GVB Metro
5,"[4.83817, 52.37936]",Point,1992,999999,17902,Station,De Vlughtlaan,1997,,,,Feature,51.0,2074-51,GVB Metro
6,"[4.83446, 52.35784]",Point,1992,999999,17905,Station,Lelylaan,1997,,,,Feature,51.0,2074-51,GVB Metro
7,"[4.83404, 52.36437]",Point,1992,999999,17904,Station,Postjesweg,1997,,,,Feature,51.0,2074-51,GVB Metro
8,"[4.83547, 52.37286]",Point,1992,999999,17903,Station,Jan van Galenstraat,1997,,,,Feature,51.0,2074-51,GVB Metro
9,"[4.83442, 52.35219]",Point,1992,999999,17906,Station,Heemstedestraat,1997,,,,Feature,51.0,2074-51,GVB Metro
