### Import libraries

In [1]:
import pysftp
import re
import numpy as np
import pandas as pd

from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
import warnings
# Silence urllib3's InsecureRequestWarning for the whole session.
disable_warnings(InsecureRequestWarning)
# Ignore pandas PerformanceWarning messages.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): with this disabled, an assignment that lands on a temporary copy
# goes unreported — prefer explicit .loc assignments in the cells below.
pd.options.mode.chained_assignment = None

### Functions used in the project

In [None]:
def traverse_objects(obj, types = (list, tuple)):
    """Lazily flatten arbitrarily nested containers, depth first.

    Parameters
    ----------
    obj : object
        The value to walk. If it is an instance of ``types`` its elements are
        visited recursively, in order; anything else is treated as a leaf.
    types : type or tuple of types, default ``(list, tuple)``
        Container types that are descended into. Strings are not in the
        default, so they are yielded whole rather than character by character.

    Yields
    ------
    object
        Every leaf value found, in left-to-right order.
    """
    # Leaf: hand it back unchanged and stop.
    if not isinstance(obj, types):
        yield obj
        return
    # Container: delegate to the recursive generator for each element.
    for element in obj:
        yield from traverse_objects(element, types)

### Import HDF file from SFTP server

In [4]:
# Connection settings for the SFTP server that hosts the beer HDF5 file.
sftpHost = "localhost"
sftpPort = 22
uname = "wojci"
privateKeyFilePath = "./id_rsa"

# NOTE(security): hostkeys = None switches off host-key verification, so the
# server's identity is never checked (man-in-the-middle risk). Tolerable for a
# trusted/local server only; load a known_hosts file for anything else.
cnOpts = pysftp.CnOpts()
cnOpts.hostkeys = None

# The port is passed explicitly so that changing sftpPort above actually takes
# effect (it was previously defined but never handed to the connection).
with pysftp.Connection(
    host=sftpHost,
    port=sftpPort,
    username=uname,
    private_key=privateKeyFilePath,
    cnopts=cnOpts,
) as sftp:
    print("Connected to sftp server")
    sftp.cwd("./beer_files")
    # preserve_mtime=True is pysftp's option for copying the remote file's
    # modification time onto the local copy.
    # NOTE(review): the local ./downloaded_from_sftp directory presumably has to
    # exist beforehand — confirm, or create it before running this cell.
    sftp.get("./beer_files.h5", "./downloaded_from_sftp/beers.h5", preserve_mtime=True)
    print('File "beers.h5" downloaded correctly')

Connected to sftp server
File "beers.h5" downloaded correctly


### Read imported file as DataFrame

In [6]:
# Load the downloaded HDF5 store (dataset key 'df') read-only into a DataFrame
# and preview the first rows.
beers_hdf_path = './downloaded_from_sftp/beers.h5'
df = pd.read_hdf(beers_hdf_path, key='df', mode='r')
df.head()

Unnamed: 0,beer_name,name of style,name_of_substyle,beer_country,beer_state,rank_in_style,score,rank,abv [%],avg_score,...,brewery_city,brewery_state,brewery_country,brewery_map,brewery_website,brewery_type,brewery_adress,brewery_postal_code,brewery_telephone,brewery_notes
0,Ayinger Celebrator,Bocks,Bock - Doppelbock,Germany,,1.0,96.0,879.0,6.7,4.34,...,Aying,,Germany,https://maps.google.com/maps?oi=map&q=M%C3%BCn...,http://ayinger-bier.de,"[Brewery, Bar, Eatery]",Münchener Straße 21,85653,08095-90650,
1,Troegenator,Bocks,Bock - Doppelbock,United States,Pennsylvania,48.0,88.0,13560.0,8.2,3.95,...,Hershey,Pennsylvania,United States,https://maps.google.com/maps?oi=map&q=200+East...,http://troegs.com,"[Brewery, Bar, Eatery, Beer-to-go]",200 East,17033,(717) 534-1297,Sunday-Wednesday 11am-9pmThursday - sat 11am-1...
2,Spaten Optimator,Bocks,Bock - Doppelbock,Germany,,75.0,87.0,16263.0,7.6,3.9,...,München,,Germany,https://maps.google.com/maps?oi=map&q=Mars+Str...,http://franziskaner-weissbier.de,[Brewery],Mars Strasse 46-48,80335,(089) 51 221,OWNED BY ANHEUSER-BUSCH INBEVSee also: http://...
3,Salvator,Bocks,Bock - Doppelbock,Germany,,57.0,88.0,14654.0,7.9,3.93,...,München,,Germany,https://maps.google.com/maps?oi=map&q=Hochstra...,http://paulaner.de,[Brewery],Hochstraße 75,81541,089 / 4 80 051,
4,Weihenstephaner Korbinian,Bocks,Bock - Doppelbock,Germany,,5.0,93.0,2924.0,7.4,4.2,...,Freising,,Germany,https://maps.google.com/maps?oi=map&q=Alte+Aka...,http://weihenstephaner.de,"[Brewery, Bar, Eatery, Beer-to-go]",Alte Akademie 2,85354,+49 8161 5360,


### Lists of unique values for selected columns, which will be used as dimensions in the SQL database

In [7]:
# Map every distinct beer country to the list of distinct states recorded for it.
country_states_dict = {
    country: list(df.loc[df['beer_country'] == country, 'beer_state'].unique())
    for country in df['beer_country'].unique()
}

# Map every distinct beer style to the list of distinct substyles recorded for it.
beer_styles_list = list(df['name of style'].unique())
beer_styles_dict = {
    style: list(df.loc[df['name of style'] == style, 'name_of_substyle'].unique())
    for style in beer_styles_list
}
    
#list of abv
abv_list = list(df['abv [%]'].unique())

# Key range for the abv dimension: 0.0 up to (largest observed abv + 8.1) in 0.1 steps.
# - np.nanmax instead of the builtin max: unique() keeps NaN, and because every
#   comparison with NaN is False, max() returns a result that depends on where
#   the NaN happens to sit in the list.
# - np.round(..., 1): a 0.1 float step yields values such as 0.30000000000000004,
#   which would never match a one-decimal abv like 0.3 when used as a key.
# NOTE(review): the 8.1 headroom above the largest abv is unexplained here —
# confirm the intended upper bound.
abv_db_key = np.round(np.arange(0, np.nanmax(abv_list) + 8.1, 0.1), 1)

#list of breweries
brewery_list = list(df['brewery_name'].unique())

#list of unique brewery types
# Requires the cell defining traverse_objects to have been executed first
# (otherwise this cell fails with a NameError).
# A dict is used as an ordered set: first-seen order is kept and membership
# checks stay constant-time instead of rescanning a list for every value.
brewery_types_seen = {}
for value in traverse_objects(list(df['brewery_type'])):
    # Missing entries (None / NaN) are not strings and have no .strip(); skip them
    # instead of aborting the whole cell.
    if not isinstance(value, str):
        continue
    brewery_types_seen.setdefault(value.strip(), None)
brewery_types_unique_list = list(brewery_types_seen)
brewery_types_unique_list
#other columns that could also serve as dimensions: active, date_added

NameError: name 'traverse_objects' is not defined