In [13]:
import os
import zipfile

import pandas as pd

The `send` function uploads an arbitrary file from local storage (an Excel spreadsheet, a CSV file, or a zip archive) to a database that is also running locally. The `select` function returns the table produced by running the SQL script stored in the file whose path is passed as an argument. Of course, by changing the credentials, any other database can be accessed.

In [1]:
# Connection settings for the local PostgreSQL instance.  Every value can be
# overridden through the conventional PG* environment variables, so real
# credentials never need to be written into the notebook; the fallbacks are
# the original local-development values.
from urllib.parse import quote_plus

from sqlalchemy import create_engine

cred = {
    "host": os.environ.get("PGHOST", "localhost"),
    "dbname": os.environ.get("PGDATABASE", "yukontaf"),
    "user": os.environ.get("PGUSER", "glebsokolov"),
    "password": os.environ.get("PGPASSWORD", ""),
}

# User and password are percent-encoded so that characters such as "@", ":"
# or "/" cannot corrupt the connection URL.
con = create_engine(
    f'postgresql://{quote_plus(cred["user"])}:{quote_plus(cred["password"])}'
    f'@{cred["host"]}/{cred["dbname"]}'
)


def select(sql, con):
    """Run the SQL script stored in a file and return the result as a DataFrame.

    Parameters
    ----------
    sql : str or path-like
        Path to a text file containing the query to execute.
    con : connection or engine
        Anything ``pandas.read_sql`` accepts as its ``con`` argument.

    Returns
    -------
    pandas.DataFrame or None
        The query result, or ``None`` when the script could not be read or
        executed (a diagnostic message is printed in that case).
    """
    try:
        with open(sql) as f:
            query = f.read()
        return pd.read_sql(query, con)
    except Exception as exc:
        # Best-effort helper: report the failure instead of raising, but only
        # swallow ordinary errors (not KeyboardInterrupt/SystemExit) and show
        # the underlying cause so the problem can actually be diagnosed.
        print(
            "Table not found in this relation or there is a mistake in the script: "
            f"{type(exc).__name__}: {exc}"
        )
        return None

In [10]:
def _read_sheet(path, sheetn=None):
    """Read one worksheet of the Excel file at ``path``, prompting for the sheet if none is given."""
    if not sheetn:
        sheetn = input(f"Choose Sheet:{pd.ExcelFile(path).sheet_names}")
    return pd.read_excel(path, sheet_name=sheetn)


def _send_archive(name, path, con):
    """Load every .csv/.txt/.json member of the zip archive at ``path`` into its own table."""
    import tempfile

    # Extract into a throw-away directory: the source archive is left untouched
    # and nested folders are cleaned up automatically.
    with tempfile.TemporaryDirectory() as folder:
        with zipfile.ZipFile(path, "r") as zip_ref:
            zip_ref.extractall(folder)
        for root, _dirs, files in os.walk(folder):
            for file in files:
                stem, ext = os.path.splitext(file)
                ext = ext.lower()
                member = os.path.join(root, file)
                if ext in (".csv", ".txt"):
                    # .txt members are treated as space-separated tables.
                    sep = " " if ext == ".txt" else ","
                    # name == "zip" is the sentinel for "use the bare file name";
                    # any other name is used as a "<name>." prefix.
                    target = stem if name == "zip" else name + "." + stem
                    pd.read_csv(member, sep=sep).to_sql(
                        target, if_exists="replace", con=con
                    )
                elif ext == ".json":
                    pd.read_json(member).to_sql(stem, if_exists="replace", con=con)
    return None


def send(name=None, table=None, path=None, sheetn=None, con=None):
    """Write a DataFrame or a local file to a database table, replacing the table if it exists.

    Parameters
    ----------
    name : str, optional
        Target table name.  Required together with ``table``.  With ``path`` it
        is optional: when omitted, the file name without extension is used.
    table : pandas.DataFrame, optional
        Data to upload.  Takes precedence over ``path`` when both are given.
    path : str, optional
        File to upload.  Supported: ``.csv``, ``.xlsx`` and ``.zip``.
        A ``.zip`` *with* ``name`` is extracted and each ``.csv``/``.txt``/
        ``.json`` member is loaded into its own table (``name="zip"`` keeps the
        bare member names, any other ``name`` becomes a ``"<name>."`` prefix for
        csv/txt members).  A ``.zip`` *without* ``name`` is handed directly to
        ``pandas.read_csv`` as a compressed CSV.
    sheetn : str, optional
        Worksheet to read from an ``.xlsx`` file; asked for interactively when
        not supplied.
    con : connection or engine, optional
        Database connection passed to ``DataFrame.to_sql``.  Defaults to the
        notebook-level ``con``, looked up at call time so that re-creating the
        engine takes effect without redefining this function.

    Returns
    -------
    int or None
        Whatever ``DataFrame.to_sql`` returns for single-table uploads;
        ``None`` for multi-file zip uploads.

    Raises
    ------
    ValueError
        If neither ``table`` nor ``path`` is given, if ``table`` is given
        without ``name``, if no connection is available, or if the file type is
        not supported.
    """
    if con is None:
        # The parameter shadows the module-level engine, hence the explicit lookup.
        con = globals().get("con")
        if con is None:
            raise ValueError("no database connection: pass `con` or define a global `con`")

    if table is not None:
        if not name:
            raise ValueError("`name` is required when sending a DataFrame")
        return table.to_sql(name, if_exists="replace", con=con)

    if not path:
        raise ValueError("either `table` or `path` must be provided")

    stem, ext = os.path.splitext(os.path.basename(path))
    ext = ext.lower()

    if ext == ".zip" and name:
        return _send_archive(name, path, con)

    if ext in (".csv", ".zip"):
        # pandas infers zip compression from the extension (single-file archives).
        frame = pd.read_csv(path)
    elif ext == ".xlsx":
        frame = _read_sheet(path, sheetn)
    else:
        raise ValueError(f"unsupported file type: {path!r}")

    return frame.to_sql(name or stem, if_exists="replace", con=con)

Next, I will process some data

We will use two datasets provided by the World Bank:
    
* International arrivals -- how many tourists have come to the country
 * https://data.worldbank.org/indicator/ST.INT.ARVL
* Receipts -- how much they spent in the country (how much the locals received)
 * https://data.worldbank.org/indicator/ST.INT.RCPT.CD

Both datasets mix country-level data with regional aggregates, e.g. `CEB` for Central Europe and the Baltics.
They also contain columns from `1960` to `2018`, but data are populated only from `1995` onward.
For that reason we filter out the regions and drop the empty columns.

In [8]:
def process_world_bank_dataset(
    path, country_codes_path="Country_Codes.csv", metadata_path="Metadata.csv"
):
    """Load a World Bank indicator CSV and keep only real countries.

    Parameters
    ----------
    path : str
        Indicator CSV as downloaded from the World Bank; its first four lines
        are skipped before the header row.
    country_codes_path : str, default "Country_Codes.csv"
        CSV with an ``ISO3`` column listing country codes
        (from https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes).
    metadata_path : str, default "Metadata.csv"
        CSV with ``Country Code`` and ``Region`` columns (part of the World
        Bank data package).

    Returns
    -------
    pandas.DataFrame
        One row per country, with the indicator and ``ISO3`` columns removed,
        a ``Region`` column added, and all-empty rows and columns dropped.
    """
    raw = pd.read_csv(path, skiprows=4)

    # Keys are de-duplicated so a repeated code in a lookup file cannot
    # multiply rows in the merges below.
    iso_codes = pd.read_csv(country_codes_path)[["ISO3"]].drop_duplicates()
    regions = pd.read_csv(metadata_path)[["Country Code", "Region"]].drop_duplicates(
        subset="Country Code"
    )

    # The inner join against the ISO list removes regional aggregates.
    countries = raw.merge(
        iso_codes, left_on="Country Code", right_on="ISO3", how="inner"
    ).merge(regions, on="Country Code")

    # Drop unnecessary columns without mutating in place.
    countries = countries.drop(columns=["Indicator Name", "Indicator Code", "ISO3"])

    # Drop rows and columns that contain no data at all.
    countries = countries.dropna(how="all", axis="rows")
    countries = countries.dropna(how="all", axis="columns")

    return countries


def melt_world_bank(df, value_name):
    """Reshape a wide per-year frame into long form.

    The country name, country code and region columns are kept as identifiers;
    every other column is unpivoted into a ``years`` column (holding the former
    column label) and a column named ``value_name`` (holding the cell value).
    """
    identifier_columns = ["Country Name", "Country Code", "Region"]
    long_frame = pd.melt(
        df,
        id_vars=identifier_columns,
        var_name="years",
        value_name=value_name,
    )
    return long_frame

In [14]:
# Wide form: one row per country, one column per year.
wide_arrivals = process_world_bank_dataset(
    "API_ST.INT.ARVL_DS2_en_csv_v2_1345483.csv"
)
wide_receipts = process_world_bank_dataset("API_ST.INT.RCPT.CD_DS2_en_csv_v2_1351575.csv")

print(wide_arrivals.shape, wide_receipts.shape)

# Long form: one row per (country, year) for each indicator.
long_arr = melt_world_bank(wide_arrivals, "visitors")
long_receipts = melt_world_bank(wide_receipts, "receipts")

# Join both indicators on the country/year keys, then replace every missing
# value with 0.
# NOTE(review): after this step a 0 can mean "no data" as well as a real zero.
long_df = pd.merge(
    long_arr,
    long_receipts,
    on=["Country Name", "Country Code", "years", "Region"],
)
long_df = long_df.fillna(0)
long_df

(215, 27) (215, 27)


Unnamed: 0,Country Name,Country Code,Region,years,visitors,receipts
0,Aruba,ABW,Latin America & Caribbean,1995,619000.0,5.540000e+08
1,Afghanistan,AFG,South Asia,1995,0.0,0.000000e+00
2,Angola,AGO,Sub-Saharan Africa,1995,9000.0,2.700000e+07
3,Albania,ALB,Europe & Central Asia,1995,0.0,7.000000e+07
4,Andorra,AND,Europe & Central Asia,1995,0.0,0.000000e+00
...,...,...,...,...,...,...
5155,Samoa,WSM,East Asia & Pacific,2018,164000.0,1.913000e+08
5156,"Yemen, Rep.",YEM,Middle East & North Africa,2018,0.0,0.000000e+00
5157,South Africa,ZAF,Sub-Saharan Africa,2018,10472000.0,9.789000e+09
5158,Zambia,ZMB,Sub-Saharan Africa,2018,1072000.0,7.420000e+08


In [6]:
# Persist the three frames next to the notebook so later sessions can reload
# them without re-reading the raw CSV files.
for _frame, _target in (
    (wide_arrivals, "arr.plk"),
    (wide_receipts, "rec.plk"),
    (long_df, "long.plk"),
):
    _frame.to_pickle(_target)

In [11]:
# Upload the long-form frame to the database as table "worldbank_df".
send(table=long_df, name="worldbank_df")

160

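The value shown above is the row count reported by `to_sql` (it comes from the database driver and may not equal the exact number of rows written); since no exception was raised, the data has been written to the database successfully.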