In [1]:
import pandas as pd
import sys
import yaml
from sqlalchemy import create_engine
import os.path
import cx_Oracle
if os.path.isfile('postgres_credentials.txt'): 
    with open('postgres_credentials.txt') as f:
        engine = create_engine(f.read())
elif os.path.isfile('oracle_credentials.yml'):
    with open("oracle_credentials.yml", 'r') as f:
        credentials = yaml.load(f)
    import sqlalchemy as sa
    oracle_db = sa.create_engine(credentials["connstring"])
    cx_Oracle.init_oracle_client(lib_dir=credentials["clientpath"])
    engine = oracle_db.connect()
    # dsn_tns = cx_Oracle.makedsn(credentials["ip"], credentials["port"], credentials["sid"])
    # engine = cx_Oracle.connect(credentials["user"], credentials["pwd"], dsn_tns)
    # make sure that the user exists in Oracle; e.g.,
    # CREATE USER ... IDENTIFIED BY ...;
    # GRANT ALL PRIVILEGES TO ...;
    # cur = engine.cursor()
    # cur.execute("select 1 from dual")
    # pd.DataFrame(columns=["bar"]).to_sql('foo', engine, if_exists='replace', index=False)
df = pd.read_csv("dataset_weekly.csv", delimiter=",")
df = df[["country", "continent", "country_code", "population", "indicator", "weekly_count", "year_week"]]
df

  credentials = yaml.load(f)


Unnamed: 0,country,continent,country_code,population,indicator,weekly_count,year_week
0,Afghanistan,Asia,AFG,38928341,cases,0,2020-01
1,Afghanistan,Asia,AFG,38928341,cases,0,2020-02
2,Afghanistan,Asia,AFG,38928341,cases,0,2020-03
3,Afghanistan,Asia,AFG,38928341,cases,0,2020-04
4,Afghanistan,Asia,AFG,38928341,cases,0,2020-05
...,...,...,...,...,...,...,...
30327,Zimbabwe,Africa,ZWE,14862927,deaths,4,2021-20
30328,Zimbabwe,Africa,ZWE,14862927,deaths,8,2021-21
30329,Zimbabwe,Africa,ZWE,14862927,deaths,12,2021-22
30330,Zimbabwe,Africa,ZWE,14862927,deaths,26,2021-23


Drop duplicates / null

In [2]:
df = df.drop_duplicates().dropna()
df = df[["country", "continent", "population", "indicator", "weekly_count", "year_week"]]
df

Unnamed: 0,country,continent,population,indicator,weekly_count,year_week
0,Afghanistan,Asia,38928341,cases,0,2020-01
1,Afghanistan,Asia,38928341,cases,0,2020-02
2,Afghanistan,Asia,38928341,cases,0,2020-03
3,Afghanistan,Asia,38928341,cases,0,2020-04
4,Afghanistan,Asia,38928341,cases,0,2020-05
...,...,...,...,...,...,...
30327,Zimbabwe,Africa,14862927,deaths,4,2021-20
30328,Zimbabwe,Africa,14862927,deaths,8,2021-21
30329,Zimbabwe,Africa,14862927,deaths,12,2021-22
30330,Zimbabwe,Africa,14862927,deaths,26,2021-23


Do some manual value mapping

In [3]:
# type IRIS (H = habitation, A = activity, D = autre, Z = Non disponible)
df["year"] = df["year_week"].apply(lambda x: x.split("-")[0])
def pick_month(x):
    return x 
    # week = int(x.split("-")[1]) - 1
    # year = x.split("-")[0]
    # if week <= 20:
    #     return year + "-" + str(int(week / 5))
    # else:
    #     week -= 20
    #     return year + "-" + str(int(week / 4) + 4)

def replace_month(x):
    month = int(x.split("-")[1])-1
    year = x.split("-")[0]
    if month < 5:
        month='JAN'
    elif month < 9:
        month='FEB'
    elif month < 14:
        month='MAR'
    elif month < 18:
        month='APR'
    elif month < 23:
        month='MAY'
    elif month < 27:
        month='JUN'
    elif month < 32:
        month='JUL'
    elif month < 37:
        month='AGO'
    elif month < 41:
        month='SEP'
    elif month < 45:
        month='OCT'
    elif month < 49:
        month='NOV'
    elif month < 53:
        month='DEC'
    else:
        print(x)
        sys.exit(1)
    return year + "-" + month

df["month"] = df["year_week"].apply(lambda x: replace_month(pick_month(x)))
df["year_week"] = df.apply(lambda x: x["month"] + "-" + x["year_week"].split("-")[1], axis=1)
df = df.rename(columns={'year_week': 'week'})

df

Unnamed: 0,country,continent,population,indicator,weekly_count,week,year,month
0,Afghanistan,Asia,38928341,cases,0,2020-JAN-01,2020,2020-JAN
1,Afghanistan,Asia,38928341,cases,0,2020-JAN-02,2020,2020-JAN
2,Afghanistan,Asia,38928341,cases,0,2020-JAN-03,2020,2020-JAN
3,Afghanistan,Asia,38928341,cases,0,2020-JAN-04,2020,2020-JAN
4,Afghanistan,Asia,38928341,cases,0,2020-JAN-05,2020,2020-JAN
...,...,...,...,...,...,...,...,...
30327,Zimbabwe,Africa,14862927,deaths,4,2021-MAY-20,2021,2021-MAY
30328,Zimbabwe,Africa,14862927,deaths,8,2021-MAY-21,2021,2021-MAY
30329,Zimbabwe,Africa,14862927,deaths,12,2021-MAY-22,2021,2021-MAY
30330,Zimbabwe,Africa,14862927,deaths,26,2021-MAY-23,2021,2021-MAY


Flatten indicator to deaths and cases

In [4]:
def df_to_row(x):
    cases = x[x["indicator"] == "cases"]["weekly_count"].tolist()[0]
    deaths = x[x["indicator"] == "deaths"]["weekly_count"].tolist()[0]
    df = pd.DataFrame(columns = ["deaths", "cases"])
    df.loc[0] = [deaths, cases]
    # df["deaths"] = deaths
    # df["cases"] = cases
    return df
    
df = df.groupby(["country", "continent", "population", "week", "year", "month"]).apply(lambda x: df_to_row(x)).reset_index()
df

Unnamed: 0,country,continent,population,week,year,month,level_6,deaths,cases
0,Afghanistan,Asia,38928341,2020-AGO-33,2020,2020-AGO,0,63,542
1,Afghanistan,Asia,38928341,2020-AGO-34,2020,2020-AGO,0,12,403
2,Afghanistan,Asia,38928341,2020-AGO-35,2020,2020-AGO,0,15,163
3,Afghanistan,Asia,38928341,2020-AGO-36,2020,2020-AGO,0,10,236
4,Afghanistan,Asia,38928341,2020-AGO-37,2020,2020-AGO,0,8,318
...,...,...,...,...,...,...,...,...,...
14699,Zimbabwe,Africa,14862927,2021-MAY-19,2021,2021-MAY,0,6,141
14700,Zimbabwe,Africa,14862927,2021-MAY-20,2021,2021-MAY,0,4,122
14701,Zimbabwe,Africa,14862927,2021-MAY-21,2021,2021-MAY,0,8,262
14702,Zimbabwe,Africa,14862927,2021-MAY-22,2021,2021-MAY,0,12,245


In [5]:
df.drop(labels=["level_6"], inplace=True, axis=1)
ft = df[["country", "deaths", "cases", "week"]].drop_duplicates()
ft.to_csv("generated/ft.csv", index=False)
dt1 = df[["country", "continent", "population"]].drop_duplicates()
dt1.to_csv("generated/dt_space.csv", index=False)
dt2 = df[["year", "month", "week"]].drop_duplicates()
dt2.to_csv("generated/dt_time.csv", index=False)

Write the dataframe to oracle (if needed)

Note that this writes strings as Oracle CLOB, which are a mess to join. An ugly workaround is

```
create table foo (country varchar2(255), continent varchar2(255), population varchar2(255));
insert into foo select country, continent, population from dt_space;
drop table dt_space;
rename foo to dt_space;

create table foo (week varchar2(255), year varchar2(255), month varchar2(255));
insert into foo select week, year, month from dt_time;
drop table dt_time;
rename foo to dt_time;

create table foo (week varchar2(255), country varchar2(255), deaths int, cases int);
insert into foo select week, country, deaths, cases from ft;
drop table ft;
rename foo to ft;

alter table ft add primary key(week, country);
alter table dt_space add primary key(country);
alter table dt_time add primary key(week);
alter table ft ADD CONSTRAINT fk_time foreign key (week) references dt_time(week);
alter table ft ADD CONSTRAINT fk_space foreign key (country) references dt_space(country);
```

In [8]:
# df.to_sql('covid_raw_data', engine, if_exists='replace', index=False)
ft.to_sql('ft', engine, if_exists='replace', index=False, chunksize=10000) # , method='multi'
print("Done ft")
dt1.to_sql('dt_space', engine, if_exists='replace', index=False, chunksize=10000)  # , method='multi'
print("Done dt_space")
dt2.to_sql('dt_time', engine, if_exists='replace', index=False, chunksize=10000)  # , method='multi'
print("Done dt_time")

Done dt_space
Done dt_time
