# Basic ETL program (load, fix data types, remove duplicates, write and read) using pandas

# To run all steps, go to the menu Cell > Run All

# *

# Starting

# *

## Import libraries and initialize pandas

In [1]:
import pandas as pd
import numpy as np
import glob
from datetime import datetime
import json

## Function to convert data types according to a mapping

In [2]:
def fix_dtypes(df, list_col_dtypes):
    """Cast the columns of ``df`` to the types named in ``list_col_dtypes``.

    Args:
        df: DataFrame whose columns are converted. Columns are reassigned on
            this same object, so the frame passed in is modified.
        list_col_dtypes: Mapping of column name to a type label. Recognised
            labels are ``"int"``/``"bigint"``, ``"datetime64"`` and ``"str"``;
            a column with any other label is left untouched.

    Returns:
        The DataFrame with the listed columns converted.

    Raises:
        KeyError: If a column named in ``list_col_dtypes`` is not in ``df``.
    """
    for column, dtype_label in list_col_dtypes.items():
        if dtype_label in ("int", "bigint"):
            # Unparseable or missing values become NaN and are then stored as 0.
            # The fill is applied to this column only, so missing values in
            # the other columns of the frame are not rewritten.
            numeric = pd.to_numeric(df[column], errors="coerce")
            df[column] = numeric.fillna(0).astype(int)
        elif dtype_label == "datetime64":
            df[column] = pd.to_datetime(df[column])
        elif dtype_label == "str":
            df[column] = df[column].apply(str)

    return df

## Load the CSV file that was written by another program (write_csv.ipynb)

In [3]:
def load_csv_pandas(path_generated):
    """Load a CSV file into a DataFrame with a default integer index.

    Args:
        path_generated: Location of the CSV file (anything ``pd.read_csv``
            accepts). The first row is used as the header.

    Returns:
        A DataFrame holding every CSV column as a regular column.
    """
    # The first CSV column is read as the index and then moved back into the
    # columns, leaving a fresh 0..n-1 index on the returned frame.
    loaded = pd.read_csv(path_generated, index_col=0, header=0, low_memory=False)
    return loaded.reset_index()

## Load the mapping that holds the correct data types, the primary key of the data frame, and the column that guarantees the uniqueness of each row.

In [4]:
def load_mapping():
    """Return the static schema description used by the ETL steps.

    Returns:
        A three-item list, in this order:
        the primary-key column names, the name of the last-update column,
        and a dict mapping each column name to its type label.
    """
    dtype_by_column = dict([
        ("id", "int"),
        ("randow", "str"),
        ("update_timestamp", "datetime64"),
        ("boolean", "str"),
    ])
    key_columns = ["id"]
    last_update_column = "update_timestamp"

    return [key_columns, last_update_column, dtype_by_column]

## Function to remove duplicates from the DataFrame

In [5]:
def fix_duplicity(df, list_col_dtypes, primary_key, col_name_last_update):
    """Keep only the most recently updated row for each primary key.

    Args:
        df: DataFrame that may hold several rows per primary key. Its columns
            are first converted by ``fix_dtypes``.
        list_col_dtypes: Column-name to type-label mapping handed to
            ``fix_dtypes``.
        primary_key: Column name(s) that identify a record.
        col_name_last_update: Column used to order the rows of one key; the
            row with the greatest value is the one kept.

    Returns:
        A new DataFrame with one row per primary key, the key restored as
        regular column(s) and a fresh default index. Surviving rows keep
        their original relative order.
    """
    typed = fix_dtypes(df, list_col_dtypes)

    # Work on a re-indexed copy instead of setting the index in place, so no
    # temporary index or helper column is left on the frame that was passed in.
    keyed = typed.set_index(primary_key)

    # Rank the rows of each key from newest to oldest; ties are broken by row
    # order. Rows with a missing timestamp are ranked last rather than given
    # no rank, so a key whose only rows lack a timestamp is not dropped.
    recency_rank = keyed.groupby(keyed.index)[col_name_last_update].rank(
        method="first", ascending=False, na_option="bottom"
    )

    return keyed.loc[recency_rank == 1].reset_index()

## Write a Parquet file to the given path

In [6]:
def write_parquet(df, path):
    """Write ``df`` to ``path`` as a Snappy-compressed Parquet file.

    Args:
        df: DataFrame to persist.
        path: Destination file path.
    """
    # ``use_deprecated_int96_timestamps`` is an option of the pyarrow writer
    # that pandas forwards untouched, so the engine is pinned explicitly
    # instead of being left to pandas' automatic engine selection.
    df.to_parquet(
        path=path,
        engine="pyarrow",
        compression="snappy",
        use_deprecated_int96_timestamps=True,
    )

## Read the Parquet files from the given path

In [7]:
def read_parquet(path):
    """Read every ``*.snappy.parquet`` file matching ``path`` into one DataFrame.

    Args:
        path: Prefix placed directly in front of ``*.snappy.parquet`` to build
            the glob pattern; pass a directory with its trailing separator to
            read all such files in that directory.

    Returns:
        A single DataFrame with the rows of all matched files and a fresh
        default index.

    Raises:
        ValueError: If no file matches the pattern.
    """
    pattern = "{}*.snappy.parquet".format(path)

    # glob returns matches in an unspecified order; sorting keeps the row
    # order of the concatenated frame the same from run to run.
    files = sorted(glob.glob(pattern))
    if not files:
        raise ValueError("No parquet files found matching {!r}".format(pattern))

    frames = [pd.read_parquet(file_name) for file_name in files]
    return pd.concat(frames, ignore_index=True)

## Main process (the final cells print its execution time)

In [8]:
def process(number_rows):
    """Run the whole ETL: load the CSV, deduplicate, write Parquet, read it back.

    Args:
        number_rows: Row count used to build the name of the input CSV file
            (``generated_<number_rows>_rows.csv``).

    Returns:
        The DataFrame produced by ``read_parquet`` on the output directory.
    """
    # Extract: load the generated CSV from storage.
    csv_location = "/home/jovyan/work/artifacts/data/input/users/generated_{}_rows.csv".format(number_rows)
    raw_frame = load_csv_pandas(csv_location)

    # Schema: primary key, last-update column and column types.
    key_columns, last_update_column, dtype_by_column = load_mapping()

    # Transform: cast the columns and keep one row per primary key.
    unique_frame = fix_duplicity(raw_frame, dtype_by_column, key_columns, last_update_column)

    # Load: write the result as a single Parquet file ...
    write_parquet(
        unique_frame,
        "/home/jovyan/work/artifacts/data/output/pandas/file_pandas.snappy.parquet",
    )

    # ... and read back what is stored in the output directory.
    return read_parquet("/home/jovyan/work/artifacts/data/output/pandas/")

## Call the process, passing the number of rows you wrote to the CSV file beforehand.

In [9]:
# Time one full run of process() for number_rows=100.
start = datetime.now()
df = process(number_rows=100)
end = datetime.now()
# Print the elapsed wall-clock time (a timedelta).
print(end - start)

0:00:00.085846


## This script counts the rows in the DataFrame

In [10]:
# Time how long it takes to report the row count of the DataFrame.
start = datetime.now()
# df.shape[0] is the number of rows in df.
print("number of line on the DF {}".format(df.shape[0]))
end = datetime.now()
# Print the elapsed wall-clock time (a timedelta).
print(end - start)

number of line on the DF 100
0:00:00.000289
