In [1]:
import pyarrow

In [2]:
import os
import random
import string
import pandas as pd
from datetime import datetime
import time
import json
import numpy as np

def generate_random_variable(value_type: str) -> str:
    """
    Generate a random value for the given type name, always rendered as a string.

    Args:
        value_type: Name of the kind of value to generate. Recognised names are
            'int', 'str', 'phone', 'long_int', 'float', 'operation',
            'timestamp' and 'date'.

    Returns:
        The generated value as a string. Any unrecognised ``value_type``
        yields an empty string.
    """

    if value_type == 'int':
        # Integer in [0, 20).
        return str(random.randrange(20))
    elif value_type == 'str':
        # Ten random lowercase ASCII letters.
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(10))
    elif value_type == 'phone':
        # Nine decimal digits; randrange(10) so that the digit 9 can occur too.
        return ''.join(str(random.randrange(10)) for _ in range(9))
    elif value_type == 'long_int':
        # Eleven decimal digits, drawn from the full 0-9 range as well.
        return ''.join(str(random.randrange(10)) for _ in range(11))
    elif value_type == 'float':
        # Float in [0, 1) written with a comma as the decimal separator.
        return str(random.random()).replace(".", ",")
    elif value_type == 'operation':
        return random.choice(['INSERT', 'UPDATE', 'DELETE'])
    elif value_type == 'timestamp':
        # A fixed value is returned instead of str(time.time()); presumably so
        # the generated data is stable between runs -- confirm before changing.
        return "1563551566.9326966"
    elif value_type == 'date':
        # Current local time, formatted as year-month-day-hour-minute-second.
        return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S')
    else:
        return ''


def generate_random_row(cols: dict) -> dict:
    """
    Build one dataframe row filled with random values.

    Args:
        cols: Mapping from column name to the value type passed to
            ``generate_random_variable`` for that column.

    Returns:
        Dictionary with the same keys as ``cols``, each mapped to a freshly
        generated value.
    """

    return {column: generate_random_variable(kind) for column, kind in cols.items()}

In [3]:
# Column name -> value type understood by generate_random_variable.
cols = {
    'unique_id': 'int',
    'name': 'str',
    'phone': 'phone',
    'cpf': 'long_int',
    'age': 'int',
    'budget': 'float',
    'timestamp': 'timestamp',
    'line': 'str',
    'retry': 'int',
    'update_date': 'date',
    'source_files': 'str',
}
num_rows = 20

# Generate every row first, then build the frame in a single step.
df = pd.DataFrame([generate_random_row(cols) for _ in range(num_rows)])

# Overwrite the randomly drawn ids with a shuffled 0..num_rows-1 sequence so
# that no id is repeated.
df['unique_id'] = np.random.permutation(num_rows)

In [4]:
# Display the generated frame for a visual sanity check.
df

Unnamed: 0,age,budget,cpf,line,name,phone,retry,source_files,timestamp,unique_id,update_date
0,18,9387888614917022,72011821002,taxcvuhodh,hxpevhgkxj,631352582,2,txezeplqbo,1563551566.9326966,15,2019-07-25-11-18-40
1,6,8323758282817877,37370402438,gazzmrfhzk,qblvdpkggj,523453733,10,xvxtxcnmti,1563551566.9326966,9,2019-07-25-11-18-40
2,13,938481444935233,28718713071,ysqxqhlvgb,mlwhecflmx,682178076,19,apebuoykdb,1563551566.9326966,5,2019-07-25-11-18-40
3,0,4303791150095593,50576615770,rxrpewfrug,nrlwltwifa,546848313,9,wgkgsgtwmf,1563551566.9326966,14,2019-07-25-11-18-40
4,7,746746715834829,86015107566,vtgacbwipm,crzwjhubpb,655836270,0,rivdxrwjea,1563551566.9326966,8,2019-07-25-11-18-40
5,6,9546045939852003,42670614412,sfubxxwcnx,ugcomjmgnp,827373533,9,bkbkhibjew,1563551566.9326966,13,2019-07-25-11-18-40
6,10,9419315657665351,78123473570,kgiqbfntdx,gukzvwnoos,645458382,13,rkyziuszja,1563551566.9326966,1,2019-07-25-11-18-40
7,15,2882736557565153,74131668513,bmomgcjkej,chuamcsbdr,15875830,8,nexfjkjlaz,1563551566.9326966,6,2019-07-25-11-18-40
8,19,11065196887310647,31366554813,lycygdmola,esqreodoup,466757023,2,ruzkdnaoak,1563551566.9326966,2,2019-07-25-11-18-40
9,10,809113266119962,53006851844,mizegyrzhb,mmhuthsiew,880455667,13,dztufegcef,1563551566.9326966,4,2019-07-25-11-18-40


In [5]:
# Write the generated frame to a Parquet file; index=False leaves the
# DataFrame index out of the written file.
df.to_parquet('../GIT/rdr010/data_lake/tests/resources/test_data.parquet', index=False)

In [6]:
import pyarrow as pa
# `import pyarrow` does not load the `parquet` submodule, so import it
# explicitly instead of relying on another library having imported it first.
import pyarrow.parquet as pq

# Read the Parquet file back as a pyarrow Table.
t = pq.read_table('../GIT/rdr010/data_lake/tests/resources/test_data.parquet')

In [7]:
# NOTE(review): this encodes the path *string* as UTF-8 bytes; it does not
# read the file's contents. `bt` is not used by the cells visible here.
bt = bytearray('../GIT/rdr010/data_lake/tests/resources/test_data.parquet', 'utf-8')

In [8]:
# Load the Parquet file's raw bytes. This rebinds `t`, which was previously
# assigned the result of the pyarrow read_table call.
with open('../GIT/rdr010/data_lake/tests/resources/test_data.parquet', mode='rb') as file:
    t = file.read()

In [11]:
import io
# Wrap the bytes in a file-like BytesIO object so pandas can parse them
# in memory. Expects `t` to hold the bytes read from the Parquet file.
df2 = pd.read_parquet(io.BytesIO(t))

In [12]:
# Show the unique_id column of the frame parsed from the in-memory bytes.
df2["unique_id"]

0     15
1      9
2      5
3     14
4      8
5     13
6      1
7      6
8      2
9      4
10    16
11    10
12    12
13    19
14    18
15     3
16     7
17    11
18     0
19    17
Name: unique_id, dtype: int64