# Criação de dados

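A proposta deste notebook é criar um DataFrame de dados fictícios (mock) e exportá-lo como um script SQL de carga. O DataFrame contém as seguintes colunas:
- id: identificador único do registro (UUID)
- data: data e hora do registro
- username: usuário (e-mail)
- company: empresa (identificador da empresa à qual o usuário pertence)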

In [60]:
import pandas as pd
from faker import Faker
import uuid
from datetime import datetime
from random import choice

In [13]:
# Faker instance restricted to the Brazilian Portuguese locale.
fake = Faker(locale=["pt_BR"])

In [29]:
# Number of rows to generate for the table.
lines: int = 100_000

In [62]:


# Mock data generation
#
# NOTE(review): no random seed is set in the visible cells (Faker, random.choice
# and uuid4 are all unseeded), so every run produces a different dataset.

# Pool sizes derived from the row count: one company per 100 rows and one
# user per 5 rows (1k companies and 20k users when lines = 100_000).
company_count = int(lines/100)
user_count = int(lines/5)

base_company_list = [fake.unique.company_id() for _ in range(company_count)]
base_user_list = [fake.unique.email() for _ in range(user_count)]

# Each user is paired once with a randomly chosen company; rows are then drawn
# from these fixed pairs, so a given user always belongs to the same company.
user_company = [(email, choice(base_company_list)) for email in base_user_list]

# One (user, company) pair per output row, drawn with replacement.
full_user_company = [choice(user_company) for _ in range(lines)]

row_ids = [uuid.uuid4() for _ in range(lines)]

# Timestamps start at 2022-01-01; the end bound is left at Faker's default.
period_start = datetime(2022, 1, 1)
timestamps = [fake.date_time_between(start_date=period_start) for _ in range(lines)]

data = {
    "id": row_ids,
    "data": timestamps,
    "username": [email for email, _ in full_user_company],
    "company": [company_id for _, company_id in full_user_company],
}


In [63]:
# Build the DataFrame: each key of `data` becomes a column.
df = pd.DataFrame(data=data)

In [64]:
# Preview the first five generated rows.
df.head(n=5)

Unnamed: 0,id,data,username,company
0,9d9a055f-ed10-4330-a904-6b32ccd13b8a,2022-05-03 22:39:56,maria-claracarvalho@example.org,9713245000130
1,6b777185-623a-44d7-8c7a-3a6493874f60,2022-11-27 14:44:13,carlos-eduardo98@example.org,82340567000171
2,d636aa25-5e06-4ccd-8725-5f37d89fc057,2023-08-19 19:50:08,felipefreitas@example.org,71032986000120
3,8bc06af1-7266-4465-be21-55604ce9d657,2022-06-05 19:23:34,joao94@example.org,97184036000104
4,bd8024c5-ff94-401f-bf32-3a025dee3c62,2023-07-06 21:23:52,pedro-lucasfarias@example.net,69532871000152


In [65]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   id        100000 non-null  object        
 1   data      100000 non-null  datetime64[ns]
 2   username  100000 non-null  object        
 3   company   100000 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 3.1+ MB


In [66]:
# Generate the SQL file: a CREATE TABLE statement followed by one INSERT per
# DataFrame row.


def _sql_literal(value):
    """Render ``value`` as a single-quoted SQL string literal.

    The value is converted with ``str`` and any embedded single quote is
    doubled, so a value containing an apostrophe cannot end the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


# The explicit encoding keeps the non-ASCII text written below identical on
# every platform instead of depending on the locale's default encoding.
with open("../docker_databases/init.sql", "w", encoding="utf-8") as f:
    f.write("\n\n-- Criação da tabela")
    f.write("""
    CREATE TABLE t_ponto (
          id VARCHAR(255) PRIMARY KEY,
          data TIMESTAMP,
          username VARCHAR(255),
          company VARCHAR(255)
    );\n
    """)

    f.write("\n\n-- Exportação dos dados\n\n")
    # Walk the rows once with itertuples instead of doing four scalar
    # df.iloc lookups per row; the first four columns are read by position
    # (id, data, username, company).
    f.writelines(
        "INSERT INTO t_ponto (id, data, username, company) VALUES "
        f"({_sql_literal(row[0])}, {_sql_literal(row[1])}, "
        f"{_sql_literal(row[2])}, {_sql_literal(row[3])});\n"
        for row in df.itertuples(index=False, name=None)
    )

In [67]:
# Marker printed when the notebook reaches its last cell.
print("Fim")

Fim
