# Criação de dados

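A proposta deste notebook é criar um DataFrame com dados fictícios (mockados) contendo as seguintes colunas:
- `id`: identificador único do registro
- `insert_date`: data e hora do registro
- `username`: usuário (e-mail)
- `company`: identificador da empresa à qual o usuário pertence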

In [60]:
import random
import uuid
from datetime import datetime
from random import choice

import pandas as pd
from faker import Faker

In [13]:
# Fixed seed so that a fresh "Restart Kernel -> Run All" regenerates the same
# Faker values (e-mails, company ids) and the same `choice` picks.
# Not covered by the seed: uuid.uuid4() (does not use these generators) and
# any Faker date range whose upper bound is the current time.
SEED = 42

fake = Faker(["pt_BR"])
Faker.seed(SEED)   # seeds the random generator shared by Faker instances
random.seed(SEED)  # seeds the stdlib generator used by `random.choice`

In [29]:
# Number of records (rows) to generate for the table

lines = 100_000

In [80]:


# Data generation

# Pool sizes are derived from the row count.
n_companies = lines // 100  # 1k
n_users = lines // 5  # 20k

base_company_list = [fake.unique.company_id() for _ in range(n_companies)]
base_user_list = [fake.unique.email() for _ in range(n_users)]

# Every user is paired with one company up front; rows are then sampled from
# these fixed (user, company) pairs, so a given user always shows up with the
# same company.
user_company = [(email, choice(base_company_list)) for email in base_user_list]
full_user_company = [choice(user_company) for _ in range(lines)]

data = {
    "id": [uuid.uuid4() for _ in range(lines)],
    "insert_date": [
        fake.date_time_between(start_date=datetime(2023, 10, 1))
        for _ in range(lines)
    ],
    "username": [email for email, _ in full_user_company],
    "company": [company_id for _, company_id in full_user_company],
}


In [81]:
# Build the DataFrame from the generated columns in `data`

df = pd.DataFrame(data)

In [82]:
# Preview the first rows of the generated data
df.head()

Unnamed: 0,id,insert_date,username,company
0,5a3aa845-4af5-4fc2-bd09-0b4e81c82de0,2023-10-05 01:04:09,alana55@example.com,25316098000111
1,e2c82415-c478-456d-a145-b8eb462ce9e2,2023-10-01 17:28:28,da-motaisabelly@example.org,38615729000145
2,eaa5b4a9-e06d-4210-abbb-0c7488701510,2023-10-15 00:05:35,evelyn73@example.org,82350176000138
3,2e68b5fc-a014-47ba-8e36-60c4d37fd6e7,2023-10-15 21:01:54,santoshenrique@example.net,34726081000123
4,02dbf5b6-66ac-409d-829b-9d2d7f30cbd3,2023-10-08 07:06:08,peixotomaria-sophia@example.org,79326185000189


In [83]:
# Summary of the frame: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   id           100000 non-null  object        
 1   insert_date  100000 non-null  datetime64[ns]
 2   username     100000 non-null  object        
 3   company      100000 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 3.1+ MB


In [84]:
# Generating the SQL file

# This is the filename I used when running on my machine to create the script
filename = "../docker_databases/init.sql"

# Left here only so that you can run the notebook on your computer without
# overwriting the script behind the analyses I performed.
# filename = "../docker_databases/init_avaliacao.sql"


def sql_quote(value):
    """Return ``value`` rendered as a single-quoted SQL string literal.

    The value is converted with ``str()`` and any embedded single quote is
    doubled, so a quote inside the data cannot terminate the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


# encoding="utf-8": the SQL comments written below contain non-ASCII
# characters, so the result must not depend on the platform default encoding.
with open(filename, "w", encoding="utf-8") as f:
    f.write("\n\n-- Criação da tabela")
    f.write("""
    CREATE TABLE t_ponto (
          id VARCHAR(255) PRIMARY KEY,
          insert_date TIMESTAMP,
          username VARCHAR(255),
          company VARCHAR(255)
    );\n
    """)

    f.write("\n\n-- Exportação dos dados\n\n")

    # Columns are selected by name and rows are streamed with itertuples():
    # this avoids feeding index labels into positional lookups and replaces
    # four per-cell accesses per row with a single pass over the frame.
    rows = df[["id", "insert_date", "username", "company"]].itertuples(
        index=False, name=None
    )
    f.writelines(
        "INSERT INTO t_ponto (id, insert_date, username, company) "
        f"VALUES ({sql_quote(row_id)}, {sql_quote(insert_date)}, "
        f"{sql_quote(username)}, {sql_quote(company)});\n"
        for row_id, insert_date, username, company in rows
    )

In [85]:
# Prints "Fim" ("End") once this last cell is reached
print("Fim")

Fim
