### EDA (Exploratory Data Analysis) & Data Cleaning

In [1]:
# Parameters
# Default inputs for this run: service-account key file, source table and destination table.
# NOTE(review): presumably this is the Papermill "parameters" cell (a later comment mentions
# Papermill-injected parameters) — confirm the cell tag.
# NOTE(review): these act only as defaults — `credentials_path` is reassigned by the OS check
# in the next cell, `input_table` is reused as a loop variable and `output_table` is rebuilt
# in the export loop.
# NOTE(review): the key-file path is a hardcoded absolute local path.
credentials_path = "/mnt/c/Temp/desafiolh-445818-3cb0f62cb9ef.json"
input_table = "desafioadventureworks-446600.raw_data.humanresources_employee"
output_table = "desafioadventureworks-446600.raw_data_cleaned.humanresources_employee"


In [2]:

##import das bibliotecas e adequando colunas, linhas e formato de números

from google.cloud import bigquery
from dotenv import load_dotenv
import pandas as pd
import pandas_gbq as gbq
from google.oauth2 import service_account
from google.cloud.bigquery_storage import BigQueryReadClient
import seaborn as sns
import numpy as np
import os
import re
import json

# Load the variables declared in the local .env file into the process environment.
load_dotenv()

# Pick the service-account key location for the current OS (Windows vs. WSL/Linux).
credentials_path = (
    r"C:\Temp\desafiolh-445818-3cb0f62cb9ef.json"
    if os.name == "nt"
    else "/mnt/c/Temp/desafiolh-445818-3cb0f62cb9ef.json"
)

# Parameters injected by Papermill or set manually: only fill in defaults for names
# that are not defined yet.
if 'tables_to_process' not in locals():
    tables_to_process = ["desafioadventureworks-446600.raw_data.humanresources_employee"]

if 'output_dataset' not in locals():
    output_dataset = "desafioadventureworks-446600.raw_data_cleaned"

# BigQuery client: credentials come from the key file, the project from the environment.
credentials = service_account.Credentials.from_service_account_file(credentials_path)
client = bigquery.Client(
    credentials=credentials,
    project=os.environ.get("BIGQUERY_PROJECT"),
    location="us-central1",
)

# Show which credentials file the environment points to (labelled, then bare).
print("Credenciais do BigQuery:", os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"))
print(os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"))




Credenciais do BigQuery: /mnt/c/Temp/desafiolh-445818-3cb0f62cb9ef.json
/mnt/c/Temp/desafiolh-445818-3cb0f62cb9ef.json


In [3]:
# Show the fully-qualified tables this run will process.
print("Tabelas a processar:", tables_to_process)


Tabelas a processar: ['desafioadventureworks-446600.raw_data.humanresources_employee']


In [4]:
# Reload the service-account credentials and rebuild the BigQuery client, this time with
# the project id given as a literal.
credentials = service_account.Credentials.from_service_account_file(credentials_path)
client = bigquery.Client(credentials=credentials, project="desafioadventureworks-446600", location="us-central1")


# # Configure the BigQuery client
# client = bigquery.Client()

# Dataset name
dataset_id = 'raw_data'


In [5]:
# Display configuration: show every column and never truncate cell contents.
# The row limit is intentionally left at the pandas default.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = 1000

# Render floats with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

In [6]:
# List the tables available in the raw dataset.
# The dataset name is taken from `dataset_id` (set to 'raw_data' in an earlier cell)
# rather than repeating the string literal here.
tables = client.list_tables(dataset_id)
print("Tabelas disponíveis:")
for table in tables:
    print(table.table_id)




Tabelas disponíveis:
humanresources_employee


In [7]:
# Load the service-account credentials first, so that both clients below are built from
# the same freshly-read key file.
credentials = service_account.Credentials.from_service_account_file(credentials_path)

# BigQuery client used to run the queries
client = bigquery.Client(credentials=credentials, project="desafioadventureworks-446600", location="us-central1")

# BigQuery Storage client, passed to to_dataframe() when downloading the query results
bqstorage_client = BigQueryReadClient(credentials=credentials)


# Processed DataFrames, keyed by table name
processed_data = {}

# Read every requested table and store the resulting DataFrame
for input_table in tables_to_process:
    print(f"Processando tabela: {input_table}")

    # Last dotted component of the fully-qualified name is the table name
    table_name = input_table.split(".")[-1]

    # Step 1: read the whole table from BigQuery
    print("Lendo os dados do BigQuery...")
    query = f"SELECT * FROM `{input_table}`"
    EDA_humanresources_employee_raw = client.query(query).to_dataframe(bqstorage_client=bqstorage_client)

    # Step 2: turn JSON records into a tabular frame
    print("Transformando os dados para formato tabular...")

    # The frame is treated as JSON only when it has exactly one column holding text.
    # The emptiness check keeps .iloc[0, 0] from raising on a table with no rows.
    is_single_json_column = (
        EDA_humanresources_employee_raw.shape[1] == 1
        and not EDA_humanresources_employee_raw.empty
        and isinstance(EDA_humanresources_employee_raw.iloc[0, 0], str)
    )

    if is_single_json_column:
        try:
            print("Normalizando dados JSON...")
            # json.loads already maps JSON `null` to None; rewriting the text to "None"
            # would make it invalid JSON, so the records are parsed as-is.
            EDA_humanresources_employee = pd.json_normalize(
                EDA_humanresources_employee_raw.iloc[:, 0].apply(json.loads)
            )
        except Exception as e:
            print(f"Erro ao normalizar JSON: {e}")
            # On failure keep the raw data (the previous self-assignment raised
            # NameError on a fresh kernel instead of falling back).
            EDA_humanresources_employee = EDA_humanresources_employee_raw
    else:
        EDA_humanresources_employee = EDA_humanresources_employee_raw

    # Store the resulting DataFrame under its table name
    processed_data[table_name] = EDA_humanresources_employee
    print(f"Tabela {table_name} processada e armazenada com sucesso.")

# Completion message once every table has been handled
print("Todas as tabelas foram processadas com sucesso!")


Processando tabela: desafioadventureworks-446600.raw_data.humanresources_employee
Lendo os dados do BigQuery...
Transformando os dados para formato tabular...
Tabela humanresources_employee processada e armazenada com sucesso.
Todas as tabelas foram processadas com sucesso!


In [8]:
# Preview the first rows of the first column of the raw frame (the JSON records).
print(EDA_humanresources_employee_raw.iloc[:, 0].head())


0             {"birthdate":"1969-01-29","businessentityid":1,"currentflag":true,"gender":"M","hiredate":"2009-01-14","jobtitle":"Chief Executive Officer","loginid":"adventure-works\\ken0","maritalstatus":"S","modifieddate":"2014-06-30T00:00:00","nationalidnumber":"295847284","organizationnode":"/","rowguid":"f01251e5-96a3-448d-981e-0f99d789110d","salariedflag":true,"sickleavehours":69,"vacationhours":99}
1    {"birthdate":"1971-08-01","businessentityid":2,"currentflag":true,"gender":"F","hiredate":"2008-01-31","jobtitle":"Vice President of Engineering","loginid":"adventure-works\\terri0","maritalstatus":"S","modifieddate":"2014-06-30T00:00:00","nationalidnumber":"245797967","organizationnode":"/1/","rowguid":"45e8f437-670d-4409-93cb-f9424a40d6ee","salariedflag":true,"sickleavehours":20,"vacationhours":1}
2          {"birthdate":"1974-11-12","businessentityid":3,"currentflag":true,"gender":"M","hiredate":"2007-11-11","jobtitle":"Engineering Manager","loginid":"adventure-works\\roberto0"

In [9]:
# def clean_and_load_json(value):
#     """Função para corrigir e carregar JSON."""
#     try:
#         # Substituir `null` por `None` e carregar o JSON
#         value = value.replace("null", "null")
#         return json.loads(value)
#     except Exception as e:
#         print(f"Erro ao processar JSON: {e}, valor problemático: {value}")
#         return None  # Retorna None se o valor for inválido

# # Normalizar os dados JSON
# print("Normalizando os dados JSON...")
# try:
#     EDA_humanresources_employee = pd.json_normalize(
#         EDA_humanresources_employee_raw.iloc[:, 0].apply(clean_and_load_json)
#     )
#     print("Dados normalizados com sucesso!")
# except Exception as e:
#     print(f"Erro ao normalizar os dados JSON: {e}")
#     EDA_humanresources_employee = EDA_humanresources_employee_raw  # Mantém os dados originais em caso de erro

# print(f"Tabela processada: {input_table}")
# print(EDA_humanresources_employee.head())




def clean_and_load_json(value):
    """Parse one raw JSON record into a Python object.

    The text is handed to ``json.loads`` unchanged: the parser already converts the JSON
    literal ``null`` to ``None``. Replacing ``null`` with ``None`` beforehand produces
    invalid JSON and also alters string values that merely contain "null".

    Args:
        value: JSON text of a single record.

    Returns:
        The decoded object, or ``None`` when ``value`` cannot be parsed. In that case
        the error and the offending value are printed.
    """
    try:
        return json.loads(value)
    except Exception as e:
        print(f"Erro ao processar JSON: {e}, valor problemático: {value}")
        return None  # Invalid records are reported above and mapped to None

# Normalize the raw JSON records (first column of the raw frame) into one column per field,
# then give every column its intended dtype.
print("Normalizando os dados JSON...")
try:
    EDA_humanresources_employee = pd.json_normalize(
        EDA_humanresources_employee_raw.iloc[:, 0].apply(clean_and_load_json)
    )
    print("Dados normalizados com sucesso!")

    # Integer columns. errors='ignore' leaves a column untouched when the cast fails.
    for int_column in ['businessentityid', 'nationalidnumber', 'vacationhours', 'sickleavehours']:
        EDA_humanresources_employee[int_column] = EDA_humanresources_employee[int_column].astype('int64', errors='ignore')

    # Text columns. 'loginid' holds logins such as "adventure-works\ken0", so it belongs
    # here: the earlier int64 cast could never succeed and was silently ignored.
    for text_column in ['loginid', 'jobtitle', 'maritalstatus', 'gender', 'rowguid', 'organizationnode']:
        EDA_humanresources_employee[text_column] = EDA_humanresources_employee[text_column].astype('str', errors='ignore')

    # Date columns: values that cannot be parsed become NaT.
    for date_column in ['birthdate', 'hiredate', 'modifieddate']:
        EDA_humanresources_employee[date_column] = pd.to_datetime(EDA_humanresources_employee[date_column], errors='coerce')

    # Flag columns.
    for flag_column in ['salariedflag', 'currentflag']:
        EDA_humanresources_employee[flag_column] = EDA_humanresources_employee[flag_column].astype('bool')

    print("Tipos atribuídos com sucesso!")
except Exception as e:
    print(f"Erro ao normalizar os dados JSON: {e}")
    EDA_humanresources_employee = EDA_humanresources_employee_raw  # Keep the raw data on error

print(f"Tabela processada: {input_table}")

# info() prints its summary directly and returns None, so it is not wrapped in print().
EDA_humanresources_employee.info()

Normalizando os dados JSON...
Dados normalizados com sucesso!
Tipos atribuídos com sucesso!
Tabela processada: desafioadventureworks-446600.raw_data.humanresources_employee
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290 entries, 0 to 289
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   birthdate         290 non-null    datetime64[ns]
 1   businessentityid  290 non-null    int64         
 2   currentflag       290 non-null    bool          
 3   gender            290 non-null    object        
 4   hiredate          290 non-null    datetime64[ns]
 5   jobtitle          290 non-null    object        
 6   loginid           290 non-null    object        
 7   maritalstatus     290 non-null    object        
 8   modifieddate      287 non-null    datetime64[ns]
 9   nationalidnumber  290 non-null    int64         
 10  organizationnode  290 non-null    object        
 11  rowguid        

In [10]:
# DataFrame dimensions (rows, columns) before duplicates are removed

EDA_humanresources_employee.shape

(290, 15)

In [11]:
print("Colunas disponíveis no DataFrame limpo (cleaned):", EDA_humanresources_employee.columns)

# Every row whose 'businessentityid' occurs more than once (all occurrences are flagged).
duplicatas = EDA_humanresources_employee.loc[
    EDA_humanresources_employee['businessentityid'].duplicated(keep=False)
]

if duplicatas.empty:
    print("Não foram encontradas duplicatas.")
else:
    # Sort so that copies of the same id sit together, ordered by modification date.
    duplicatas_ordenadas = duplicatas.sort_values(by=['businessentityid', 'modifieddate'])

    print("Duplicatas ordenadas:")
    print(duplicatas_ordenadas)


Colunas disponíveis no DataFrame limpo (cleaned): Index(['birthdate', 'businessentityid', 'currentflag', 'gender', 'hiredate', 'jobtitle', 'loginid', 'maritalstatus', 'modifieddate', 'nationalidnumber', 'organizationnode', 'rowguid', 'salariedflag', 'sickleavehours', 'vacationhours'], dtype='object')
Não foram encontradas duplicatas.


In [12]:
# Remove duplicated 'businessentityid' rows, keeping the most recently modified one.
# drop_duplicates(keep='last') keeps the last row in *row order*, so the frame is sorted
# by 'modifieddate' within each id first. Rows without a date are sorted first so that a
# dated row wins over an undated one.
EDA_humanresources_employee = (
    EDA_humanresources_employee
    .sort_values(by=['businessentityid', 'modifieddate'], na_position='first')
    .drop_duplicates(subset=['businessentityid'], keep='last')
)

print(f"Linhas após remover duplicatas (baseando-se na última 'modifieddate'): {len(EDA_humanresources_employee)}")

# Snapshot of the de-duplicated data
raw_data_bkp_2_sem_duplicatas = EDA_humanresources_employee.copy()


Linhas após remover duplicatas (baseando-se na última 'modifieddate'): 290


In [13]:
# Sort the DataFrame by 'businessentityid' and then 'modifieddate'
EDA_humanresources_employee = EDA_humanresources_employee.sort_values(by=['businessentityid', 'modifieddate'])

# NOTE(review): this prints the whole frame, not a preview.
print(EDA_humanresources_employee)




     birthdate  businessentityid  currentflag gender   hiredate                       jobtitle                   loginid maritalstatus modifieddate  nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours
0   1969-01-29                 1         True      M 2009-01-14        Chief Executive Officer      adventure-works\ken0             S   2014-06-30         295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99
1   1971-08-01                 2         True      F 2008-01-31  Vice President of Engineering    adventure-works\terri0             S   2014-06-30         245797967              /1/  45e8f437-670d-4409-93cb-f9424a40d6ee          True              20              1
2   1974-11-12                 3         True      M 2007-11-11            Engineering Manager  adventure-works\roberto0             M   2014-06-30         509647174            /1/1/  9bbbfb2c-efbb-4217

In [14]:
# Check that the date columns are recognised as datetime (missing values as nulls)
# and are not left as object dtype.

print(EDA_humanresources_employee.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290 entries, 0 to 289
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   birthdate         290 non-null    datetime64[ns]
 1   businessentityid  290 non-null    int64         
 2   currentflag       290 non-null    bool          
 3   gender            290 non-null    object        
 4   hiredate          290 non-null    datetime64[ns]
 5   jobtitle          290 non-null    object        
 6   loginid           290 non-null    object        
 7   maritalstatus     290 non-null    object        
 8   modifieddate      287 non-null    datetime64[ns]
 9   nationalidnumber  290 non-null    int64         
 10  organizationnode  290 non-null    object        
 11  rowguid           290 non-null    object        
 12  salariedflag      290 non-null    bool          
 13  sickleavehours    290 non-null    int64         
 14  vacationhours     290 non-

In [15]:
# Columns that hold dates
date_columns = ['birthdate', 'hiredate', 'modifieddate']

# Coerce all of them to datetime in one pass; unparseable values become NaT.
EDA_humanresources_employee[date_columns] = EDA_humanresources_employee[date_columns].apply(
    pd.to_datetime, errors='coerce'
)

# Separate copy in which the dates are ISO 8601 strings and missing dates are None.
datas_formatadas = EDA_humanresources_employee.copy()
for col in date_columns:
    datas_formatadas[col] = datas_formatadas[col].apply(
        lambda x: None if pd.isnull(x) else x.isoformat()
    )

print(EDA_humanresources_employee.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290 entries, 0 to 289
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   birthdate         290 non-null    datetime64[ns]
 1   businessentityid  290 non-null    int64         
 2   currentflag       290 non-null    bool          
 3   gender            290 non-null    object        
 4   hiredate          290 non-null    datetime64[ns]
 5   jobtitle          290 non-null    object        
 6   loginid           290 non-null    object        
 7   maritalstatus     290 non-null    object        
 8   modifieddate      287 non-null    datetime64[ns]
 9   nationalidnumber  290 non-null    int64         
 10  organizationnode  290 non-null    object        
 11  rowguid           290 non-null    object        
 12  salariedflag      290 non-null    bool          
 13  sickleavehours    290 non-null    int64         
 14  vacationhours     290 non-

In [16]:
# Report, column by column, how many rows are missing a value and preview those rows.
for column in EDA_humanresources_employee.columns:
    missing_rows = EDA_humanresources_employee.loc[EDA_humanresources_employee[column].isna()]
    print(f"Coluna '{column}': {missing_rows.shape[0]} linhas ausentes.")

    if missing_rows.empty:
        print(f"Nenhuma linha com valores ausentes em '{column}'.\n")
        continue

    # Only the first rows are shown, to keep the output short.
    print(f"Exibindo as primeiras linhas com valores ausentes em '{column}':")
    print(missing_rows.head(), "\n")



Coluna 'birthdate': 0 linhas ausentes.
Nenhuma linha com valores ausentes em 'birthdate'.

Coluna 'businessentityid': 0 linhas ausentes.
Nenhuma linha com valores ausentes em 'businessentityid'.

Coluna 'currentflag': 0 linhas ausentes.
Nenhuma linha com valores ausentes em 'currentflag'.

Coluna 'gender': 0 linhas ausentes.
Nenhuma linha com valores ausentes em 'gender'.

Coluna 'hiredate': 0 linhas ausentes.
Nenhuma linha com valores ausentes em 'hiredate'.

Coluna 'jobtitle': 0 linhas ausentes.
Nenhuma linha com valores ausentes em 'jobtitle'.

Coluna 'loginid': 0 linhas ausentes.
Nenhuma linha com valores ausentes em 'loginid'.

Coluna 'maritalstatus': 0 linhas ausentes.
Nenhuma linha com valores ausentes em 'maritalstatus'.

Coluna 'modifieddate': 3 linhas ausentes.
Exibindo as primeiras linhas com valores ausentes em 'modifieddate':
     birthdate  businessentityid  currentflag gender   hiredate                    jobtitle                  loginid maritalstatus modifieddate  nati

In [17]:
# Fill 'modifieddate' with 'hiredate' where it is missing or equal to 1900-01-01
# (presumably a placeholder date — confirm). Author's rationale: the hire date may be
# the last time the record was modified in the system.
modifieddate_ajustado = (
    EDA_humanresources_employee['modifieddate'].isnull()
    | (EDA_humanresources_employee['modifieddate'] == pd.Timestamp('1900-01-01'))
)
EDA_humanresources_employee.loc[modifieddate_ajustado, 'modifieddate'] = (
    EDA_humanresources_employee.loc[modifieddate_ajustado, 'hiredate']
)

# Show exactly the rows that were changed. Filtering on modifieddate == hiredate would
# also list rows whose two dates already matched before this step.
print("Linhas onde 'modifieddate' foi ajustado para 'hiredate':")
print(EDA_humanresources_employee.loc[modifieddate_ajustado])


Linhas onde 'modifieddate' foi ajustado para 'hiredate':
     birthdate  businessentityid  currentflag gender   hiredate                    jobtitle                  loginid maritalstatus modifieddate  nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours
25  1982-11-03                26         True      M 2008-12-01  Production Control Manager   adventure-works\peter0             M   2008-12-01         277173473            /3/1/  69d5d162-e817-45e7-9dec-5d9b8310e7b1          True              41             43
210 1977-10-26               211         True      M 2009-02-28   Quality Assurance Manager   adventure-works\hazem0             S   2009-02-28         398223854            /3/2/  05c84608-f445-4f9d-bb5c-0828c309c29d          True              60             80
221 1968-09-17               222         True      M 2008-12-12            Master Scheduler  adventure-works\ascott0             S   2008-12-12         68523

In [18]:
# Keep a copy of the DataFrame as it stands at this point
# (the original note says the copy is meant for export in JSON format).
ajustes_date_time = EDA_humanresources_employee.copy()

In [19]:
# Number of distinct values per column; dropna=False counts NaN as a value of its own

valores_unicos = EDA_humanresources_employee.nunique(dropna=False)

print("Valores únicos incluindo NaN:")
print(valores_unicos)

Valores únicos incluindo NaN:
birthdate           275
businessentityid    290
currentflag           1
gender                2
hiredate            164
jobtitle             67
loginid             290
maritalstatus         2
modifieddate          4
nationalidnumber    290
organizationnode    290
rowguid             290
salariedflag          2
sickleavehours       51
vacationhours       100
dtype: int64


In [20]:
# Drop empty columns (placeholder: this cell contains no code yet)

In [21]:
# Standardize text columns: job titles in Title Case, gender and marital status in upper case.
# NOTE(review): str.title() also capitalizes small words ("of" -> "Of") and lower-cases the
# rest of each word; confirm that is acceptable for job titles downstream.
EDA_humanresources_employee['jobtitle'] = EDA_humanresources_employee['jobtitle'].str.strip().str.title()
EDA_humanresources_employee['gender'] = EDA_humanresources_employee['gender'].str.strip().str.upper()
EDA_humanresources_employee['maritalstatus'] = EDA_humanresources_employee['maritalstatus'].str.strip().str.upper()


# Check the distinct values to confirm the standardization
print("Valores únicos em 'jobtitle':", EDA_humanresources_employee['jobtitle'].unique())
print("Valores únicos em 'gender':", EDA_humanresources_employee['gender'].unique())
# Label fixed: this line reports 'maritalstatus' (it was previously labelled 'gender').
print("Valores únicos em 'maritalstatus':", EDA_humanresources_employee['maritalstatus'].unique())


Valores únicos em 'jobtitle': ['Chief Executive Officer' 'Vice President Of Engineering'
 'Engineering Manager' 'Senior Tool Designer' 'Design Engineer'
 'Research And Development Manager' 'Research And Development Engineer'
 'Tool Designer' 'Senior Design Engineer' 'Marketing Manager'
 'Marketing Assistant' 'Marketing Specialist'
 'Vice President Of Production' 'Production Control Manager'
 'Production Supervisor - Wc60' 'Production Technician - Wc60'
 'Production Supervisor - Wc10' 'Production Technician - Wc10'
 'Production Supervisor - Wc50' 'Production Technician - Wc50'
 'Production Supervisor - Wc30' 'Production Technician - Wc30'
 'Production Supervisor - Wc40' 'Production Technician - Wc40'
 'Shipping And Receiving Supervisor' 'Stocker'
 'Shipping And Receiving Clerk' 'Production Supervisor - Wc20'
 'Production Technician - Wc20' 'Production Supervisor - Wc45'
 'Production Technician - Wc45' 'Quality Assurance Manager'
 'Quality Assurance Supervisor' 'Quality Assurance Technic

In [22]:
# Numeric columns to analyse
numeric_columns = ['sickleavehours', 'vacationhours']

# Descriptive statistics for those columns
print(EDA_humanresources_employee[numeric_columns].describe())

# Outlier fences based on the interquartile range (IQR): anything further than
# 1.5 * IQR below the first quartile or above the third quartile is reported.
for col in numeric_columns:
    q1 = EDA_humanresources_employee[col].quantile(0.25)
    q3 = EDA_humanresources_employee[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    print(f"\nColuna: {col}")
    print(f"Limite inferior: {lower_bound}, Limite superior: {upper_bound}")

    # Rows falling outside either fence
    outliers = EDA_humanresources_employee.loc[
        EDA_humanresources_employee[col].lt(lower_bound)
        | EDA_humanresources_employee[col].gt(upper_bound)
    ]
    print(f"Outliers detectados ({len(outliers)}):")
    print(outliers[[col]])


       sickleavehours  vacationhours
count          290.00         290.00
mean            45.31          50.61
std             14.54          28.79
min             20.00           0.00
25%             33.00          26.25
50%             46.00          51.00
75%             58.00          75.00
max             80.00          99.00

Coluna: sickleavehours
Limite inferior: -4.5, Limite superior: 95.5
Outliers detectados (0):
Empty DataFrame
Columns: [sickleavehours]
Index: []

Coluna: vacationhours
Limite inferior: -46.875, Limite superior: 148.125
Outliers detectados (0):
Empty DataFrame
Columns: [vacationhours]
Index: []


In [23]:
# Pattern under test: exactly nine digits.
# Author's note: added to see whether the ids follow a pattern — they do not.
regex = r'^\d{9}$'

# Rows whose id, rendered as text, does not match the pattern
invalid_nationalid = EDA_humanresources_employee.loc[
    ~EDA_humanresources_employee['nationalidnumber'].astype(str).str.match(regex)
]
print(f"Valores inválidos em 'nationalidnumber':\n{invalid_nationalid['nationalidnumber']}")


Valores inválidos em 'nationalidnumber':
13     42487730
14     56920285
15     24756624
18     52541318
21     95958330
23     72636981
27     14417807
37      6298838
45     66073987
46     33237992
52      9659517
56     10708100
60     92096924
64      8066363
66     63179277
69     36151748
81     58791499
84      1662732
86      7201901
88     90888098
89     82638150
112    54759846
131     1300049
133    45615666
136    63761469
137    25011600
142    56772045
153    97728960
163    65848458
165    60114406
172    87268837
197    19312190
209    20244403
229    28414965
240       30845
242    60517918
258    20269531
266    58317344
279    61161660
283    90836195
Name: nationalidnumber, dtype: int64


In [24]:
# Back up the treated DataFrame
EDA_humanresources_employee_bkp_v2 = EDA_humanresources_employee.copy()

# Show the size of the backup and its first rows
print(f"Backup criado com {len(EDA_humanresources_employee_bkp_v2)} linhas.")
print(EDA_humanresources_employee_bkp_v2.head())


Backup criado com 290 linhas.
   birthdate  businessentityid  currentflag gender   hiredate                       jobtitle                   loginid maritalstatus modifieddate  nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours
0 1969-01-29                 1         True      M 2009-01-14        Chief Executive Officer      adventure-works\ken0             S   2014-06-30         295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99
1 1971-08-01                 2         True      F 2008-01-31  Vice President Of Engineering    adventure-works\terri0             S   2014-06-30         245797967              /1/  45e8f437-670d-4409-93cb-f9424a40d6ee          True              20              1
2 1974-11-12                 3         True      M 2007-11-11            Engineering Manager  adventure-works\roberto0             M   2014-06-30         509647174            /1/

In [25]:
# Print the columns currently present in the dataset, for the record
print("Colunas mantidas no dataset:", EDA_humanresources_employee.columns.tolist())


Colunas mantidas no dataset: ['birthdate', 'businessentityid', 'currentflag', 'gender', 'hiredate', 'jobtitle', 'loginid', 'maritalstatus', 'modifieddate', 'nationalidnumber', 'organizationnode', 'rowguid', 'salariedflag', 'sickleavehours', 'vacationhours']


In [26]:
# Columns expected to be binary (True/False)
binary_columns = ['currentflag', 'salariedflag']

# Print the distinct values found in each of them
for col in binary_columns:
    unique_values = EDA_humanresources_employee[col].unique()
    print(f"Valores únicos em '{col}': {unique_values}")



Valores únicos em 'currentflag': [ True]
Valores únicos em 'salariedflag': [ True False]


In [27]:
# Count how often each value occurs in 'currentflag' and 'salariedflag'
print("Distribuição de 'currentflag':")
print(EDA_humanresources_employee['currentflag'].value_counts())

print("\nDistribuição de 'salariedflag':")
print(EDA_humanresources_employee['salariedflag'].value_counts())


# Open question: is it worth dropping 'currentflag', given that it only holds one value (True)?

Distribuição de 'currentflag':
currentflag
True    290
Name: count, dtype: int64

Distribuição de 'salariedflag':
salariedflag
False    238
True      52
Name: count, dtype: int64


In [28]:
# 1. Rows whose 'currentflag' is not True (per the author's note, False would mean the
#    employee was dismissed / has left).
print("Funcionários ativos errados:", EDA_humanresources_employee.loc[EDA_humanresources_employee['currentflag'].ne(True)])

# 2. Date sanity checks: hires dated in the future, and modifications dated before the hire.
print("Contratações futuras:", EDA_humanresources_employee.loc[EDA_humanresources_employee['hiredate'].gt(pd.Timestamp.now())])
print("Modifieddate antes de hiredate:", EDA_humanresources_employee.loc[EDA_humanresources_employee['modifieddate'].lt(EDA_humanresources_employee['hiredate'])])





Funcionários ativos errados: Empty DataFrame
Columns: [birthdate, businessentityid, currentflag, gender, hiredate, jobtitle, loginid, maritalstatus, modifieddate, nationalidnumber, organizationnode, rowguid, salariedflag, sickleavehours, vacationhours]
Index: []
Contratações futuras: Empty DataFrame
Columns: [birthdate, businessentityid, currentflag, gender, hiredate, jobtitle, loginid, maritalstatus, modifieddate, nationalidnumber, organizationnode, rowguid, salariedflag, sickleavehours, vacationhours]
Index: []
Modifieddate antes de hiredate: Empty DataFrame
Columns: [birthdate, businessentityid, currentflag, gender, hiredate, jobtitle, loginid, maritalstatus, modifieddate, nationalidnumber, organizationnode, rowguid, salariedflag, sickleavehours, vacationhours]
Index: []


In [29]:
# The destination schema declares these columns as DATE, so keep only the date part
# (unparseable values are coerced to missing first).
for date_col in ('modifieddate', 'birthdate', 'hiredate'):
    EDA_humanresources_employee[date_col] = pd.to_datetime(
        EDA_humanresources_employee[date_col], errors='coerce'
    ).dt.date

# Make the export below pick up the adjusted DataFrame
processed_data['humanresources_employee'] = EDA_humanresources_employee

# Load every processed table into the output dataset
for table_name, df_cleaned in processed_data.items():
    output_table = f"{output_dataset}.{table_name}"

    print(f"Exportando tabela {table_name} para o BigQuery...")

    # Explicit destination schema: (column name, BigQuery type) pairs in table order.
    # NOTE(review): the same employee schema is applied to every entry of processed_data.
    schema = [
        bigquery.SchemaField(field_name, field_type)
        for field_name, field_type in (
            ("birthdate", "DATE"),
            ("businessentityid", "INTEGER"),
            ("currentflag", "BOOLEAN"),
            ("gender", "STRING"),
            ("hiredate", "DATE"),
            ("jobtitle", "STRING"),
            ("loginid", "STRING"),
            ("maritalstatus", "STRING"),
            ("modifieddate", "DATE"),
            ("nationalidnumber", "INTEGER"),
            ("organizationnode", "STRING"),
            ("rowguid", "STRING"),
            ("salariedflag", "BOOLEAN"),
            ("sickleavehours", "INTEGER"),
            ("vacationhours", "INTEGER"),
        )
    ]

    # CSV load job that replaces the destination table and uses the schema above
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=0,
        write_disposition="WRITE_TRUNCATE",
        schema=schema,
    )

    # Start the load and block until it finishes
    job = client.load_table_from_dataframe(df_cleaned, output_table, job_config=job_config)
    job.result()

    print(f"Tabela {table_name} exportada com sucesso para {output_table}.")


Exportando tabela humanresources_employee para o BigQuery...
Tabela humanresources_employee exportada com sucesso para desafioadventureworks-446600.raw_data_cleaned.humanresources_employee.


## ESTATÍSTICA DESCRITIVA

In [30]:
# Derive the hiring year from 'hiredate' (converted to datetime first).
EDA_humanresources_employee['hire_year'] = pd.to_datetime(EDA_humanresources_employee['hiredate']).dt.year

# Columns covered by the descriptive analysis, including the derived year
cols_para_analise = ['sickleavehours', 'vacationhours', 'hire_year']

# Descriptive statistics for those columns
analise_descritiva = EDA_humanresources_employee[cols_para_analise].describe(include='all')

# Replace NaN in the statistics table: 0 for numeric columns, '-' for anything else
for col in cols_para_analise:
    analise_descritiva[col] = analise_descritiva[col].fillna(
        0 if analise_descritiva[col].dtype.kind in 'ifc' else '-'
    )

# Statistics computed over the statistics table itself.
# NOTE(review): not used anywhere in the visible part of the notebook.
resultado_descritivo = analise_descritiva.describe(include='all')

print(analise_descritiva)


       sickleavehours  vacationhours  hire_year
count          290.00         290.00     290.00
mean            45.31          50.61    2009.02
std             14.54          28.79       1.01
min             20.00           0.00    2006.00
25%             33.00          26.25    2008.00
50%             46.00          51.00    2009.00
75%             58.00          75.00    2009.00
max             80.00          99.00    2013.00
