### Data Cleaning

In [1]:
##import das bibliotecas e adequando colunas, linhas e formato de números

from google.cloud import bigquery
from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import re

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/mnt/c/Temp/desafiolh-445818-3cb0f62cb9ef.json"


# Configurar Pandas para exibir todas as colunas e todas as linhas completas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)


pd.options.display.float_format = '{:.2f}'.format

In [2]:
# Configurar o cliente do BigQuery
client = bigquery.Client()

# Nome do dataset e tabela
dataset_id = 'raw_data'

In [3]:
# Listar tabelas no dataset
tables = client.list_tables('raw_data')
print("Tabelas disponíveis:")
for table in tables:
    print(table.table_id)




Tabelas disponíveis:


humanresources_employee
person_address
person_businessentity
person_person
production_location
production_product
production_productcategory
production_productinventory
production_productsubcategory
sales_customer
sales_salesorderdetail
sales_salesorderheader
sales_salesterritory
sales_store


In [4]:
query = f"SELECT * FROM `raw_data.humanresources_employee`"
data = client.query(query).result().to_dataframe()

# Expandir a coluna JSON
raw_data = pd.json_normalize(data['data'])

# Exibir os dados expandidos
print(raw_data.sample(n=50))




       birthdate  businessentityid  currentflag gender    hiredate                                  jobtitle                     loginid maritalstatus         modifieddate nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours
147   1984-07-31                36         True      M  2009-02-10              Production Technician - WC60       adventure-works\jose0             M  2014-06-30T00:00:00        788456780        /3/1/1/9/  9e912556-88ba-41ee-b946-cb84ab4c1102         False              30             20
75    1989-06-25               194         True      M  2008-12-12              Production Technician - WC40       adventure-works\fred0             S  2014-06-30T00:00:00        295971920       /3/1/21/2/  45358ae8-0b0e-4c11-90bb-dac3ec0d5c82         False              43             47
1051  1972-07-24               189         True      F  2009-01-15              Production Technician - WC45     adventure-works\jane

### Verificando valores nulos/em branco nos dados

In [5]:
#valores nulos

raw_data.isnull().sum()

birthdate           0
businessentityid    0
currentflag         0
gender              0
hiredate            0
jobtitle            0
loginid             0
maritalstatus       0
modifieddate        0
nationalidnumber    0
organizationnode    0
rowguid             0
salariedflag        0
sickleavehours      0
vacationhours       0
dtype: int64

In [6]:
#total valores unicos de cada variável

valores_unicos = []

for i in raw_data.columns[0:15].tolist():
    print(i, ':', len(raw_data[i].astype(str).value_counts()))
    valores_unicos.append(len(raw_data[i].astype(str).value_counts()))

birthdate : 275
businessentityid : 290
currentflag : 1
gender : 2
hiredate : 164
jobtitle : 67
loginid : 290
maritalstatus : 2
modifieddate : 2
nationalidnumber : 290
organizationnode : 290
rowguid : 290
salariedflag : 2
sickleavehours : 51
vacationhours : 100


In [7]:
# Identificar duplicatas com base em 'businessentityid'
duplicatas = raw_data[raw_data.duplicated(subset=['businessentityid'], keep=False)]

# Verificar se existem duplicatas
if not duplicatas.empty:
    # Ordenar duplicatas por 'businessentityid' e 'modifieddate'
    duplicatas_ordenadas = duplicatas.sort_values(by=['businessentityid', 'modifieddate'])
    
    # Exibir duplicatas ordenadas
    print("Duplicatas ordenadas:")
    print(duplicatas_ordenadas)
else:
    print("Não foram encontradas duplicatas.")


Duplicatas ordenadas:


       birthdate  businessentityid  currentflag gender    hiredate                                  jobtitle                       loginid maritalstatus                modifieddate nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours
122   1969-01-29                 1         True      M  2009-01-14                   Chief Executive Officer          adventure-works\ken0             S         2014-06-30T00:00:00        295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99
436   1969-01-29                 1         True      M  2009-01-14                   Chief Executive Officer          adventure-works\ken0             S         2014-06-30T00:00:00        295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99
603   1969-01-29                 1         True      M  2009-01-14                   Chief Executive Offic

In [8]:
print(duplicatas_ordenadas.drop_duplicates())


      birthdate  businessentityid  currentflag gender    hiredate                                  jobtitle                       loginid maritalstatus                modifieddate nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours
122  1969-01-29                 1         True      M  2009-01-14                   Chief Executive Officer          adventure-works\ken0             S         2014-06-30T00:00:00        295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99
123  1971-08-01                 2         True      F  2008-01-31             Vice President of Engineering        adventure-works\terri0             S         2014-06-30T00:00:00        245797967              /1/  45e8f437-670d-4409-93cb-f9424a40d6ee          True              20              1
125  1974-11-12                 3         True      M  2007-11-11                       Engineering Manager  

In [9]:
duplicados_businessentityid = raw_data[raw_data.duplicated(subset=['businessentityid'], keep=False)]

# Ordenar por 'businessentityid' para facilitar a análise
duplicados_ordenados = duplicados_businessentityid.sort_values(by=['businessentityid'])

# Exibir todas as linhas duplicadas
print(duplicados_ordenados)


       birthdate  businessentityid  currentflag gender    hiredate                                  jobtitle                       loginid maritalstatus                modifieddate nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours
1889  1969-01-29                 1         True      M  2009-01-14                   Chief Executive Officer          adventure-works\ken0             S         2014-06-30T00:00:00        295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99
1890  1969-01-29                 1         True      M  2009-01-14                   Chief Executive Officer          adventure-works\ken0             S         2014-06-30T00:00:00        295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99
1888  1969-01-29                 1         True      M  2009-01-14                   Chief Executive Offic

In [10]:
pd.set_option('display.max_rows', None)
contagem = raw_data['businessentityid'].value_counts()

# Filtrar apenas os IDs que aparecem mais de uma vez
repetidos = contagem[contagem > 1]

# Exibir repetidos novamente
print(repetidos)

businessentityid
289    8
15     8
217    8
211    8
214    8
210    8
213    8
221    8
224    8
219    8
192    8
201    8
196    8
199    8
206    8
208    8
207    8
205    8
184    8
188    8
181    8
180    8
177    8
202    8
200    8
203    8
167    8
168    8
166    8
171    8
175    8
174    8
178    8
182    8
152    8
161    8
163    8
160    8
165    8
173    8
164    8
176    8
139    8
129    8
131    8
151    8
146    8
156    8
153    8
145    8
120    8
110    8
124    8
128    8
133    8
127    8
132    8
130    8
117    8
115    8
113    8
114    8
116    8
119    8
122    8
121    8
231    8
235    8
233    8
244    8
236    8
237    8
245    8
255    8
212    8
216    8
215    8
220    8
222    8
225    8
227    8
228    8
197    8
194    8
195    8
193    8
204    8
198    8
218    8
209    8
189    8
187    8
183    8
185    8
186    8
179    8
191    8
190    8
1      8
2      8
4      8
3      8
5      8
6      8
7      8
10     8
275    8
278    8
280    8
27

In [11]:
#copia da humanresources_employee
raw_data_bkp = raw_data.copy()

# Ordenar o DataFrame por 'businessentityid' e 'modifieddate'
raw_data = raw_data.sort_values(by=['businessentityid', 'modifieddate'])

# Remover duplicatas mantendo a última ocorrência com base em 'modifieddate'
raw_data = raw_data.drop_duplicates(subset=['businessentityid'], keep='last')

print(f"Linhas após remover duplicatas (baseando-se na última 'modifieddate'): {len(raw_data)}")


Linhas após remover duplicatas (baseando-se na última 'modifieddate'): 290


In [12]:
# Verificar informações do DataFrame
print(raw_data.info())


<class 'pandas.core.frame.DataFrame'>
Index: 290 entries, 1890 to 2319
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   birthdate         290 non-null    object
 1   businessentityid  290 non-null    int64 
 2   currentflag       290 non-null    bool  
 3   gender            290 non-null    object
 4   hiredate          290 non-null    object
 5   jobtitle          290 non-null    object
 6   loginid           290 non-null    object
 7   maritalstatus     290 non-null    object
 8   modifieddate      290 non-null    object
 9   nationalidnumber  290 non-null    object
 10  organizationnode  290 non-null    object
 11  rowguid           290 non-null    object
 12  salariedflag      290 non-null    bool  
 13  sickleavehours    290 non-null    int64 
 14  vacationhours     290 non-null    int64 
dtypes: bool(2), int64(3), object(10)
memory usage: 32.3+ KB
None


In [13]:
# Identificar colunas com datas
date_columns = ['birthdate', 'hiredate', 'modifieddate']

# Converter as colunas para datetime se ainda não estiverem
for col in date_columns:
    raw_data[col] = pd.to_datetime(raw_data[col], errors='coerce')

# Criar uma cópia do DataFrame para exibição formatada
formatted_data = raw_data.copy()

# Formatar todas as colunas de datas para exibição
for col in date_columns:
    formatted_data[col] = raw_data[col].dt.strftime('%Y-%m-%d %H:%M:%S')

# Exibir o DataFrame formatado
print(formatted_data.head())

# Verificar os tipos originais permanecem datetime64[ns]
print("\nTipos originais das colunas no DataFrame principal:")
print(raw_data[date_columns].dtypes)


                birthdate  businessentityid  currentflag gender             hiredate                       jobtitle                   loginid maritalstatus         modifieddate nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours
1890  1969-01-29 00:00:00                 1         True      M  2009-01-14 00:00:00        Chief Executive Officer      adventure-works\ken0             S  2014-06-30 00:00:00        295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99
1893  1971-08-01 00:00:00                 2         True      F  2008-01-31 00:00:00  Vice President of Engineering    adventure-works\terri0             S  2014-06-30 00:00:00        245797967              /1/  45e8f437-670d-4409-93cb-f9424a40d6ee          True              20              1
1452  1974-11-12 00:00:00                 3         True      M  2007-11-11 00:00:00            Engineering Manager  a

In [14]:
# Padronizar textos em jobtitle e gender
raw_data['jobtitle'] = raw_data['jobtitle'].str.strip().str.title()
raw_data['gender'] = raw_data['gender'].str.strip().str.upper()

# Verificar valores únicos para garantir a padronização
print("Valores únicos em 'jobtitle':", raw_data['jobtitle'].unique())
print("Valores únicos em 'gender':", raw_data['gender'].unique())


Valores únicos em 'jobtitle': ['Chief Executive Officer' 'Vice President Of Engineering'
 'Engineering Manager' 'Senior Tool Designer' 'Design Engineer'
 'Research And Development Manager' 'Research And Development Engineer'
 'Tool Designer' 'Senior Design Engineer' 'Marketing Manager'
 'Marketing Assistant' 'Marketing Specialist'
 'Vice President Of Production' 'Production Control Manager'
 'Production Supervisor - Wc60' 'Production Technician - Wc60'
 'Production Supervisor - Wc10' 'Production Technician - Wc10'
 'Production Supervisor - Wc50' 'Production Technician - Wc50'
 'Production Supervisor - Wc30' 'Production Technician - Wc30'
 'Production Supervisor - Wc40' 'Production Technician - Wc40'
 'Shipping And Receiving Supervisor' 'Stocker'
 'Shipping And Receiving Clerk' 'Production Supervisor - Wc20'
 'Production Technician - Wc20' 'Production Supervisor - Wc45'
 'Production Technician - Wc45' 'Quality Assurance Manager'
 'Quality Assurance Supervisor' 'Quality Assurance Technic

In [15]:
# Identificar colunas numéricas para análise 
numeric_columns = ['sickleavehours', 'vacationhours']

# Exibir estatísticas descritivas
print(raw_data[numeric_columns].describe())

# Calcular limites para outliers (IQR - Intervalo Interquartil)
for col in numeric_columns:
    q1 = raw_data[col].quantile(0.25)
    q3 = raw_data[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    
    # Exibir os limites
    print(f"\nColuna: {col}")
    print(f"Limite inferior: {lower_bound}, Limite superior: {upper_bound}")
    
    # Filtrar outliers
    outliers = raw_data[(raw_data[col] < lower_bound) | (raw_data[col] > upper_bound)]
    print(f"Outliers detectados ({len(outliers)}):")
    print(outliers[[col]])


       sickleavehours  vacationhours
count          290.00         290.00
mean            45.31          50.61
std             14.54          28.79
min             20.00           0.00
25%             33.00          26.25
50%             46.00          51.00
75%             58.00          75.00
max             80.00          99.00

Coluna: sickleavehours
Limite inferior: -4.5, Limite superior: 95.5
Outliers detectados (0):
Empty DataFrame
Columns: [sickleavehours]
Index: []

Coluna: vacationhours
Limite inferior: -46.875, Limite superior: 148.125
Outliers detectados (0):
Empty DataFrame
Columns: [vacationhours]
Index: []


In [16]:

# Criar a coluna 'hire_year' com base em 'hiredate'
raw_data['hire_year'] = raw_data['hiredate'].dt.year


#verificando outros dados para detectar outliers
anos_contratacao = [int(ano) for ano in raw_data['hire_year'].unique()]
anos_contratacao.sort()
print(anos_contratacao)

print("Valores únicos em 'gender':", raw_data['gender'].unique())
print("Valores únicos em 'salariedflag':", raw_data['salariedflag'].unique())


[2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013]
Valores únicos em 'gender': ['M' 'F']
Valores únicos em 'salariedflag': [ True False]


In [17]:
# Verificar valores ausentes
missing = raw_data.isnull().sum()
print("Valores ausentes por coluna:")
print(missing)

# Tratar colunas críticas
if missing['hiredate'] > 0:
    print("Tratar valores ausentes em 'hiredate' (decisão: remover ou imputar)")

if missing['jobtitle'] > 0:
    print("Tratar valores ausentes em 'jobtitle' (decisão: remover ou imputar)")

if missing['businessentityid'] > 0:
    print("Erro crítico: 'businessentityid' não pode ter valores ausentes!")


Valores ausentes por coluna:
birthdate           0
businessentityid    0
currentflag         0
gender              0
hiredate            0
jobtitle            0
loginid             0
maritalstatus       0
modifieddate        3
nationalidnumber    0
organizationnode    0
rowguid             0
salariedflag        0
sickleavehours      0
vacationhours       0
hire_year           0
dtype: int64


In [18]:
# Verificar linhas com 'modifieddate' ausente
missing_modifieddate = raw_data[raw_data['modifieddate'].isnull()]
print("Linhas com 'modifieddate' ausente:")
print(missing_modifieddate)


Linhas com 'modifieddate' ausente:
      birthdate  businessentityid  currentflag gender   hiredate                    jobtitle                  loginid maritalstatus modifieddate nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours  hire_year
1491 1982-11-03                26         True      M 2008-12-01  Production Control Manager   adventure-works\peter0             M          NaT        277173473            /3/1/  69d5d162-e817-45e7-9dec-5d9b8310e7b1          True              41             43       2008
2217 1977-10-26               211         True      M 2009-02-28   Quality Assurance Manager   adventure-works\hazem0             S          NaT        398223854            /3/2/  05c84608-f445-4f9d-bb5c-0828c309c29d          True              60             80       2009
2232 1968-09-17               222         True      M 2008-12-12            Master Scheduler  adventure-works\ascott0             S          NaT  

In [19]:
# Preencher 'modifieddate' ausente ou igual a 'hiredate', pois pode ser a ultima data de modificação no sistema.
raw_data.loc[raw_data['modifieddate'].isnull() | (raw_data['modifieddate'] == pd.Timestamp('1900-01-01')), 'modifieddate'] = raw_data['hiredate']

# Exibir as linhas ajustadas
print("Linhas onde 'modifieddate' foi ajustado para 'hiredate':")
print(raw_data.loc[raw_data['modifieddate'] == raw_data['hiredate']])


Linhas onde 'modifieddate' foi ajustado para 'hiredate':
      birthdate  businessentityid  currentflag gender   hiredate                    jobtitle                  loginid maritalstatus modifieddate nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours  hire_year
1491 1982-11-03                26         True      M 2008-12-01  Production Control Manager   adventure-works\peter0             M   2008-12-01        277173473            /3/1/  69d5d162-e817-45e7-9dec-5d9b8310e7b1          True              41             43       2008
2217 1977-10-26               211         True      M 2009-02-28   Quality Assurance Manager   adventure-works\hazem0             S   2009-02-28        398223854            /3/2/  05c84608-f445-4f9d-bb5c-0828c309c29d          True              60             80       2009
2232 1968-09-17               222         True      M 2008-12-12            Master Scheduler  adventure-works\ascott0       

In [20]:
# Verificar unicidade de 'businessentityid'
is_unique = raw_data['businessentityid'].is_unique
print(f"'businessentityid' é único? {is_unique}")


'businessentityid' é único? True


In [21]:
# Definir regex para validar números (exemplo: apenas dígitos, 9 caracteres)
regex = r'^\d{9}$'

# Verificar valores inválidos
invalid_nationalid = raw_data[~raw_data['nationalidnumber'].astype(str).str.match(regex)]
print(f"Valores inválidos em 'nationalidnumber':\n{invalid_nationalid['nationalidnumber']}")


Valores inválidos em 'nationalidnumber':
1905    42487730
1476    56920285
1908    24756624
1917    52541318
1920    95958330
1488    72636981
1494    14417807
1518     6298838
1947    66073987
1950    33237992
1539     9659517
1959    10708100
1968    92096924
1557     8066363
1980    63179277
1560    36151748
2004    58791499
2007     1662732
1590     7201901
2013    90888098
2016    82638150
1623    54759846
1644     1300049
1647    45615666
2094    63761469
1656    25011600
2103    56772045
1683    97728960
2136    65848458
2139    60114406
2154    87268837
1734    19312190
2214    20244403
1794    28414965
1812       30845
2250    60517918
1842    20269531
2289    58317344
2307    61161660
1878    90836195
Name: nationalidnumber, dtype: object


In [22]:
regex_email = r'^[\w\.-]+@[\w\.-]+\.\w+$'
invalid_loginid = raw_data[~raw_data['loginid'].str.match(regex_email)]
print(f"Valores inválidos em 'loginid':\n{invalid_loginid['loginid']}")


Valores inválidos em 'loginid':
1890            adventure-works\ken0
1893          adventure-works\terri0
1452        adventure-works\roberto0
1896            adventure-works\rob0
1455           adventure-works\gail0
1458         adventure-works\jossef0
1461          adventure-works\dylan0
1899          adventure-works\diane1
1464           adventure-works\gigi0
1467        adventure-works\michael6
1902         adventure-works\ovidiu0
1470        adventure-works\thierry0
1473         adventure-works\janice0
1905        adventure-works\michael8
1476         adventure-works\sharon0
1908          adventure-works\david0
1911          adventure-works\kevin0
1914           adventure-works\john5
1917           adventure-works\mary2
1479         adventure-works\wanida0
1482          adventure-works\terry0
1920         adventure-works\sariya0
1485           adventure-works\mary0
1488           adventure-works\jill0
1923          adventure-works\james1
1491          adventure-works\peter0
1926  

In [23]:
is_unique = raw_data['nationalidnumber'].is_unique
print(f"'nationalidnumber' é único? {is_unique}")


'nationalidnumber' é único? True


In [24]:
# Criar um backup do DataFrame tratado
raw_data_bkp_v2 = raw_data.copy()

# Verificar o tamanho do backup e as primeiras linhas
print(f"Backup criado com {len(raw_data_bkp_v2)} linhas.")
print(raw_data_bkp_v2.head())


Backup criado com 290 linhas.
      birthdate  businessentityid  currentflag gender   hiredate                       jobtitle                   loginid maritalstatus modifieddate nationalidnumber organizationnode                               rowguid  salariedflag  sickleavehours  vacationhours  hire_year
1890 1969-01-29                 1         True      M 2009-01-14        Chief Executive Officer      adventure-works\ken0             S   2014-06-30        295847284                /  f01251e5-96a3-448d-981e-0f99d789110d          True              69             99       2009
1893 1971-08-01                 2         True      F 2008-01-31  Vice President Of Engineering    adventure-works\terri0             S   2014-06-30        245797967              /1/  45e8f437-670d-4409-93cb-f9424a40d6ee          True              20              1       2008
1452 1974-11-12                 3         True      M 2007-11-11            Engineering Manager  adventure-works\roberto0             M   2

In [25]:
# Verificar e documentar colunas existentes
print("Colunas mantidas no dataset:", raw_data.columns.tolist())


Colunas mantidas no dataset: ['birthdate', 'businessentityid', 'currentflag', 'gender', 'hiredate', 'jobtitle', 'loginid', 'maritalstatus', 'modifieddate', 'nationalidnumber', 'organizationnode', 'rowguid', 'salariedflag', 'sickleavehours', 'vacationhours', 'hire_year']


In [26]:
for col in raw_data.columns:
    print(f"Valores únicos em '{col}':", raw_data[col].unique()[:10])  # Limitar a exibição a 10 valores



Valores únicos em 'birthdate': <DatetimeArray>
['1969-01-29 00:00:00', '1971-08-01 00:00:00', '1974-11-12 00:00:00', '1974-12-23 00:00:00', '1952-09-27 00:00:00', '1959-03-11 00:00:00', '1987-02-24 00:00:00', '1986-06-05 00:00:00', '1979-01-21 00:00:00', '1984-11-30 00:00:00']
Length: 10, dtype: datetime64[ns]
Valores únicos em 'businessentityid': [ 1  2  3  4  5  6  7  8  9 10]
Valores únicos em 'currentflag': [ True]
Valores únicos em 'gender': ['M' 'F']
Valores únicos em 'hiredate': <DatetimeArray>
['2009-01-14 00:00:00', '2008-01-31 00:00:00', '2007-11-11 00:00:00', '2007-12-05 00:00:00', '2008-01-06 00:00:00', '2008-01-24 00:00:00', '2009-02-08 00:00:00', '2008-12-29 00:00:00', '2009-01-16 00:00:00', '2009-05-03 00:00:00']
Length: 10, dtype: datetime64[ns]
Valores únicos em 'jobtitle': ['Chief Executive Officer' 'Vice President Of Engineering'
 'Engineering Manager' 'Senior Tool Designer' 'Design Engineer'
 'Research And Development Manager' 'Research And Development Engineer'
 'T

In [27]:
# Listar colunas binárias esperadas
binary_columns = ['currentflag', 'salariedflag']

# Verificar valores únicos em colunas binárias
for col in binary_columns:
    unique_values = raw_data[col].unique()
    print(f"Valores únicos em '{col}': {unique_values}")

    # Corrigir valores não binários, se necessário
    if not set(unique_values).issubset({True, False, 0, 1}):
        pr



Valores únicos em 'currentflag': [ True]
Valores únicos em 'salariedflag': [ True False]


In [28]:
# Contar valores em 'currentflag' e 'salariedflag'
print("Distribuição de 'currentflag':")
print(raw_data['currentflag'].value_counts())

print("\nDistribuição de 'salariedflag':")
print(raw_data['salariedflag'].value_counts())


Distribuição de 'currentflag':
currentflag
True    290
Name: count, dtype: int64

Distribuição de 'salariedflag':
salariedflag
False    238
True      52
Name: count, dtype: int64


In [29]:
# 1. Verificar se todos os funcionários ativos têm currentflag = True, pois deveria ser false = demitido/desligado
print("Funcionários ativos errados:", raw_data[raw_data['currentflag'] != True])

# 2. Validar datas
print("Contratações futuras:", raw_data[raw_data['hiredate'] > pd.Timestamp.now()])
print("Modifieddate antes de hiredate:", raw_data[raw_data['modifieddate'] < raw_data['hiredate']])

# 3. Verificar unicidade de identificadores
print("Duplicados em 'businessentityid':", raw_data['businessentityid'].duplicated().sum())
print("Duplicados em 'nationalidnumber':", raw_data['nationalidnumber'].duplicated().sum())



Funcionários ativos errados: Empty DataFrame
Columns: [birthdate, businessentityid, currentflag, gender, hiredate, jobtitle, loginid, maritalstatus, modifieddate, nationalidnumber, organizationnode, rowguid, salariedflag, sickleavehours, vacationhours, hire_year]
Index: []
Contratações futuras: Empty DataFrame
Columns: [birthdate, businessentityid, currentflag, gender, hiredate, jobtitle, loginid, maritalstatus, modifieddate, nationalidnumber, organizationnode, rowguid, salariedflag, sickleavehours, vacationhours, hire_year]
Index: []
Modifieddate antes de hiredate: Empty DataFrame
Columns: [birthdate, businessentityid, currentflag, gender, hiredate, jobtitle, loginid, maritalstatus, modifieddate, nationalidnumber, organizationnode, rowguid, salariedflag, sickleavehours, vacationhours, hire_year]
Index: []
Duplicados em 'businessentityid': 0
Duplicados em 'nationalidnumber': 0


## ESTATÍSTICA DESCRITIVA

In [30]:
# Selecionar colunas relevantes para análise descritiva
cols_para_analise = ['sickleavehours', 'vacationhours', 'salariedflag']

# Garantir que as datas estejam no formato correto
raw_data['hire_year'] = pd.to_datetime(raw_data['hiredate']).dt.year

# Adicionar a nova coluna à lista
cols_para_analise.append('hire_year')

# Gerar estatísticas descritivas
analise_descritiva = raw_data[cols_para_analise].describe(include='all')

# Substituir NaN por '-'
analise_descritiva = analise_descritiva.fillna('-')

print(analise_descritiva)


       sickleavehours vacationhours salariedflag hire_year
count          290.00        290.00          290    290.00
unique              -             -            2         -
top                 -             -        False         -
freq                -             -          238         -
mean            45.31         50.61            -   2009.02
std             14.54         28.79            -      1.01
min             20.00          0.00            -   2006.00
25%             33.00         26.25            -   2008.00
50%             46.00         51.00            -   2009.00
75%             58.00         75.00            -   2009.00
max             80.00         99.00            -   2013.00
