# 1.1. log raw dataset

In [1]:
import wandb
from utils.wandb_utils import wandb_log_artifact, get_wandb_artifact

In [2]:
# Start the W&B run that the rest of the notebook logs artifacts to and reads them from.
run = wandb.init(
    project="ex_census_wandb",
    job_type="upload_dataset",
)

[34m[1mwandb[0m: Currently logged in as: [33mwg_lucas[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Log the untouched census CSV under the artifact name "census_raw".
# NOTE(review): remove_logged_file=False presumably leaves the local file on
# disk after logging - confirm against utils.wandb_utils.
wandb_log_artifact(
    run,
    artifact_name="census_raw",
    description="raw data",
    file_path=["./../dataset/census_raw.csv"],
    remove_logged_file=False,
)

# 1.2. 1st cleaning + log cleaned dataset

In [4]:
import pandas as pd

In [5]:
# Resolve the latest "census_raw" artifact to a local file and read it.
# The helper's first return value is not used in this notebook, so it is discarded.
_, file_path = get_wandb_artifact(
    run,
    artifact_name="census_raw",
    file_name="census_raw.csv",
    tag="latest",
)
print(file_path)  # show which local path the artifact resolved to

data = pd.read_csv(file_path, sep=",", encoding="utf-8")

./artifacts/census_raw:v0/census_raw.csv


In [6]:
# First cleaning pass: trim whitespace from column names and string cells,
# then turn the "?" placeholder into a missing value.
data.columns = [c.strip() for c in data.columns]

# Only object-dtype columns are touched. Cells that are not strings (e.g. NaN
# produced by an empty field) are passed through unchanged; calling .strip()
# on them directly would raise AttributeError.
for col in data.columns:
    if data[col].dtype == 'O':
        data[col] = data[col].map(lambda v: v.strip() if isinstance(v, str) else v)

# convert ? to NA - this runs after stripping, so padded markers such as " ?"
# have already been reduced to "?" and are matched as well
data.replace({"?": None}, inplace=True)

In [7]:
# Persist the cleaned frame and log it as the "census_cleaned" artifact.
# Note the file keeps a .csv extension but is written tab-separated, so it
# must be read back with sep='\t'.
data.to_csv(
    "./../dataset/census_cleaned.csv",
    sep="\t",
    encoding="utf-8",
    index=False,
)

wandb_log_artifact(
    run,
    artifact_name="census_cleaned",
    description="basic white space cleaning",
    file_path=["./../dataset/census_cleaned.csv"],
    remove_logged_file=True,
)

# saved as census_cleaned:v0 in wandb

# 1.3. profile dataset + 2nd cleaning + log dataset

In [8]:
import pandas_profiling

In [None]:
# Build a pandas-profiling report over the current `data` frame and render it
# inline in the notebook; the drops in the next section are based on this report.
profile = pandas_profiling.ProfileReport(data)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Second cleaning pass, based on the profiling report.
#
# `data` is rebound instead of mutated in place, and the column drop uses
# errors="ignore", so re-running this cell after the columns are already gone
# no longer raises KeyError. Trade-off: a misspelled column name is skipped
# silently, so check `data.columns` afterwards when editing the list.

# drop every row that still has a missing value (the cells that held "?")
data = data.dropna()

data = data.drop(
    columns=[
        # irrelevant feature
        "fnlgt",
        # highly correlated numerical feature
        "education-num",
        # features reported as mostly zero
        # NOTE(review): "age" is grouped with the zero-dominated columns here,
        # which looks unexpected for an age feature - confirm against the
        # profiling report that dropping it is intended.
        "age",
        "capital-gain",
        "capital-loss",
    ],
    errors="ignore",
)

In [None]:
# Write the frame to the same path used for the first cleaned version
# (tab-separated, UTF-8, without the index column).
data.to_csv(
    "./../dataset/census_cleaned.csv",
    sep="\t",
    encoding="utf-8",
    index=False,
)

In [None]:
# Log the rewritten file under the existing "census_cleaned" artifact name.
wandb_log_artifact(
    run,
    artifact_name="census_cleaned",
    description="cleaning based on eda. null, zero dominant features, irrelevant feature, high correlation features",
    file_path=["./../dataset/census_cleaned.csv"],
    remove_logged_file=True,
)

# saved as census_cleaned:v1 in wandb

In [None]:
# Finish the W&B run that was opened at the top of the notebook.
run.finish()