In [None]:
# Exploratory Data Analysis (EDA)

In this notebook, we will:
1. Fetch the raw `sample.csv` artifact from W&B.
2. Generate an initial profiling report to inspect distributions, missing values, and data types.
3. Apply basic cleaning (drop listings whose price falls outside the 10–350 USD range and convert `last_review` to datetime).
4. Verify that our cleaning fixed the issues by inspecting the dataframe again and re-profiling.


In [None]:
import wandb
import pandas as pd

# Start a W&B run for this EDA session, grouped under "eda".
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    save_code=True,
)

# Reference the latest version of the raw sample artifact and resolve it to a
# local file path that pandas can read.
artifact = wandb.use_artifact("sample.csv:latest")
local_path = artifact.file()

# Load the raw listings into a dataframe.
df = pd.read_csv(local_path)


In [None]:
import pandas_profiling

# Profile the raw dataframe (distributions, missing values, data types) before
# any cleaning is applied.
profile = pandas_profiling.ProfileReport(df)
# Render the report as widgets inside the notebook.
profile.to_widgets()


In [None]:
# Keep only the rows whose price lies within [min_price, max_price]; anything
# outside that range is treated as an outlier and dropped. Series.between
# includes both bounds by default.
min_price, max_price = 10, 350
idx = df["price"].between(min_price, max_price)
# .copy() detaches the filtered frame so the column assignment below does not
# write into a slice of the original dataframe.
df = df.loc[idx].copy()

# Parse last_review into pandas datetime values.
df["last_review"] = pd.to_datetime(df["last_review"])


In [None]:
# Verify dataframe schema and missing values after cleaning: info() lists each
# column's dtype and non-null count, so last_review should now show a datetime
# dtype and the row count should reflect the price filter.
df.info()


In [None]:
# Re-profile the cleaned dataframe to confirm fixes. At this point `df` holds
# only the price-filtered rows, with last_review converted to datetime.
profile_clean = pandas_profiling.ProfileReport(df)
# Render the post-cleaning report as widgets inside the notebook.
profile_clean.to_widgets()
