### Get data and use auto analysis

In [1]:
import wandb
import pandas as pd

# Open a W&B run for this EDA session (project "nyc_airbnb", group "eda",
# save_code=True).
run = wandb.init(
    project="nyc_airbnb",
    group="eda",
    save_code=True,
)

# Fetch the latest "sample.csv" artifact and load the file it provides.
sample_artifact = wandb.use_artifact("sample.csv:latest")
local_path = sample_artifact.file()
df = pd.read_csv(local_path)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myurimarca[0m ([33myurimarca-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Peek at the first rows to confirm the CSV loaded with the expected columns.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9138664,Private Lg Room 15 min to Manhattan,47594947,Iris,Queens,Sunnyside,40.74271,-73.92493,Private room,74,2,6,2019-05-26,0.13,1,5
1,31444015,TIME SQUARE CHARMING ONE BED IN HELL'S KITCHEN...,8523790,Johlex,Manhattan,Hell's Kitchen,40.76682,-73.98878,Entire home/apt,170,3,0,,,1,188
2,8741020,Voted #1 Location Quintessential 1BR W Village...,45854238,John,Manhattan,West Village,40.73631,-74.00611,Entire home/apt,245,3,51,2018-09-19,1.12,1,0
3,34602077,Spacious 1 bedroom apartment 15min from Manhattan,261055465,Regan,Queens,Astoria,40.76424,-73.92351,Entire home/apt,125,3,1,2019-05-24,0.65,1,13
4,23203149,Big beautiful bedroom in huge Bushwick apartment,143460,Megan,Brooklyn,Bushwick,40.69839,-73.92044,Private room,65,2,8,2019-06-23,0.52,2,8


In [3]:
# Summarize only the object-dtype columns (count, unique, top, freq).
df.describe(include=[object])

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,room_type,last_review
count,19993,19992,20000,20000,20000,15877
unique,19768,6517,5,217,3,1507
top,Hillside Hotel,David,Manhattan,Williamsburg,Entire home/apt,2019-06-23
freq,7,170,8774,1580,10384,575


In [4]:
import ydata_profiling

# Build a ydata-profiling report over the whole frame and render it as
# notebook widgets.
profile = ydata_profiling.ProfileReport(df)
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [5]:
# Write the profiling report to an HTML file, then attach that file to a
# W&B artifact and log it on the current run.
filename = "profile-report.html"
report_description = (
    "HTML file produced from ydata-profiling python package for data analysis."
)

profile.to_file(filename)

artifact = wandb.Artifact(name=filename, type="analysis", description=report_description)
artifact.add_file(filename)

run.log_artifact(artifact)

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

<Artifact profile-report.html>

- Unfortunately, W&B cannot render HTML artifacts in its UI, whereas the MLflow experiment-tracking UI can.
- I logged the profile HTML file as an artifact above, but I could not open it from within W&B.

### Alerts generated by the package

- `host_id` is highly overall correlated with `id`
- `latitude` is highly overall correlated with `neighbourhood_group`
- `longitude` is highly overall correlated with `neighbourhood_group`
- `number_of_reviews` is highly overall correlated with `reviews_per_month`
- `last_review` has 4123 (20.6%) missing values
- `reviews_per_month` has 4123 (20.6%) missing values
- `minimum_nights` is highly skewed (γ1 = 25.17996962)
- `number_of_reviews` has 4123 (20.6%) zeros
- `availability_365` has 7176 (35.9%) zeros

### Missing values

In [6]:
# Number of missing values in each column.
df.isna().sum()

id                                   0
name                                 7
host_id                              0
host_name                            8
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       4123
reviews_per_month                 4123
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In [7]:
# Share of missing cells over the whole frame, expressed as a percentage.
missing_cells = df.isna().sum().sum()
total_cells = df.size  # rows * columns
print(f"Missing cells (%): {missing_cells/total_cells*100}")

Missing cells (%): 2.5815625


### Feature correlation

In [8]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Restrict the frame to its numeric columns; only those enter the correlation plot.
numeric_cols = df.select_dtypes(include=np.number).columns
df_aux = df.loc[:, numeric_cols]

# Pairwise correlation matrix of the numeric features.
corr = df_aux.corr()

# Boolean mask covering the upper triangle (diagonal included), so each pair
# of features is drawn only once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Diverging palette used to colour the heatmap.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the masked heatmap on an explicitly created figure/axes pair.
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

# Save the figure to disk and release it.
filename = 'corr.png'
f.tight_layout()
f.savefig(filename)
plt.close(f)

# Log the saved image as a W&B artifact on the current run.
artifact = wandb.Artifact(name=filename, type="report", description="Correlation between data features.")
artifact.add_file(filename)
run.log_artifact(artifact)

<Artifact corr.png>

### Price range

In [9]:
# Histogram of the listing prices, saved to disk and logged as an artifact.
filename = 'price_hist.png'

fig, ax = plt.subplots()
sns.histplot(df['price'], ax=ax)
fig.savefig(filename)
plt.close(fig)

artifact = wandb.Artifact(name=filename, type="report", description="Price histogram.")
artifact.add_file(filename)
run.log_artifact(artifact)

<Artifact price_hist.png>

In [10]:
# Keep only listings whose price lies within [min_price, max_price] (both ends
# included); everything outside that range is treated as an outlier and dropped.
min_price, max_price = 10, 350
idx = df['price'].ge(min_price) & df['price'].le(max_price)
df = df.loc[idx].copy()

In [11]:
# Plot the price histogram again, this time on the frame that has already been
# restricted to the [min_price, max_price] range.
filename = 'price_hist_range.png'

sns.histplot(df['price'])
plt.savefig(filename)
plt.close()

# The description states that outliers were removed, so this artifact cannot be
# confused with the unfiltered 'price_hist.png' ("Price histogram.").
artifact = wandb.Artifact(
    name=filename,
    type="report",
    description="Price histogram after removing outliers."
)
artifact.add_file(filename)
run.log_artifact(artifact)

<Artifact price_hist_range.png>

In [12]:
# Re-check the per-column missing counts on the price-filtered frame.
df.isna().sum()

id                                   0
name                                 7
host_id                              0
host_name                            8
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       3758
reviews_per_month                 3758
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In [13]:
# Share of missing cells over the whole (price-filtered) frame, as a percentage.
missing_cells = df.isna().sum().sum()
total_cells = df.size  # rows * columns
print(f"Missing cells (%): {missing_cells/total_cells*100}")

Missing cells (%): 2.47717225409189


### Fix feature data type

In [14]:
# Convert last_review to datetime.
# The column is loaded as object dtype (it appears in the object-column summary
# above), so it is parsed explicitly here.
# NOTE(review): missing entries presumably become NaT — confirm.
df['last_review'] = pd.to_datetime(df['last_review'])

# Show dtypes and non-null counts to verify the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19001 entries, 0 to 19999
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              19001 non-null  int64         
 1   name                            18994 non-null  object        
 2   host_id                         19001 non-null  int64         
 3   host_name                       18993 non-null  object        
 4   neighbourhood_group             19001 non-null  object        
 5   neighbourhood                   19001 non-null  object        
 6   latitude                        19001 non-null  float64       
 7   longitude                       19001 non-null  float64       
 8   room_type                       19001 non-null  object        
 9   price                           19001 non-null  int64         
 10  minimum_nights                  19001 non-null  int64         
 11  number_

### Plot histogram and missing values

In [18]:
# Heatmap showing which cells of the frame are missing, saved to 'nan.png' and
# logged as a W&B artifact.
filename = 'nan.png'

# Boolean frame: True wherever a value is missing
missing_values = df.isnull()

# Create exactly one figure for the plot and close that same figure afterwards,
# so no empty figure is left open.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(missing_values,
            cbar=False,        # remove color bar
            cmap='Purples',    # color map (choose any you like)
            yticklabels=False, # hide row labels
            ax=ax)
ax.set_title("Missing Values for Each Feature")
ax.set_xlabel("Features")
ax.set_ylabel("")

fig.tight_layout()
fig.savefig(filename)
plt.close(fig)

# NOTE: the run must still be active at this point; W&B ignores log_artifact
# calls made after run.finish().
artifact = wandb.Artifact(
    name=filename,
    type="report",
    description="Heatmap of missing values for each feature."
)
artifact.add_file(filename)
run.log_artifact(artifact)

UsageError: Run (0uaqzso2) is finished. The call to `log_artifact` will be ignored. Please make sure that you are using an active run.

In [16]:
# Histogram of prices after the outlier filter, drawn on figure 1, saved to
# disk and logged as an artifact.
filename = 'price_hist_range.png'

fig = plt.figure(1)
sns.histplot(df['price'], ax=fig.gca())
fig.tight_layout()
fig.savefig(filename)
plt.close(fig)

artifact = wandb.Artifact(name=filename, type="report", description="Price histogram after removing outliers.")
artifact.add_file(filename)
run.log_artifact(artifact)

<Artifact price_hist_range.png>

In [19]:
# Close the W&B run. This must be the last W&B step: log_artifact calls made
# on the run after this point are ignored.
run.finish()