In [None]:
import kagglehub
from dotenv import load_dotenv, set_key
# Fetch the latest version of the Kaggle dataset and keep the returned
# location in `path`, which later cells use to find the CSV files.
DATASET_HANDLE = "canggih/indonesian-food-recipes"
path = kagglehub.dataset_download(DATASET_HANDLE)

# Load the variables from .env, then record the dataset location in that file
# under the KAGGLE_PATH key.
load_dotenv()
set_key(dotenv_path=".env", key_to_set="KAGGLE_PATH", value_to_set=path)
print("Path to dataset files:", path)

In [None]:
import pandas as pd
# Read the dataset-ayam.csv file from the downloaded dataset directory.
chickens_csv = f"{path}/dataset-ayam.csv"
chickens = pd.read_csv(chickens_csv)
chickens

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Draw, label and show the missing-value map in ONE cell. With Jupyter's
# default inline backend every cell renders and closes its own figure, so
# labels set in a later cell end up on a new, empty figure instead of on
# this heatmap.
fig, ax = plt.subplots()

# After the transpose each heatmap row is one DataFrame column and each
# heatmap column is one record: the y-axis lists the columns, the x-axis the
# rows. Highlighted cells mark missing values.
sns.heatmap(chickens.isna().transpose(), cbar=False, ax=ax)
ax.set_xlabel("Rows")
ax.set_ylabel("Columns")
ax.set_title("Missing Values")
plt.show()

In [None]:
import numpy as np
# Add 0/1 integer indicator columns: 1 where the source column is missing,
# 0 where it is present.
chickens["missing_ingredients"] = chickens["Ingredients"].isna().astype(int)
chickens["missing_steps"] = chickens["Steps"].isna().astype(int)

In [None]:
# Spearman rank correlation between the Loves column and the two
# missing-value indicator columns, rendered as an annotated heatmap.
columns_of_interest = ["Loves", "missing_ingredients", "missing_steps"]
correlation_matrix = chickens.loc[:, columns_of_interest].corr(method="spearman")

sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar_kws={"label": "Spearman Correlation"},
    ax=ax,
)
heatmap.set_title("Correlation Heatmap")
plt.show()

In [None]:
# New data frame without empty ingredients and steps.
# dropna(subset=...) keeps only the rows where both columns are present. It
# replaces a filter that negated boolean masks with unary minus (`-mask`);
# the boolean "not" operator is `~`, and `-` raises on plain NumPy boolean
# arrays, so the intent is now stated directly.
chickens_clean_missing = chickens.dropna(subset=["Ingredients", "Steps"])
chickens_clean_missing

In [None]:
# Number of recipes per title, sorted by count in descending order.
(
    chickens_clean_missing["Title"]
    .value_counts()
    .reset_index()
    .sort_values(by="count", ascending=False)
)

In [None]:
# Display the frame that has the rows with missing Ingredients/Steps removed.
chickens_clean_missing

In [None]:
# Show every row whose Title is exactly "Soto Ayam".
chickens_clean_missing.loc[chickens_clean_missing["Title"].eq("Soto Ayam")]

In [None]:
# Build a new frame with normalised titles: title-case each Title, then strip
# any leading run of characters that are not ASCII letters.
# assign() returns a new frame, so chickens_clean_missing is left untouched.
chickens_title = chickens_clean_missing.assign(
    Title=lambda frame: (
        frame["Title"]
        .str.title()
        .str.replace(r"^[^a-zA-Z]*", "", regex=True)
    )
)

In [None]:
# Occurrences of each normalised title, highest count first.
title_count = (
    chickens_title["Title"]
    .value_counts()
    .reset_index()
    .sort_values(by="count", ascending=False)
)

# Titles that appear on more than one row.
chickens_title_dupes = title_count.loc[title_count["count"].gt(1)]
chickens_title_dupes

In [None]:
# Split the rows by whether their title is one of the duplicated titles.
has_duplicated_title = chickens_title["Title"].isin(chickens_title_dupes["Title"])
chickens_unique = chickens_title[~has_duplicated_title]
chickens_dupe = chickens_title[has_duplicated_title]

# For each duplicated title keep only the row with the largest Loves value.
most_loved_labels = chickens_dupe.groupby("Title")["Loves"].idxmax()
max_chickens_dupe_loves = chickens_dupe.loc[most_loved_labels]

# One row per title: the selected duplicates first, then the titles that were
# already unique, renumbered from zero.
chickens_distinct = pd.concat(
    [max_chickens_dupe_loves, chickens_unique],
    ignore_index=True,
)
chickens_distinct

In [None]:
from pathlib import Path

# Write the de-duplicated recipes to CSV without the two helper indicator
# columns. The path is relative to the kernel's working directory.
output_csv = Path("../csv/cleaned/chickens_cleaned.csv")

# to_csv raises OSError when the target directory does not exist, so create
# it (and any missing parents) first; exist_ok makes re-running the cell safe.
output_csv.parent.mkdir(parents=True, exist_ok=True)

(
    chickens_distinct
    .drop(columns=["missing_ingredients", "missing_steps"])
    .to_csv(output_csv, index=False)
)