# Exploratory Data Analysis – Coffee Database

En este notebook exploraremos el dataset `coffee_db.parquet` para entender:

1. Tendencia global del consumo de café (1990–2019).  
2. Evolución de los 10 países principales.  
3. Distribución por tipo de café.  
4. Países con mayor tasa de crecimiento (CAGR).  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import patheffects as pe  # stroke effect used by text_label

# Global plot style.
# The pyplot alias has to be `plt`: the rcParams update below and every
# plotting cell in this notebook refer to it by that name.
sns.set_theme(style="whitegrid", context="talk")
plt.rcParams.update({
    "figure.figsize": (22, 9),
    "figure.dpi": 140,          # sharper rendering
    "axes.titlesize": 20,
    "axes.labelsize": 14,
    "axes.grid": True,
    "grid.alpha": 0.25
})

def fmt_millions(n: float) -> str:
    """Render a number as a short label scaled to millions or above.

    Returns an empty string for missing values (``pd.isna``). Otherwise the
    value is divided by the largest matching scale and suffixed:

    * ``|n| >= 1e12`` -> one decimal, suffix ``"B"``
    * ``|n| >= 1e9``  -> one decimal, suffix ``"mM"``
    * anything else   -> no decimals, suffix ``"M"`` (value divided by 1e6)

    NOTE(review): earlier documentation of this helper described the 10^9
    suffix as "MM", but the code emits "mM" -- confirm which is intended.
    """
    if pd.isna(n):
        return ""
    magnitude = abs(n)
    # Largest scale first so a value is matched by the biggest unit it reaches.
    for threshold, suffix in ((1e12, "B"), (1e9, "mM")):
        if magnitude >= threshold:
            return f"{n / threshold:.1f}{suffix}"
    return f"{n / 1e6:.0f}M"

def text_label(ax, x, y, s, fontsize=10, offset=0.0):
    """Place a horizontally centred text label at ``(x, y + offset)``.

    The label is drawn in black, bottom-aligned, inside a rounded white box
    with no box edge (alpha 0.9), and the glyphs get a 1.5-wide black stroke
    via path effects.

    Parameters
    ----------
    ax : object exposing ``text`` (a Matplotlib Axes in this notebook)
    x, y : anchor position in data coordinates
    s : label text
    fontsize : font size passed to ``ax.text``
    offset : amount added to ``y`` before placing the label

    Returns
    -------
    The text artist returned by ``ax.text``.
    """
    box_style = dict(boxstyle="round,pad=0.2", fc="white", ec="none", alpha=0.9)
    outline = [pe.Stroke(linewidth=1.5, foreground="black"), pe.Normal()]

    label = ax.text(
        x, y + offset, s,
        ha="center", va="bottom", fontsize=fontsize, color="black",
        bbox=box_style,
    )
    label.set_path_effects(outline)
    return label



## 1. Carga y transformación de datos

Leemos **`coffee_db.parquet`** y lo pasamos de formato ancho (`1990/91, 1991/92,...`) a largo.

In [None]:
# Read the raw table (wide format: one column per crop year, e.g. "1990/91").
df = pd.read_parquet("coffee_db.parquet")

# Identifier columns are kept as-is; every column whose name contains "/"
# is treated as a crop-year column and unpivoted.
fixed = ["Country", "Coffee type", "Total_domestic_consumption"]
year_cols = [c for c in df.columns if "/" in c]

# Wide -> long in a single chain, so `df` itself is never mutated.
df_long = (
    df.melt(id_vars=fixed, value_vars=year_cols,
            var_name="Year", value_name="Consumption")
    # keep only the part before the slash as an integer year: "1990/91" -> 1990
    .assign(Year=lambda d: d["Year"].str.split("/").str[0].astype(int))
    .rename(columns={
        "Country": "Pais",
        "Coffee type": "Tipo_Cafe",
        "Total_domestic_consumption": "Consumo_Total_Original",
    })
    .sort_values(["Pais", "Tipo_Cafe", "Year"])
    .reset_index(drop=True)
)

df_long.head()


In [None]:
# Preview the first rows of the raw (wide-format) table.
df.head()

In [None]:
# List the raw column names (identifier columns plus one column per crop year).
df.columns

In [None]:
# Distinct values of the raw "Country" column.
df['Country'].unique()

In [None]:
# Distinct values of the raw "Coffee type" column.
df['Coffee type'].unique()

## 2. Funciones auxiliares para gráficos

Para mantener el código limpio, definimos helpers que agregan etiquetas automáticamente.


In [None]:
def label_line_last(ax, x_last, y_last, text:str):
    """Write ``text`` in bold just to the right of the point ``(x_last, y_last)``.

    The text is prefixed with two spaces so it does not touch the marker, and
    is left-aligned / vertically centred on the given point.
    """
    style = {"va": "center", "ha": "left", "fontsize": 10, "weight": "bold"}
    ax.text(x_last, y_last, "  {}".format(text), **style)

def label_line_every_k(ax, x_vals, y_vals, k:int=5, fontsize:int=9):
    """Label every ``k``-th point of a series, starting with the first one.

    Points are taken pairwise from ``x_vals`` / ``y_vals``; the label text is
    ``fmt_millions`` of the y value, centred above the point.
    """
    for position, (xv, yv) in enumerate(zip(x_vals, y_vals)):
        if position % k:
            # not a multiple of k -> leave this point unlabelled
            continue
        ax.text(xv, yv, fmt_millions(yv), ha="center", va="bottom", fontsize=fontsize)

def label_bars(ax, fontsize:int=10, offset:int=5):
    """Annotate every patch in ``ax.patches`` with its height.

    The label is ``fmt_millions`` of the patch height, horizontally centred
    on the patch and shifted ``offset`` points upward from its top. Patches
    whose height is missing are skipped.
    """
    for patch in ax.patches:
        height = patch.get_height()
        if not pd.isna(height):
            x_mid = patch.get_x() + patch.get_width() / 2
            ax.annotate(
                fmt_millions(height), (x_mid, height),
                ha="center", va="bottom",
                xytext=(0, offset), textcoords="offset points",
                fontsize=fontsize, weight="bold",
            )


## 3. Tendencia global del consumo de café


In [None]:
# === Cell 3: global coffee consumption trend (a label on every point) ===
# Total consumption per year across all rows of the long table.
global_ts = df_long.groupby("Year", as_index=False)["Consumption"].sum()

fig, ax = plt.subplots()
ax.plot(global_ts["Year"], global_ts["Consumption"],
        marker="o", linewidth=2.5, color="steelblue")

ax.set_title("Consumo mundial de café (1990–2019)")
ax.set_xlabel("Año")
ax.set_ylabel("Consumo (tazas)")

# Label each point with its formatted value.
for year, total in zip(global_ts["Year"], global_ts["Consumption"]):
    ax.text(year, total, fmt_millions(total),
            ha="center", va="bottom", fontsize=9, weight="bold")

# One tick per year on the x axis.
ax.set_xticks(global_ts["Year"])
ax.tick_params(axis="x", rotation=75)

plt.tight_layout()
plt.show()


## 4. Evolución del consumo en los 10 países principales


In [None]:
# === BRAZIL: a label on every point ===
# Yearly total for the rows whose country is "Brazil".
br = (
    df_long[df_long["Pais"] == "Brazil"]
    .groupby("Year", as_index=False)["Consumption"]
    .sum()
)

fig, ax = plt.subplots(figsize=(26, 8), dpi=160)
ax.plot(br["Year"], br["Consumption"], marker="o", linewidth=3, color="#1f77b4", zorder=2)

ax.set_title("Consumo de café – Brazil (1990–2019)")
ax.set_xlabel("Año")
ax.set_ylabel("Consumo (tazas)")
ax.set_xticks(br["Year"])
ax.tick_params(axis="x", rotation=60)

# Boxed labels on every point. The vertical shift alternates between
# +1.5% and -1.0% of the current y-axis span so neighbouring labels do not
# sit at the same height.
y_low, y_high = ax.get_ylim()
y_span = y_high - y_low
for idx, (year, value) in enumerate(zip(br["Year"], br["Consumption"])):
    shift = (0.015 if idx % 2 == 0 else -0.010) * y_span
    text_label(ax, year, value, fmt_millions(value), fontsize=11, offset=shift)

ax.margins(x=0.01, y=0.15)
plt.tight_layout()
plt.show()


In [None]:
# === REST OF COUNTRIES (top 9 excluding Brazil, no data labels) ===
otros = df_long[df_long["Pais"] != "Brazil"].copy()

# Rank countries by their summed consumption over all rows and keep the top 9.
top9 = (otros.groupby("Pais")["Consumption"]
        .sum().sort_values(ascending=False).head(9).index)
df_otros = otros[otros["Pais"].isin(top9)]

# Plot one value per country and year, the same way the Brazil cell aggregates.
# Without this, a country with more than one row per year would be drawn with
# several points per year and the line would zig-zag. `min_count=1` keeps a
# year whose values are all missing as NaN (a gap) instead of a fake zero.
df_otros_anual = (df_otros.groupby(["Pais", "Year"], as_index=False)["Consumption"]
                  .sum(min_count=1))

fig, ax = plt.subplots(figsize=(26, 12), dpi=150)

palette = sns.color_palette("tab10", n_colors=len(top9))
for (pais, sub), color in zip(df_otros_anual.groupby("Pais"), palette):
    sub = sub.sort_values("Year").reset_index(drop=True)
    ax.plot(sub["Year"], sub["Consumption"], marker="o", linewidth=2.2,
            color=color, alpha=0.95, label=pais)

ax.set_title("Consumo de café – Resto de países (Top 9 sin Brazil)")
ax.set_xlabel("Año"); ax.set_ylabel("Consumo (tazas)")

# One tick per year on the x axis.
years_sorted = sorted(df_otros["Year"].unique())
ax.set_xticks(years_sorted)
ax.tick_params(axis="x", rotation=60, labelsize=10)

# Legend placed outside the plotting area, to the right.
ax.legend(title="País", bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False)

ax.margins(x=0.01, y=0.05)
plt.tight_layout()
plt.show()



## 5. Consumo acumulado global por tipo de café


In [None]:
# Total consumption per coffee type, largest first.
by_type = (
    df_long.groupby("Tipo_Cafe", as_index=False)["Consumption"].sum()
    .sort_values("Consumption", ascending=False)
)

fig, ax = plt.subplots()
ax.bar(
    by_type["Tipo_Cafe"],
    by_type["Consumption"],
    color=sns.color_palette("Set2", n_colors=len(by_type)),
)
ax.set_title("Consumo acumulado global por tipo de café (1990–2019)")
ax.set_xlabel("Tipo de café")
ax.set_ylabel("Consumo total (tazas)")
label_bars(ax)  # write the formatted total above each bar
plt.show()


## 6. Países con mayor CAGR (tasa de crecimiento anual compuesta)


In [None]:
def compute_cagr(df_long, min_years:int=15):
    """Compound annual growth rate (CAGR) of consumption per country.

    For each ``Pais`` the rate is ``(end_val / start_val) ** (1 / years) - 1``,
    where ``start_val`` / ``end_val`` are the country's total consumption in
    its first / last year that has data and ``years`` is the distance between
    those two years.

    Parameters
    ----------
    df_long : DataFrame with columns ``Pais``, ``Year`` and ``Consumption``.
    min_years : minimum span (``end_year - start_year``) a country must cover
        to be included.

    Returns
    -------
    DataFrame with columns ``Pais``, ``start_year``, ``end_year``,
    ``start_val``, ``end_val``, ``years`` and ``CAGR``, sorted by ``CAGR``
    in descending order. Countries whose starting value is not positive, or
    whose span is zero, are left out.
    """
    # Drop rows without a value first: groupby "first"/"last" skip missing
    # values while "min"/"max" of Year would not, so the year span could
    # otherwise disagree with the values actually used.
    # Then collapse to one total per country and year, so several rows for
    # the same year (e.g. several coffee types) are not picked arbitrarily.
    yearly = (df_long.dropna(subset=["Consumption"])
              .groupby(["Pais", "Year"], as_index=False)["Consumption"].sum()
              .sort_values(["Pais", "Year"]))

    g = (yearly.groupby("Pais")
         .agg(start_year=("Year", "min"),
              end_year=("Year", "max"),
              start_val=("Consumption", "first"),
              end_val=("Consumption", "last"))
         .reset_index())
    g["years"] = g["end_year"] - g["start_year"]

    # A positive base is required for the ratio; a zero-length span would
    # divide by zero in the exponent.
    keep = (g["start_val"] > 0) & (g["years"] >= min_years) & (g["years"] > 0)
    g = g[keep].copy()  # copy so the column assignment below is not on a view

    g["CAGR"] = (g["end_val"] / g["start_val"]) ** (1 / g["years"]) - 1
    return g.sort_values("CAGR", ascending=False)

# Keep the 10 countries with the highest CAGR among those spanning at least 15 years.
cagr_df = compute_cagr(df_long, min_years=15).head(10)

In [None]:
# cagr_df was computed in the previous cell; convert the rates to percent.
vals = (cagr_df["CAGR"] * 100).to_numpy()

fig, ax = plt.subplots(figsize=(10, 7))
bars = ax.barh(cagr_df["Pais"], vals, color=sns.color_palette("viridis", len(vals)))

ax.set_title("Top 10 países por Tasa de Crecimiento (CAGR, 1990–2019)")
ax.set_xlabel("Crecimiento anual (%)")
ax.set_ylabel("País")

# Leave 12% extra room on the right so the value labels fit.
maxv = vals.max()
ax.set_xlim(0, maxv * 1.12)

# Gap between bar end and label: 1% of the x-axis span.
x_left, x_right = ax.get_xlim()
xpad = (x_right - x_left) * 0.01

# Place each label from the bar's own geometry: just past its end,
# vertically centred on the bar.
for bar, v in zip(bars, vals):
    y_mid = bar.get_y() + bar.get_height() / 2
    ax.text(bar.get_width() + xpad, y_mid, f"{v:.2f}%",
            va="center", ha="left", fontsize=10, color="black")

plt.tight_layout()
plt.show()

In [None]:
#!pip install yfinance pandas_datareader

In [None]:
#!pip install yfinance

# Extraer los datos del precio del café

In [None]:
import yfinance as yf

# Download monthly data ("1mo" interval) for the "KC=F" ticker over the given
# date range, then print the first rows to check what came back.
coffee = yf.download("KC=F", start="1990-01-01", end="2020-12-31", interval="1mo")
print(coffee.head())

## Paso 1: preparar precios anuales

In [None]:
# If coffee["Close"] is a multi-column DataFrame, take its first column.
close_series = coffee["Close"].iloc[:,0] if isinstance(coffee["Close"], pd.DataFrame) else coffee["Close"]

# Yearly average of the close price.
# "YE" (year end) is the current pandas alias; the bare "Y" alias is
# deprecated, and this notebook already uses "YE" for its date ranges.
price_annual = (
    close_series
    .resample("YE").mean()
    .to_frame(name="Price_USc_lb")
    .reset_index()
)

# Keep only the calendar year so the table can be joined on "Year".
price_annual["Year"] = price_annual["Date"].dt.year
price_annual = price_annual[["Year", "Price_USc_lb"]]

print(price_annual.head())


## Paso 2: unir con consumo global

In [None]:
# Total consumption per year, joined with the yearly price table.
# The inner join keeps only the years present in both tables.
global_ts = df_long.groupby("Year", as_index=False)["Consumption"].sum()
cons_price = global_ts.merge(price_annual, on="Year", how="inner")

print(cons_price.head())


In [None]:
# === Yearly average coffee price (KC=F) with data labels ===
# If coffee["Close"] is a multi-column DataFrame, take its first column.
close_series = coffee["Close"].iloc[:, 0] if isinstance(coffee["Close"], pd.DataFrame) else coffee["Close"]

# "YE" (year end) is the current pandas alias; the bare "Y" alias is
# deprecated, and this notebook already uses "YE" for its date ranges.
price_annual = (
    close_series.resample("YE").mean()
    .to_frame(name="Price_USc_lb")
    .reset_index()
)
price_annual["Year"] = price_annual["Date"].dt.year
price_annual = price_annual[["Year", "Price_USc_lb"]]

fig, ax = plt.subplots(figsize=(14,6))
ax.plot(price_annual["Year"], price_annual["Price_USc_lb"],
        marker="o", linewidth=2.5, color="firebrick")

ax.set_title("Precio promedio anual del café (KC=F)", fontsize=16)
ax.set_xlabel("Año", fontsize=12)
ax.set_ylabel("Precio futuros (¢/lb)", fontsize=12)

# Label every point with its rounded value.
for _, r in price_annual.iterrows():
    ax.text(r["Year"], r["Price_USc_lb"], f"{r['Price_USc_lb']:.0f}¢",
            ha="center", va="bottom", fontsize=11, weight="bold", color="black")

# One tick per year on the x axis.
ax.set_xticks(price_annual["Year"])
ax.tick_params(axis="x", rotation=75)

plt.tight_layout()
plt.show()


### Preparar datos para Prophet

In [None]:
#!pip install prophet

In [None]:
from prophet import Prophet

# --- 1) Build the global yearly consumption series
global_ts = df_long.groupby("Year", as_index=False)["Consumption"].sum()

# Yearly average price, recomputed here under the column name "Price_USD_lb".
# NOTE(review): the original comment says the unit is "USD/lb or ¢/lb", while
# the price chart labels the same series in ¢/lb -- confirm the unit.
close_series = coffee["Close"].iloc[:, 0] if isinstance(coffee["Close"], pd.DataFrame) else coffee["Close"]
# "YE" (year end) is the current pandas alias; the bare "Y" alias is
# deprecated, and this notebook already uses "YE" for its date ranges.
price_annual = (
    close_series.resample("YE").mean().to_frame("Price_USD_lb").reset_index()
)
price_annual["Year"] = price_annual["Date"].dt.year
price_annual = price_annual[["Year", "Price_USD_lb"]]

# Inner join: only years present in both tables are kept.
cons_price = global_ts.merge(price_annual, on="Year", how="inner")

# Prophet needs real dates, not bare years: use 1 January of each year.
cons_price["ds"] = pd.to_datetime(cons_price["Year"].astype(str) + "-01-01")
cons_price["y"] = cons_price["Consumption"]

# Final training table: date, target, and the price regressor.
df_prophet = cons_price[["ds", "y", "Price_USD_lb"]]


In [None]:
# Build the Prophet model with all built-in seasonalities turned off and
# register the price column as an extra regressor.
m = Prophet(yearly_seasonality=False, weekly_seasonality=False, daily_seasonality=False)
m.add_regressor("Price_USD_lb")

# Fit on the prepared table (columns ds, y, Price_USD_lb).
m.fit(df_prophet)

In [None]:
# Future dates through 2025.
# "YS" (year start) yields 2020-01-01 ... 2025-01-01, matching the training
# dates, which are all 1 January. "YE" would yield 31 December dates
# (2020-12-31 ... 2024-12-31): misaligned with the history and stopping
# short of 2025.
future = pd.DataFrame({"ds": pd.date_range("2020-01-01", "2025-01-01", freq="YS")})

# Use the last known price as a stand-in for every future year.
last_price = df_prophet["Price_USD_lb"].iloc[-1]
future["Price_USD_lb"] = last_price

# History followed by the future rows, so the forecast covers both.
future = pd.concat([df_prophet[["ds", "Price_USD_lb"]], future], ignore_index=True)


In [None]:
# Predict over the combined history + future frame built in the previous cell.
forecast = m.predict(future)

In [None]:
import matplotlib.dates as mdates

# === Global consumption forecast with data labels ===
fig, ax = plt.subplots(figsize=(14, 6))

# Observed history (values shown divided by 1e9).
ax.plot(
    df_prophet["ds"], df_prophet["y"] / 1e9,
    marker="o", linewidth=2.5, color="steelblue", label="Consumo histórico",
)

# Model output and its lower/upper band, on the same scale.
ax.plot(
    forecast["ds"], forecast["yhat"] / 1e9,
    color="firebrick", linewidth=2.5, label="Predicción",
)
ax.fill_between(
    forecast["ds"],
    forecast["yhat_lower"] / 1e9,
    forecast["yhat_upper"] / 1e9,
    color="firebrick", alpha=0.2, label="Intervalo",
)

ax.set_title("Predicción de Consumo Global de Café hasta 2025", fontsize=16)
ax.set_xlabel("Año", fontsize=12)
ax.set_ylabel("Consumo (miles de millones de tazas)", fontsize=12)

# Labels are positioned with the same datetime values the x axis uses
# (not bare years). Every forecast row gets a label.
for when, yhat in zip(forecast["ds"], forecast["yhat"]):
    ax.text(when, yhat / 1e9, f"{yhat / 1e9:.2f}",
            ha="center", va="bottom", fontsize=9)

# One major tick per year, shown as a four-digit year.
ax.xaxis.set_major_locator(mdates.YearLocator(base=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
fig.autofmt_xdate(rotation=75)

ax.legend()
plt.tight_layout()
plt.show()


In [None]:
#!pip install statsmodels

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `ts`, `pred` and `ci` are not defined in any cell visible in
# this notebook, so this cell raises NameError on a fresh kernel. It presumably
# relies on an ARIMA fitting cell that is missing -- restore it before this
# cell. From the usage below, `ts` and `pred` need a datetime-like index and
# `ci` needs two columns (lower, upper).

# Historical series
years_hist = ts.index.year
vals_hist = ts.values

# Prediction
years_pred = pred.index.year
vals_pred = pred.values

# Interval bounds: first column is used as lower, second as upper
ci_lower = ci.iloc[:, 0]
ci_upper = ci.iloc[:, 1]

fig, ax = plt.subplots(figsize=(14,6))

# Historical line
ax.plot(years_hist, vals_hist, marker="o", color="steelblue", linewidth=2, label="Histórico")

# Prediction line
ax.plot(years_pred, vals_pred, marker="o", color="firebrick", linewidth=2, label="Predicción")

# Shaded band between the lower and upper bounds
ax.fill_between(years_pred, ci_lower, ci_upper, color="firebrick", alpha=0.2, label="Intervalo")

# Data labels (historical)
for x, y in zip(years_hist, vals_hist):
    ax.text(x, y, f"{y:.0f}", ha="center", va="bottom", fontsize=8, color="black")

# Data labels (prediction)
for x, y in zip(years_pred, vals_pred):
    ax.text(x, y, f"{y:.0f}", ha="center", va="bottom", fontsize=8, color="black")

# Axis setup: one tick per historical and predicted year
all_years = list(years_hist) + list(years_pred)
ax.set_xticks(all_years)
ax.set_xticklabels(all_years, rotation=45)

ax.set_title("Predicción precio café (ARIMA)", fontsize=16)
ax.set_xlabel("Año", fontsize=12)
ax.set_ylabel("Precio", fontsize=12)
ax.legend()

plt.tight_layout()
plt.show()

