## Density study

In [None]:
# Per-year density study: total birds per (year, transect), scaled by the
# largest transect total of the same year, then one bar chart per year.

# Work on a copy so the cleaned source table is not modified by this study.
df = clean_nom_francais.copy()
# errors="coerce" turns unparseable dates into NaT, which gives a NaN year.
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["year"] = df["date"].dt.year

# --- Compute total birds per transect per year ---
transect_year_counts = (
    df.groupby(["year", "Nom transect"])["TOT_AV_V"]
      .sum()
      .reset_index()
      .rename(columns={"TOT_AV_V": "Bird_count"})
)

# --- Normalize counts per year ---
# Each total is divided by the maximum total of its own year, so the busiest
# transect of every year sits at 1. A year whose maximum is 0 yields NaN.
yearly_max = transect_year_counts.groupby("year")["Bird_count"].transform("max")
transect_year_counts["Normalized_density"] = (
    transect_year_counts["Bird_count"] / yearly_max
)

# --- Assign a consistent color per transect ---
transects = sorted(transect_year_counts["Nom transect"].unique())
colors = plt.cm.tab20(np.linspace(0, 1, len(transects)))  # up to 20 distinct colors
color_map = dict(zip(transects, colors))

# --- Plot normalized density distributions per year ---
years = sorted(transect_year_counts["year"].dropna().unique())

for yr in years:
    subset = transect_year_counts[transect_year_counts["year"] == yr]
    plt.figure(figsize=(10, 5))
    # One vectorized call draws every bar; the color list follows row order.
    plt.bar(
        subset["Nom transect"],
        subset["Normalized_density"],
        color=[color_map[name] for name in subset["Nom transect"]],
        edgecolor="black",
        linewidth=0.5,
    )
    plt.title(f"Normalized bird observation density per transect — {yr}")
    plt.xlabel("Transect")
    plt.ylabel("Normalized density (0–1)")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

In [None]:
# Global-max density study: same per-(year, transect) totals as the per-year
# study, but scaled by the single largest total over all years so that the
# yearly charts share one scale.

# Re-applying the conversion is harmless and keeps `year` in sync with `date`.
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df["year"] = df["date"].dt.year

# --- Compute total birds per transect per year ---
transect_year_counts = (
    df.groupby(["year", "Nom transect"])["TOT_AV_V"]
      .sum()
      .reset_index()
      .rename(columns={"TOT_AV_V": "Bird_count"})
)

# --- Normalize globally (using the all-time maximum count) ---
global_max = transect_year_counts["Bird_count"].max()
transect_year_counts["Normalized_density"] = transect_year_counts["Bird_count"] / global_max

# --- Assign a consistent color per transect ---
transects = sorted(transect_year_counts["Nom transect"].unique())
colors = plt.cm.tab20(np.linspace(0, 1, len(transects)))  # up to 20 distinct colors
color_map = dict(zip(transects, colors))

# --- Plot evolution by year ---
years = sorted(transect_year_counts["year"].dropna().unique())

for yr in years:
    # Densest transect first, so each chart reads as a ranking.
    subset = (
        transect_year_counts[transect_year_counts["year"] == yr]
        .sort_values("Normalized_density", ascending=False)
    )

    plt.figure(figsize=(10, 5))
    # One vectorized call draws every bar; the color list follows row order.
    plt.bar(
        subset["Nom transect"],
        subset["Normalized_density"],
        color=[color_map[name] for name in subset["Nom transect"]],
        edgecolor="black",
        linewidth=0.5,
    )

    plt.title(f"Bird density per transect — normalized (global max=1) — {yr}")
    plt.xlabel("Transect")
    plt.ylabel("Normalized density (0–1)")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

## Bootstrap Method

In [None]:
# --- Parameters ---
B = 1000  # number of bootstrap samples
alpha = 0.05  # for 95% CI


# --- Helper: bootstrap function ---
def bootstrap_ci(data, func=np.mean, B=1000, alpha=0.05, rng=None):
    """
    Compute a bootstrap confidence interval for a given statistic (func).

    The interval is the *basic* (reflected) bootstrap interval: the bootstrap
    quantiles are mirrored around the observed statistic, i.e.
    ``[2 * theta_hat - q_high, 2 * theta_hat - q_low]``. This is not the plain
    percentile interval ``[q_low, q_high]``, and its lower bound can fall
    below the smallest value the statistic can take.

    Parameters
    ----------
    data : 1-D array-like
        Observations to resample.
    func : callable
        Statistic mapping a 1-D array to a scalar (default ``np.mean``).
    B : int
        Number of bootstrap resamples (must be >= 1).
    alpha : float
        Significance level in (0, 1); the interval has nominal coverage
        ``1 - alpha``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When ``None`` (default) the global
        ``np.random`` state is used, as before.

    Returns
    -------
    (theta_hat, ci_low, ci_high)
        The statistic on the original data and the interval bounds.
        ``(nan, nan, nan)`` when ``data`` is empty.

    Raises
    ------
    ValueError
        If ``B < 1`` or ``alpha`` is not strictly between 0 and 1
        (only checked for non-empty data).
    """
    data = np.asarray(data)
    n = len(data)
    if n == 0:
        return np.nan, np.nan, np.nan  # handle empty groups

    if B < 1:
        raise ValueError(f"B must be at least 1, got {B}")
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")

    # Both the np.random module and Generator/RandomState objects expose
    # .choice(a, size=..., replace=...), so they can be used interchangeably.
    sampler = np.random if rng is None else rng

    # Original estimate
    theta_hat = func(data)

    # Bootstrap resamples: same size as the data, drawn with replacement
    boot_estimates = np.array(
        [func(sampler.choice(data, size=n, replace=True)) for _ in range(B)]
    )

    # Quantiles of bootstrap distribution
    q_low, q_high = np.quantile(boot_estimates, [alpha / 2, 1 - alpha / 2])

    # Reflected (basic bootstrap) confidence interval: the upper quantile
    # produces the lower bound and vice versa.
    ci_low = 2 * theta_hat - q_high
    ci_high = 2 * theta_hat - q_low

    return theta_hat, ci_low, ci_high



In [None]:
# --- Compute bootstrap estimates per transect-year ---
# For every (year, transect) group the individual observation counts are
# resampled and their sum is the statistic of interest.
bootstrap_results = []
for (year, transect), group in df.groupby(["year", "Nom transect"]):
    # Drop missing counts first: np.sum would otherwise return NaN for the
    # whole group, whereas the pandas .sum() used for the density tables
    # skips NaN. A group with no valid count yields (nan, nan, nan).
    counts = group["TOT_AV_V"].dropna().to_numpy()
    # Use the notebook-level B / alpha parameters instead of repeating them.
    theta_hat, ci_low, ci_high = bootstrap_ci(counts, func=np.sum, B=B, alpha=alpha)
    bootstrap_results.append({
        "year": year,
        "transect": transect,
        "total_birds": theta_hat,
        "ci_low": ci_low,
        "ci_high": ci_high
    })

# Explicit columns keep the frame well-formed even when there are no groups.
bootstrap_df = pd.DataFrame(
    bootstrap_results,
    columns=["year", "transect", "total_birds", "ci_low", "ci_high"],
)

# --- Normalize by global maximum total (for comparable densities) ---
# The interval bounds are divided by the same constant as the estimate, so
# they stay on the same scale as density_norm.
global_max = bootstrap_df["total_birds"].max()
bootstrap_df["density_norm"] = bootstrap_df["total_birds"] / global_max
bootstrap_df["ci_low_norm"] = bootstrap_df["ci_low"] / global_max
bootstrap_df["ci_high_norm"] = bootstrap_df["ci_high"] / global_max

display(bootstrap_df.head())

In [None]:
# One bar chart per year: normalized density per transect with the bootstrap
# confidence interval drawn as asymmetric error bars.
years = sorted(bootstrap_df["year"].unique())

for yr in years:
    subset = bootstrap_df[bootstrap_df["year"] == yr].sort_values("density_norm", ascending=False)

    # Error-bar lengths are distances from the bar top to each CI bound.
    # The reflected interval does not guarantee ci_low <= estimate <= ci_high,
    # and matplotlib rejects negative yerr values, so clip each arm at 0.
    lower_err = (subset["density_norm"] - subset["ci_low_norm"]).clip(lower=0)
    upper_err = (subset["ci_high_norm"] - subset["density_norm"]).clip(lower=0)

    plt.figure(figsize=(10, 5))
    plt.bar(
        subset["transect"],
        subset["density_norm"],
        yerr=[lower_err, upper_err],
        capsize=3
    )
    plt.title(f"Bootstrap-estimated normalized bird density per transect — {yr}")
    plt.xlabel("Transect")
    plt.ylabel("Normalized density (0–1)")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()


In [None]:
# --- Identify the top transects by mean normalized density across all years ---
top_n = 5  # number of transects to keep; also used in the printout and title
top_transects = (
    bootstrap_df.groupby("transect")["density_norm"]
    .mean()
    .sort_values(ascending=False)
    .head(top_n)
    .index
)

print(f"Top {top_n} transects with highest mean normalized density:")
print(top_transects.tolist())

# --- Filter data for only those transects ---
top_df = bootstrap_df[bootstrap_df["transect"].isin(top_transects)]

# --- Plot temporal evolution for the top transects ---
plt.figure(figsize=(10, 6))
for transect, group in top_df.groupby("transect"):
    # Sort by year so the line is always drawn left to right.
    group = group.sort_values("year")
    plt.plot(group["year"], group["density_norm"], marker="o", label=transect)

plt.title(f"Temporal evolution of normalized bird density — Top {top_n} transects (after bootstrapping)")
plt.xlabel("Year")
plt.ylabel("Normalized density (0–1)")
plt.legend(title="Transect", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.grid(True, linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

### Shannon and Simpson diversity indices

In [None]:
# --- Step 1: total count of each species within every (year, transect) ---
# Summing the grouped column gives a Series indexed by the three keys;
# reset_index(name=...) flattens it and names the value column "count".
species_counts = (
    df.groupby(["year", "Nom transect", "ESPECE"])["TOT_AV_V"]
    .sum()
    .reset_index(name="count")
)

# --- Step 2: Define diversity index functions ---
def shannon_index(values):
    """
    Return the Shannon entropy H' = -sum(p_i * ln(p_i)) of ``values``.

    ``values`` is any 1-D array-like of non-negative weights (counts or
    densities). Entries that are not strictly positive (zeros, negatives,
    NaN) are ignored, and the rest are rescaled to proportions summing to 1.

    Returns 0.0 when no positive entry remains, and exactly 0.0 for a single
    category. No epsilon is added inside the logarithm: all proportions are
    strictly positive after filtering, and an epsilon would bias the result
    (it makes the single-category entropy slightly negative).
    """
    values = np.asarray(values, dtype=float)
    values = values[values > 0]
    if values.size == 0:
        return 0.0
    proportions = values / values.sum()
    entropy = -np.sum(proportions * np.log(proportions))
    # Adding 0.0 turns a possible -0.0 (single category) into +0.0.
    return float(entropy) + 0.0

def simpson_index(values):
    """
    Return the Gini-Simpson diversity index D = 1 - sum(p_i ** 2).

    ``values`` is any 1-D array-like of non-negative weights (counts or
    densities). Entries that are not strictly positive (zeros, negatives,
    NaN) are ignored, and the rest are rescaled to proportions summing to 1.

    Returns 0.0 when no positive entry remains. Without this guard an empty
    input would give 1 - 0 = 1, i.e. maximal diversity for no observations.
    """
    values = np.asarray(values, dtype=float)
    values = values[values > 0]
    if values.size == 0:
        return 0.0
    proportions = values / values.sum()
    return float(1 - np.sum(proportions ** 2))


# ===========================
# 3️⃣ After bootstrapping — using bootstrap_df
# ===========================
# For each year, the normalized densities of all transects are treated as one
# distribution; both diversity indices are computed on it, and bootstrap_ci
# resamples the transects to obtain an interval.
boot_diversity = []
for year, group in bootstrap_df.groupby("year"):
    vals = group["density_norm"].values
    # The first returned value is the index on the observed densities
    # (the point estimate), not an average over bootstrap replicates.
    sh_hat, sh_low, sh_high = bootstrap_ci(vals, shannon_index, B=B, alpha=alpha)
    si_hat, si_low, si_high = bootstrap_ci(vals, simpson_index, B=B, alpha=alpha)
    boot_diversity.append({
        "year": year,
        "Shannon_boot": sh_hat,
        "Shannon_low": sh_low,
        "Shannon_high": sh_high,
        "Simpson_boot": si_hat,
        "Simpson_low": si_low,
        "Simpson_high": si_high
    })
boot_diversity = pd.DataFrame(boot_diversity)

# Label derived from alpha so the legend always matches the interval drawn.
ci_label = f"{100 * (1 - alpha):.0f}% CI"

# --- Plot each index with its confidence band ---
# NOTE(review): the titles say "raw vs bootstrap" and the labels say "(mean)",
# but only the bootstrap-table point estimate is plotted here.
for index_name, line_style in (("Shannon", "o-"), ("Simpson", "s-")):
    plt.figure(figsize=(9, 5))
    plt.plot(
        boot_diversity["year"],
        boot_diversity[f"{index_name}_boot"],
        line_style,
        label=f"Bootstrap {index_name} (mean)",
    )
    plt.fill_between(
        boot_diversity["year"],
        boot_diversity[f"{index_name}_low"],
        boot_diversity[f"{index_name}_high"],
        alpha=0.3,
        label=ci_label,
    )
    plt.title(f"{index_name} diversity of transect densities — raw vs bootstrap (with CI)")
    plt.xlabel("Year")
    plt.ylabel(f"{index_name} index (spatial diversity)")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.6)
    plt.tight_layout()
    plt.show()



## Linear regression of density + P-value

In [None]:
# Compute mean density per year
# transect_year_counts stores the normalized value in "Normalized_density"
# (it has no "density_norm" column); rename the yearly mean so the rest of
# the cell can keep using "density_norm".
# NOTE(review): the result depends on which normalization cell ran last
# (per-year max or global max) - confirm the global one is intended.
mean_density = (
    transect_year_counts.groupby("year")["Normalized_density"]
    .mean()
    .rename("density_norm")
    .reset_index()
    .sort_values("year")
)

# Define variables for regression
# NOTE(review): `sm` is presumably `statsmodels.api`, imported elsewhere.
X = sm.add_constant(mean_density["year"])  # add intercept
y = mean_density["density_norm"]

# Fit the OLS (Ordinary Least Squares) regression model
model = sm.OLS(y, X).fit()

# Print model summary
print(model.summary())

# Extract the p-value for the year coefficient
p_value = model.pvalues["year"]
slope = model.params["year"]
r_squared = model.rsquared

print(f"Slope: {slope:.6f}")
print(f"p-value (year effect): {p_value:.6f}")
print(f"R²: {r_squared:.4f}")

# Predict fitted values and confidence interval
predictions = model.get_prediction(X)
pred_summary = predictions.summary_frame(alpha=0.05)

# Plot data + regression line + 95% confidence interval
# (mean_ci_* bounds are for the fitted mean, not for new observations)
plt.figure(figsize=(9, 6))
plt.scatter(mean_density["year"], mean_density["density_norm"], label="Observed mean densities")
plt.plot(mean_density["year"], pred_summary["mean"], label="Fitted regression line", linewidth=2)
plt.fill_between(
    mean_density["year"],
    pred_summary["mean_ci_lower"],
    pred_summary["mean_ci_upper"],
    alpha=0.3,
    label="95% confidence interval"
)
plt.title("Linear regression of mean normalized density over years")
plt.xlabel("Year")
plt.ylabel("Mean normalized density")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

# Summary of Density indicator (what we did)

## 🪶 Density Indicator — Summary of Computations

### 1. Definition
The density indicator measures the relative abundance of birds observed per transect and year.



### 2. Computation Steps

**(a) Counting per Transect and Year**  
From the cleaned observation dataset (`clean_nom_francais`), total bird counts were aggregated by `(year, transect)` by summing the `TOT_AV_V` column.

**(b) Normalization**  
Each transect’s annual count was normalized by the maximum count observed across all transects and years:

$$
\text{density\_norm}_{i,t} = \frac{\text{count}_{i,t}}{\max(\text{count}_{\text{all years}})}
$$

Densities are thus scaled to the range [0, 1].



### 3. Bootstrap Estimation

A bootstrap resampling method was used to estimate uncertainty:

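1. For each `(year, transect)` group, resample the individual observation counts with replacement \( B \) times (e.g. \( B = 1000 \)).
2. Compute the total bird count \( \hat{\theta}^* \) for each resample.
3. Obtain 95% confidence intervals by reflecting the empirical quantiles of the bootstrap distribution around the observed total \( \hat{\theta} \) (basic bootstrap interval), then divide the bounds by the global maximum total to express them on the normalized scale.

$$
\text{CI}_{95\%} = [\,2\hat{\theta} - \hat{\theta}^*_{97.5\%},\; 2\hat{\theta} - \hat{\theta}^*_{2.5\%}\,]
$$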



### 4. Derived Indicators and Visualization

- Annual mean normalized density computed and plotted over time.  
- Per-year density distribution visualized by transect (color-coded).  
- Temporal evolution of normalized densities visualized across transects.  
- Bootstrap mean and confidence intervals plotted for density trends.


### 5. Diversity Indices Applied to Density

**Shannon Diversity Index**
$$
H' = -\sum_i p_i \ln(p_i)
$$

**Simpson Diversity Index**
$$
D = 1 - \sum_i p_i^2
$$

Both indices computed from the normalized density distribution across transects:
- Calculated per year on raw and bootstrapped densities.
- Confidence intervals estimated via bootstrap resampling.
- Temporal evolution of both indices visualized.
- Additional plots generated for the top 5 transects with highest mean density.


### 6. Linear Regression and p-Value Computation

A **linear regression** was fitted using `statsmodels` to assess the presence of a temporal trend in mean normalized density:

$$
\text{density\_norm} = \beta_0 + \beta_1 \times \text{year} + \varepsilon
$$

- The regression was estimated with the **Ordinary Least Squares (OLS)** method.  
- The **slope** (\(\beta_1\)) quantifies the direction and magnitude of the density trend over years.  
- The **p-value** for the `year` coefficient tests the null hypothesis \(H_0: \beta_1 = 0\).  
- A **95% confidence interval** and **R²** statistic were also extracted to evaluate model fit.  
- The fitted regression line and its 95% confidence band were plotted together with observed mean densities.



