# **Week 3: Data Analysis in Astronomy**

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from astropy.table import Table
import scipy.optimize as opt
from scipy.stats import linregress

# **Linear Regression**

Linear regression is used to determine the relationship between two variables by fitting a straight line to the data.


In this example we fit a straight line to the relation between absolute V-band magnitude (M_V) and the half-light radius (r_half) of Milky Way satellite galaxies.
Because galaxy sizes span orders of magnitude, we take log10(r_half) so the relation becomes approximately linear. Here are the steps we take:
- Load the dwarf galaxy table and extract r_half and M_V,
- Filter out invalid or non-positive radii, then compute log10(r_half),
- Use `scipy.stats.linregress` to fit `M_V = slope * log10(r_half) + intercept`,
- Inspect slope, intercept, correlation coefficient (r) and p-value to assess the fit,
- Plot the data (r on a log x-axis) with the best-fit line, then plot the residuals (data minus model) in a separate figure to check for systematic deviations.
  

In [None]:
# Example 1: Linear Regression - Absolute Magnitude vs Half-light Radius for Milky Way Satellites
# ------------------------------------------------------------------------------------------------


def _column_to_float(column):
    """Return *column* as a plain float ndarray with masked entries set to NaN.

    Table columns read from a CSV may carry a mask for empty cells. Filling
    those entries with NaN lets the ``np.isfinite`` filter below reject them,
    instead of letting whatever placeholder value is stored under the mask
    slip into the fit as if it were a measurement.
    """
    return np.asarray(np.ma.asarray(column, dtype=float).filled(np.nan), dtype=float)


# Read the dwarf galaxy data from the online database
dsph_mw = Table.read('https://raw.githubusercontent.com/apace7/local_volume_database/main/data/dwarf_mw.csv')

# Extract the half-light radius and absolute magnitude columns as float arrays
r_half = _column_to_float(dsph_mw['rhalf_sph_physical'])  # Half-light radius (pc)
M_V = _column_to_float(dsph_mw['M_V'])                    # Absolute V-band magnitude

# Filter valid data (data cleaning): both quantities must be finite, and the
# radius must be strictly positive because we take its logarithm next.
mask = np.isfinite(r_half) & np.isfinite(M_V) & (r_half > 0)
r = r_half[mask]
M = M_V[mask]
log_r = np.log10(r)

# Perform linear regression: M_V = slope * log10(r) + intercept
# (std_err is the standard error of the fitted slope)
slope, intercept, r_value, p_value, std_err = linregress(log_r, M)

print(f"slope = {slope:.4f}, intercept = {intercept:.4f}, r = {r_value:.4f}, p = {p_value:.3e}, stderr = {std_err:.4f}")

# Plot data and best-fit line (x-axis shown in log scale)
plt.figure(figsize=(8,6))
plt.scatter(r, M, color='blue', label='Data')
# The model is a straight line in log10(r), so a handful of log-spaced points
# is enough to draw it on the logarithmic x-axis.
x_fit = np.logspace(log_r.min(), log_r.max(), 10)
y_fit = slope * np.log10(x_fit) + intercept
plt.plot(x_fit, y_fit, color='red', lw=2, label=f'Fit: M_V = {slope:.3f} log10(r) + {intercept:.2f}')
plt.xscale('log')
# Brighter objects have smaller (more negative) magnitudes; flip the axis so
# that brighter galaxies appear towards the top of the plot.
plt.gca().invert_yaxis()
plt.xlabel("Half-light Radius (pc)")
plt.ylabel("Absolute Magnitude (M$_V$)")
plt.title("MW Satellites: M$_V$ vs r$_{half}$")
plt.legend()
plt.grid(True, which='both', ls='--', alpha=0.4)

# Residuals plot (data minus model) in its own figure
plt.figure(figsize=(8,3))
resid = M - (slope * log_r + intercept)
plt.axhline(0, color='k', lw=0.8)  # reference line: a perfect fit has zero residual
plt.scatter(r, resid, color='gray')
plt.xscale('log')
plt.xlabel("Half-light Radius (pc)")
plt.ylabel("Residual (M$_V$)")
plt.title("Residuals of fit")
plt.grid(True, which='both', ls='--', alpha=0.4)

**Class activity: Distance Modulus**


In astronomy, we use the distance modulus equation to relate the apparent magnitude (m) and absolute magnitude (M) of a star to its distance (d) in parsecs:

\begin{equation}
m - M = 5 \log_{10}(d) - 5 \tag{1}
\end{equation}

Rearranging for distance:

\begin{equation}
d = 10^{\frac{m - M + 5}{5}} \tag{2}
\end{equation}


Your tasks:

1. Write a function to compute distances from the provided arrays `apparent_magnitude` and `absolute_magnitude` using the distance modulus formula.
```
absolute_magnitude = np.array([-1.5, 0.0, 1.0, 2.5, 4.0])
apparent_magnitude = np.array([5.2, 7.1, 8.5, 11.0, 12.8])
```
2. Convert the distances to `log10(distance)`; by Eq. (1), `log10(d) = (m - M + 5) / 5`, so `log10(d)` depends linearly on the magnitudes.
3. Use `scipy.stats.linregress` to fit a line to `log10(distance)` as a function of `apparent_magnitude`. Which variable is x and which is y?
4. Plot the data points and the best-fit line. Label axes and show the fitted equation on the plot.
5. Compute and plot residuals (data minus model). Are residuals randomly distributed?

Write your code to answer each step.


In [None]:
# @title
# Example 2: Linear Regression - Distance Modulus Relation
# ---------------------------------------------------------
def distance_modulus(m, M):
    """Return the distance in parsecs implied by the distance modulus.

    Inverts ``m - M = 5 * log10(d) - 5`` to ``d = 10 ** ((m - M + 5) / 5)``.

    Parameters
    ----------
    m : float or array-like
        Apparent magnitude(s).
    M : float or array-like
        Absolute magnitude(s); must broadcast against ``m``.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in parsecs, with the broadcast shape of ``m`` and ``M``.
    """
    # Coerce to arrays so plain Python lists/tuples work as well as scalars
    # and ndarrays (list - list would otherwise raise TypeError).
    m = np.asarray(m, dtype=float)
    M = np.asarray(M, dtype=float)
    return 10 ** ((m - M + 5) / 5)  # Returns distance in parsecs

# Magnitudes of the five sample stars used in the class activity
absolute_magnitude = np.array([-1.5, 0.0, 1.0, 2.5, 4.0])
apparent_magnitude = np.array([5.2, 7.1, 8.5, 11.0, 12.8])

# Steps 1 and 2: distances in parsecs, then their base-10 logarithm
distance = distance_modulus(apparent_magnitude, absolute_magnitude)
log_distance = np.log10(distance)

# Step 3: straight-line fit with x = apparent magnitude, y = log10(distance)
fit = linregress(apparent_magnitude, log_distance)
slope = fit.slope
intercept = fit.intercept
r_value = fit.rvalue
p_value = fit.pvalue
std_err = fit.stderr
print(f"slope = {slope:.4f}, intercept = {intercept:.4f}, r = {r_value:.4f}, p = {p_value:.3e}")

# Step 4: data points with the fitted line drawn across the sampled magnitude range
fig_fit, ax_fit = plt.subplots(figsize=(8, 6))
ax_fit.scatter(apparent_magnitude, log_distance, label='Data')
x_fit = np.linspace(apparent_magnitude.min(), apparent_magnitude.max(), 200)
y_fit = slope * x_fit + intercept
ax_fit.plot(
    x_fit,
    y_fit,
    color='red',
    label=f'Fit: log10(d) = {slope:.3f} m + {intercept:.3f}',
)
ax_fit.set_xlabel("Apparent Magnitude (m)")
ax_fit.set_ylabel("log10(Distance [pc])")
ax_fit.set_title("Distance Modulus: log10(d) vs Apparent Magnitude")
ax_fit.legend()
ax_fit.grid(alpha=0.3)

# Step 5: residuals (data minus model), drawn in their own figure
model_at_data = slope * apparent_magnitude + intercept
resid = log_distance - model_at_data
fig_resid, ax_resid = plt.subplots(figsize=(8, 3))
ax_resid.axhline(0, color='k', lw=0.8)  # zero line marks a perfect fit
ax_resid.scatter(apparent_magnitude, resid)
ax_resid.set_xlabel("Apparent Magnitude (m)")
ax_resid.set_ylabel("Residual (log10 d)")
ax_resid.set_title("Residuals of fit")
ax_resid.grid(alpha=0.3)
