# Data Exploration for Exoplanet Detection
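This notebook explores the raw light curve data: it loads the dataset, plots sample light curves for each class (`LABEL` 2 = planet, 1 = no planet), compares flux statistics and distributions between the two classes, and highlights candidate transit dips in one planet light curve.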

In [None]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from utils import plot_light_curve

# Plot appearance: a matplotlib style sheet plus a seaborn colour palette,
# applied once here so every figure below picks them up.
style_name = 'seaborn-v0_8-darkgrid'
palette_name = 'husl'
plt.style.use(style_name)
sns.set_palette(palette_name)

## Load Raw Data

In [None]:
# Read the light-curve table from disk (relative path)
data_path = '../data/raw/exoTest.csv'
df = pd.read_csv(data_path)

# Gather the overview values first, then report them: the first ten column
# names and the number of rows carrying each LABEL value.
leading_columns = df.columns[:10].tolist()
label_counts = df['LABEL'].value_counts()

print(f"Data shape: {df.shape}")
print(f"\nColumns: {leading_columns}...")
print("\nLabel distribution:")
print(label_counts)

## Visualize Sample Light Curves

In [None]:
# Get flux columns
flux_columns = [col for col in df.columns if col.startswith('FLUX')]
print(f"Number of flux measurements: {len(flux_columns)}")

# Fixed seed so the same example curves are drawn on every Restart & Run All
SAMPLE_SEED = 42
# Number of example light curves per class (one subplot column each)
N_EXAMPLES = 2


def _plot_class_examples(axes_row, samples, class_name, line_fmt):
    """Draw one light curve per sampled row onto the matching axes.

    Parameters
    ----------
    axes_row : sequence of matplotlib Axes
        One row of the subplot grid; extra axes are left empty when
        `samples` has fewer rows than axes.
    samples : pandas.DataFrame
        Rows to plot; the values in `flux_columns` form the curve.
    class_name : str
        Prefix used in each subplot title.
    line_fmt : str
        Matplotlib format string (colour and line style).
    """
    for ax, (idx, row) in zip(axes_row, samples.iterrows()):
        flux = row[flux_columns].to_numpy()
        ax.plot(flux, line_fmt, linewidth=0.5)
        ax.set_title(f'{class_name} Light Curve (Index: {idx})')
        ax.set_xlabel('Time')
        ax.set_ylabel('Flux')
        ax.grid(True, alpha=0.3)


# Plot examples from each class
fig, axes = plt.subplots(2, N_EXAMPLES, figsize=(15, 10), squeeze=False)

# Planet examples (Label = 2)
planet_df = df[df['LABEL'] == 2]
# Cap the draw at the class size: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
planet_samples = planet_df.sample(
    min(N_EXAMPLES, len(planet_df)), random_state=SAMPLE_SEED
)
_plot_class_examples(axes[0], planet_samples, 'Planet', 'b-')

# Non-planet examples (Label = 1)
no_planet_df = df[df['LABEL'] == 1]
no_planet_samples = no_planet_df.sample(
    min(N_EXAMPLES, len(no_planet_df)), random_state=SAMPLE_SEED
)
_plot_class_examples(axes[1], no_planet_samples, 'No Planet', 'r-')

plt.tight_layout()
plt.show()

## Statistical Analysis

In [None]:
# Calculate basic statistics for each class
# LABEL encoding used in this notebook: 1 = no planet, 2 = planet
LABEL_NAMES = {1: 'No Planet', 2: 'Planet'}

stats_by_class = {}

# Series.unique() returns labels in order of first appearance in the file,
# so iterate in sorted order to get a deterministic row order.
for label in sorted(df['LABEL'].unique()):
    class_data = df.loc[df['LABEL'] == label, flux_columns]

    stats_by_class[label] = {
        'mean': class_data.mean().mean(),        # mean of the per-column means
        'std': class_data.std().mean(),          # mean of the per-column standard deviations
        'min': class_data.min().min(),           # smallest flux value in the class
        'max': class_data.max().max(),           # largest flux value in the class
        'median': class_data.median().median()   # median of the per-column medians
    }

# Name each row from its actual label value rather than assigning a fixed
# list of names by position: positional assignment mislabels the rows when
# label 2 appears before label 1 in the data, and fails outright when only
# one class is present. Labels missing from LABEL_NAMES keep their raw value.
stats_df = pd.DataFrame(stats_by_class).T.rename(index=LABEL_NAMES)
print("\nStatistics by Class:")
print(stats_df)

## Flux Distribution Analysis

In [None]:
# Plot flux distributions
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Sample data for visualization
sample_size = 100
# Fixed seed so the same rows are drawn on every Restart & Run All
SAMPLE_SEED = 42

planet_flux_rows = df.loc[df['LABEL'] == 2, flux_columns]
no_planet_flux_rows = df.loc[df['LABEL'] == 1, flux_columns]

# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling without replacement), so cap each draw at the class size.
planet_fluxes = (
    planet_flux_rows
    .sample(min(sample_size, len(planet_flux_rows)), random_state=SAMPLE_SEED)
    .values
    .flatten()
)
no_planet_fluxes = (
    no_planet_flux_rows
    .sample(min(sample_size, len(no_planet_flux_rows)), random_state=SAMPLE_SEED)
    .values
    .flatten()
)

# Histogram
# NOTE: counts are not normalised, so bar heights are only directly
# comparable between classes when both samples contain the same number of rows.
axes[0].hist(planet_fluxes, bins=50, alpha=0.5, label='Planet', color='blue')
axes[0].hist(no_planet_fluxes, bins=50, alpha=0.5, label='No Planet', color='red')
axes[0].set_xlabel('Flux Value')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Flux Distribution by Class')
axes[0].legend()

# Box plot
# Tick labels are set separately because the `labels=` keyword of boxplot is
# deprecated in recent Matplotlib releases.
axes[1].boxplot([planet_fluxes, no_planet_fluxes])
axes[1].set_xticklabels(['Planet', 'No Planet'])
axes[1].set_ylabel('Flux Value')
axes[1].set_title('Flux Distribution Box Plot')

plt.tight_layout()
plt.show()

## Transit Detection Visualization

In [None]:
# Find and visualize potential transits
# Uses the first row labelled as a planet (LABEL == 2).
planet_sample = df[df['LABEL'] == 2].iloc[0]
# Cast to float explicitly: a row taken from a mixed-dtype frame can come back
# as object dtype, which np.std cannot reduce.
flux = planet_sample[flux_columns].to_numpy(dtype=float)

# Detect dips (potential transits): points more than N_SIGMA standard
# deviations below the median flux of this light curve.
N_SIGMA = 2
median_flux = np.median(flux)
std_flux = np.std(flux)
threshold = median_flux - N_SIGMA * std_flux
transit_mask = flux < threshold
n_transit = int(transit_mask.sum())

# Plot
fig, ax = plt.subplots(figsize=(15, 5))
time = np.arange(len(flux))  # sample index stands in for time
ax.plot(time, flux, 'b-', linewidth=0.5, label='Flux')
ax.axhline(y=median_flux, color='g', linestyle='--', label='Median')
ax.axhline(y=threshold, color='r', linestyle='--', label='Transit Threshold')
ax.scatter(time[transit_mask], flux[transit_mask], color='red', s=10, label='Potential Transit')
ax.set_xlabel('Time')
ax.set_ylabel('Flux')
ax.set_title('Transit Detection in Planet Light Curve')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

print(f"Transit points: {n_transit} / {len(flux)} ({100*n_transit/len(flux):.2f}%)")
# The depth is the gap between the median and the mean of the flagged points;
# with no flagged points the mean of an empty slice would be NaN (plus a
# RuntimeWarning), so report that case explicitly instead.
if n_transit:
    print(f"Estimated transit depth: {median_flux - np.mean(flux[transit_mask]):.2f}")
else:
    print("Estimated transit depth: n/a (no points below threshold)")