![](https://www.googleapis.com/download/storage/v1/b/kaggle-forum-message-attachments/o/inbox%2F761268%2F449ed3cb8bc43f498fcd917733a33064%2FScreenshot%202024-08-04%20at%209.24.19PM.png?generation=1722786886542329&alt=media)

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.display import HTML, Markdown
import seaborn as sns
from glob import glob
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

sns.set(style='whitegrid')
comp_dir = "/kaggle/input/ariel-data-challenge-2024"

In [None]:
def render(values, style='section', level=1):
    """Render notebook annotations as styled HTML.

    Parameters
    ----------
    values : str or iterable
        Items to render. A bare string is treated as a single item rather
        than being iterated character by character.
    style : str
        'section'      - heading output is disabled; the call is a no-op.
        'insight'      - one bullet per item.
        'data-insight' - one card per item; text before the first ':' becomes
                         a bold title, the remainder becomes the card body.
        Any other value produces empty HTML.
    level : int
        Heading level for the 'section' style. Unused while section output
        is disabled.

    Relies on ``display`` and ``HTML`` being available in the notebook
    namespace.
    """
    if style == 'section':
        # Section headings are switched off; return before building (and
        # displaying) an empty HTML fragment.
        return

    if isinstance(values, str):
        values = [values]

    fragments = []
    has_titled_item = False
    for value in values:
        if style == 'insight':
            fragments.append(f'<ul class="bullet"><li class="insight"> - {value}</li></ul>')
        elif style == 'data-insight':
            # Split on the FIRST colon only, so bodies that contain further
            # colons (ratios, times, URLs) are not truncated.
            title, separator, body = str(value).partition(':')
            if separator:
                has_titled_item = True
                fragments.append(
                    f'<ul class="bullet"><li class="data-insight"><span>{title}</span>{body}</li></ul>'
                )
            else:
                fragments.append(f'<ul class="bullet"><li class="data-insight">{value}</li></ul>')

    # A rule closes the group whenever at least one titled card was emitted.
    if has_titled_item:
        fragments.append("<hr>")
    display(HTML("".join(fragments)))
def css_styling():
    """Return the notebook stylesheet wrapped in an ``HTML`` object.

    The stylesheet defines the classes used by ``render`` (``insight``,
    ``data-insight``, ``bullet``) plus ``section`` / ``sub_section`` heading
    classes and a global ``ul`` rule.
    """
    stylesheet = """
    <style>
        .section {
            color:#33A1C9;
            font-size:20px;
            font-weight: bold;
            padding-top:15px;
        }
        .sub_section {
            color:#f1c40f;
            font-size:16px;
            font-weight: bold;
            padding-top:10px;
        }
        .insight {
            color:teal;
            font-size:14px;
            font-weight: bold;
        }
        .data-insight {
            background-color: #f9f9f9;
            border-left: 4px solid #C70039;
            padding: 8px;
            margin-top: 5px;
            font-weight: normal;
        }
        .data-insight span{
            font-weight: bold;
            display: block;
        }
        .bullet {
            list-style-type: circle;
        }
        ul{
            margin:0px;
            display: contents;
            list-style-type: square;
        }
    </style>
    """
    return HTML(stylesheet)

css_styling()

# 🌑 Files 🌒

In [None]:
# Tables stored directly under the competition directory.
axis_info = pd.read_parquet(f'{comp_dir}/axis_info.parquet')
wavelengths = pd.read_csv(f'{comp_dir}/wavelengths.csv')
train_labels = pd.read_csv(f'{comp_dir}/train_labels.csv')
test_adc_info = pd.read_csv(f'{comp_dir}/test_adc_info.csv')
train_adc_info = pd.read_csv(f'{comp_dir}/train_adc_info.csv')

# List the files at the dataset root, shown relative to comp_dir.
root_prefix = f'{comp_dir}/'
root_paths = sorted(glob(f'{comp_dir}/*.*'))
files = [path.replace(root_prefix, '') for path in root_paths]
render(files, 'insight')

# List every file (recursively) of one sample planet folder.
planet_paths = sorted(glob(f'{comp_dir}/train/100468857/**/*.*', recursive=True))
files = [path.replace(root_prefix, '') for path in planet_paths]
render(files, 'insight')

In [None]:
render('Exoplanets Folder sizes')
def get_exoplanets_sizes(exoplanets_dir):
    """Collect size and timestamp metadata for each sub-directory.

    Parameters
    ----------
    exoplanets_dir : str or path-like
        Directory whose immediate sub-directories are inspected. Plain files
        directly inside it are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per sub-directory with the columns
        ``exoplanet`` (directory name), ``size`` (total size of all files
        below it, in MiB), ``created`` and ``modified`` (timestamps built
        from ``st_ctime`` / ``st_mtime``). The columns are present even when
        no sub-directory is found.

    Notes
    -----
    ``created`` is derived from ``st_ctime``. On Unix that field is the time
    of the last metadata change, not the creation time, so treat the column
    name with care on such platforms.
    """
    columns = ['exoplanet', 'size', 'created', 'modified']
    metadata = []
    # The context manager closes the directory handle deterministically.
    with os.scandir(exoplanets_dir) as entries:
        for entry in entries:
            if not entry.is_dir():
                continue
            stat = entry.stat()
            total_bytes = sum(
                os.path.getsize(os.path.join(dirpath, filename))
                for dirpath, _dirnames, filenames in os.walk(entry.path)
                for filename in filenames
            )
            metadata.append({
                'exoplanet': entry.name,
                'size': total_bytes / (1024**2),  # bytes -> MiB
                'created': pd.to_datetime(stat.st_ctime, unit='s'),
                'modified': pd.to_datetime(stat.st_mtime, unit='s')
            })
    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(metadata, columns=columns)


exoplanets_meta_df = get_exoplanets_sizes(f"{comp_dir}/train/")
exoplanets_meta_df.head()

In [None]:
# Add time-of-day views of both timestamps, then order the rows by folder name.
exoplanets_meta_df = (
    exoplanets_meta_df
    .assign(
        ctime=lambda frame: frame['created'].dt.time,
        mtime=lambda frame: frame['modified'].dt.time,
    )
    .sort_values(by=['exoplanet'])
)
exoplanets_meta_df

In [None]:
sns.boxplot(x=exoplanets_meta_df['size'])

In [None]:
# Integer view of both timestamps, floor-divided by 10**9.
# NOTE(review): this yields epoch seconds only if the columns have
# nanosecond resolution - confirm for the pandas version in use.
for source_col, target_col in (('created', 'ctimestamp'), ('modified', 'mtimestamp')):
    exoplanets_meta_df[target_col] = exoplanets_meta_df[source_col].astype('int64') // 10**9

# Folder name (y) against its ctime-derived timestamp (x).
fig, ax = plt.subplots(figsize=(10, 25))
ax.plot(exoplanets_meta_df['ctimestamp'], exoplanets_meta_df['exoplanet'])
plt.show()

In [None]:
# Only the last expression of a cell is echoed automatically; describe() is
# followed by another statement, so it has to be displayed explicitly or its
# result is silently discarded.
display(exoplanets_meta_df['size'].describe())
render(['Size: ~250Mb per explanet', 'Correlation?: name correlate with created timestamp', 'Size Bins: 242-244Mb, 250-252Mb'], 'data-insight')

# 🌒 ADC Info 🌓

In [None]:
test_adc_info.head(5)

In [None]:
train_adc_info.describe()

In [None]:
render(['Exoplanets:673', 'Sensors: FGS1 & CH0', 'Stars: 2', 'Sensor Data: Gain & Offsets'], 'data-insight')

In [None]:
# Treat infinities as missing before plotting.
train_adc_info.replace([np.inf, -np.inf], np.nan, inplace=True)
plt.figure(figsize=(20, 10))

# (column, x-axis label, number of bins, colour) for each panel of the 2x2 grid.
adc_panels = [
    ('FGS1_adc_offset', 'FGS1 Offset', 30, 'teal'),
    ('FGS1_adc_gain', 'FGS1 Gain', 15, 'orange'),
    ('AIRS-CH0_adc_offset', 'CH0 Offset', 30, 'purple'),
    ('AIRS-CH0_adc_gain', 'CH0 Gain', 15, 'green'),
]

for position, (column, axis_label, n_bins, colour) in enumerate(adc_panels, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(train_adc_info[column], kde=True, bins=n_bins, color=colour)
    plt.xlabel(axis_label)
    plt.ylabel('freq')

plt.show()

In [None]:
render(['FGS1 Offsets: -325 to -425', 'CH0 Offsets: -700 to -1000', 'FGS1 Gains: 5 values', 'CH0 Gains: 2 values'], 'data-insight')

# 🌓 Train Labels 🌔

In [None]:
train_labels.shape

In [None]:
train_labels.head(5)

In [None]:
train_labels.describe()

In [None]:
%%time
plt.figure(figsize=(20, 6))
sns.histplot(train_labels['wl_1'], bins=30, kde=True)
plt.title('wl_1')
plt.ylabel('freq')
plt.show()

plt.figure(figsize=(20, 6))
sns.histplot(train_labels['wl_183'], bins=30, kde=True)
plt.title('wl_183')
plt.ylabel('freq')
plt.show()

In [None]:
plt.figure(figsize=(16, 32))
# Pairwise correlation between all target columns (names containing 'wl_').
correlation_matrix = train_labels[[col for col in train_labels.columns if 'wl_' in col]].corr()
# Cell annotations are left off: with one row/column per target (wl_1 to
# wl_283 in this notebook) the matrix has tens of thousands of cells, so
# per-cell text overlaps into noise and dominates the rendering time. The
# colour scale alone carries the structure.
sns.heatmap(correlation_matrix, cmap='coolwarm')
plt.title('Corr Wavelengths')
plt.show()

In [None]:
render(['is targets too correlate each other?: each wl well correlate with few wl itself'], 'data-insight')

In [None]:
from sklearn.manifold import TSNE

# Embed the target vectors (every column except planet_id) into 2-D.
X = train_labels.drop('planet_id', axis=1)
ids = train_labels['planet_id']
tsne = TSNE(n_components=2, random_state=42)  # fixed seed for a reproducible layout
X_tsne = tsne.fit_transform(X)

# One hue per row, in row order.
palette = sns.color_palette("hsv", len(ids))

cluster_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
# Assign by position: the rows of X_tsne follow the row order of train_labels,
# so index alignment must not be allowed to reorder or blank out the ids.
cluster_df['planet_id'] = ids.to_numpy()
plt.figure(figsize=(20, 50))
plt.scatter(cluster_df['TSNE1'], cluster_df['TSNE2'], alpha=0.6, label='planet_id', color=palette)
plt.show()

In [None]:
render(['Exoplanets: is exoplanets making sign curve as per the wavelenghts? - pattern clusters'], 'data-insight')

In [None]:
# Compute the summary once and reuse it for both statistics.
label_summary = train_labels.describe()
# The first column is skipped - presumably planet_id; verify against the CSV layout.
mean_values = label_summary.loc['mean'].values[1:]
std_values = label_summary.loc['std'].values[1:]
unique_means = sorted({round(value, 5) for value in mean_values})
unique_std = sorted({round(value, 5) for value in std_values})
render(['Targets: wl_1 to wl_283', f'Means: {unique_means}', f'Std: {unique_std}'], 'data-insight')

# 🌔 Wavelengths & Axis Info 🌕

In [None]:
wavelengths.head(5)

In [None]:
wavelengths.T.describe()

In [None]:
# Step between consecutive wavelengths, taken from the first column of the
# transposed table. diff() always yields NaN for the first element; it is
# dropped because NaN compares unequal to everything (including itself), which
# makes both set() de-duplication and sorted() ordering unreliable.
wavelength_steps = wavelengths.T.iloc[:, 0].diff().dropna().to_numpy()
wavelengths_diff = sorted({round(step, 4) for step in wavelength_steps})
render(['wavelengths: 283', 'min wavelenght: 0.705', 'max wavelenght: 3.895036', f'wavelenght diff: {wavelengths_diff}'], 'data-insight')

In [None]:
axis_info.head(5)

In [None]:
axis_info.shape

In [None]:
axis_info.describe()

In [None]:
plt.figure(figsize=(20, 5))

# Both time axes plotted against the row index.
plt.plot(axis_info['AIRS-CH0-axis0-h'], label='AIRS-CH0-axis0-h', color='blue')
plt.plot(axis_info['FGS1-axis0-h'], label='FGS1-axis0-h', color='red')
# The labels above are only shown when a legend is drawn; without it the two
# curves cannot be told apart.
plt.legend()
plt.show()

In [None]:
render(["Axis: AIRS-CH0-axis0-h, AIRS-CH0-axis2-um, AIRS-CH0-integration_time, FGS1-axis0-h", "Total CHO: 11_250","Total FGS1: 135_000", "Both FGS1 & CHO: Start at 0 and ends at ~7.5"], "data-insight")

# 🌕 FGS1 & CH0 Signal 🌖

In [None]:
render("FGS1 Signal")
fgs1_signal_path = f'{comp_dir}/train/100468857/FGS1_signal.parquet'
# Load the FGS1 signal of one sample planet; infinities become NaN.
fgs1_signal_df = pd.read_parquet(fgs1_signal_path).replace([np.inf, -np.inf], np.nan)

fgs1_signal_df.head()

In [None]:
fgs1_signal_df.describe()

In [None]:
# View the first row of the signal table as a 32x32 image.
first_fgs1_row = fgs1_signal_df.iloc[0, :]
fgs1_image = first_fgs1_row.values.reshape(32, 32)
fig, ax = plt.subplots(figsize=(10, 10))
fgs1_artist = ax.imshow(fgs1_image, cmap='inferno')
fig.colorbar(fgs1_artist, ax=ax)
plt.show()

In [None]:
render("CH0 Signal")
ch0_signal_path = f'{comp_dir}/train/100468857/AIRS-CH0_signal.parquet'
# Load the AIRS-CH0 signal of one sample planet; infinities become NaN.
ch0_signal_df = pd.read_parquet(ch0_signal_path).replace([np.inf, -np.inf], np.nan)

ch0_signal_df.head()

In [None]:
ch0_signal_df.describe()

In [None]:
# View the first row of the signal table as a 32x356 image.
first_ch0_row = ch0_signal_df.iloc[0, :]
ch0_image = first_ch0_row.values.reshape(32, 356)

fig, ax = plt.subplots(figsize=(20, 5))
ch0_artist = ax.imshow(ch0_image, cmap='viridis')
fig.colorbar(ch0_artist, ax=ax)
plt.show()

# 🌖 Calibrations 🌗

In [None]:
render("CH0 Dark")
ch0_dark_path = f'{comp_dir}/train/100468857/AIRS-CH0_calibration/dark.parquet'
# Load the dark calibration table; infinities become NaN.
ch0_dark_df = pd.read_parquet(ch0_dark_path).replace([np.inf, -np.inf], np.nan)

ch0_dark_df.head()

In [None]:
ch0_dark_df.describe()

In [None]:
render("CH0 Dead")
ch0_dead_path = f'{comp_dir}/train/100468857/AIRS-CH0_calibration/dead.parquet'
# Load the dead-pixel calibration table; infinities become NaN.
ch0_dead_df = pd.read_parquet(ch0_dead_path).replace([np.inf, -np.inf], np.nan)

# Shape, first rows and summary statistics, in that order.
display(ch0_dead_df.shape, ch0_dead_df.head(), ch0_dead_df.describe())

# Show the first row of the table as a 1x356 strip.
cho_dead_image = ch0_dead_df.iloc[0, :].values.reshape(1, 356)

fig, ax = plt.subplots(figsize=(12, 6))
dead_artist = ax.imshow(cho_dead_image, cmap='cividis')
fig.colorbar(dead_artist, ax=ax)
plt.show()

In [None]:
# Label fixed: this cell inspects flat.parquet, not the dead-pixel map.
render("CH0 Flat")
ch0_flat_path = f'{comp_dir}/train/100468857/AIRS-CH0_calibration/flat.parquet'
ch0_flat_df = pd.read_parquet(ch0_flat_path)
# Treat infinities as missing values.
ch0_flat_df.replace([np.inf, -np.inf], np.nan, inplace=True)
display(ch0_flat_df.shape)
display(ch0_flat_df.head())

display(ch0_flat_df.describe())

# Show the first row of the table as a 1x356 strip.
cho_flat_image = ch0_flat_df.iloc[0, :].values.reshape(1, 356)

plt.figure(figsize=(12, 6))
plt.imshow(cho_flat_image, cmap='cividis')
plt.colorbar()
plt.show()

In [None]:
# Label fixed: this cell inspects linear_corr.parquet, not the dead-pixel map.
render("CH0 Linear Corr")
ch0_corr_path = f'{comp_dir}/train/100468857/AIRS-CH0_calibration/linear_corr.parquet'
ch0_corr_df = pd.read_parquet(ch0_corr_path)
# Treat infinities as missing values.
ch0_corr_df.replace([np.inf, -np.inf], np.nan, inplace=True)
display(ch0_corr_df.shape)
display(ch0_corr_df.head())

display(ch0_corr_df.describe())

# Show the first row of the table as a 1x356 strip.
cho_corr_image = ch0_corr_df.iloc[0, :].values.reshape(1, 356)

plt.figure(figsize=(12, 6))
plt.imshow(cho_corr_image, cmap='cividis')
plt.colorbar()
plt.show()

In [None]:
# Label fixed: this cell inspects read.parquet, not the dead-pixel map.
render("CH0 Read")
ch0_read_path = f'{comp_dir}/train/100468857/AIRS-CH0_calibration/read.parquet'
ch0_read_df = pd.read_parquet(ch0_read_path)
# Treat infinities as missing values.
ch0_read_df.replace([np.inf, -np.inf], np.nan, inplace=True)
display(ch0_read_df.shape)
display(ch0_read_df.head())

display(ch0_read_df.describe())

# Show the first row of the table as a 1x356 strip.
cho_read_image = ch0_read_df.iloc[0, :].values.reshape(1, 356)

plt.figure(figsize=(12, 6))
plt.imshow(cho_read_image, cmap='cividis')
plt.colorbar()
plt.show()

# 🌗 Stars 🌘

In [None]:
# Turn the star ids into readable 'Star <id>' labels for the plots below.
# Values that already carry the prefix are left untouched, so re-running this
# cell does not produce 'Star Star 0'.
train_adc_info['star'] = train_adc_info['star'].apply(
    lambda x: x if isinstance(x, str) and x.startswith('Star ') else f'Star {x}'
)

In [None]:
train_adc_info.groupby('star')['planet_id'].count()

In [None]:
# By this point the 'star' column holds the string labels 'Star 0' / 'Star 1'
# (it was relabelled with f'Star {x}' earlier in this section). Comparing
# against the integers 0 and 1 matches nothing and always reports zero common
# planets, so compare against the string labels instead.
star_0_planets = set(train_adc_info.loc[train_adc_info['star'] == 'Star 0', 'planet_id'])
star_1_planets = set(train_adc_info.loc[train_adc_info['star'] == 'Star 1', 'planet_id'])

"Total common planets between 2 stars :", len(star_0_planets.intersection(star_1_planets))

In [None]:
# FGS1 ADC offset distribution, split by star.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(data=train_adc_info, x='FGS1_adc_offset', hue='star', ax=ax)
ax.set_xlabel('FGS1_adc_offset')
plt.show()

In [None]:
# FGS1 ADC offset per star as box plots.
fig, ax = plt.subplots(figsize=(20, 5))
sns.boxplot(data=train_adc_info, x='star', y='FGS1_adc_offset', ax=ax)
ax.set_xlabel('')
ax.set_ylabel('FGS1_adc_offset')
plt.show()

In [None]:
# FGS1 ADC gain distribution, split by star.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(data=train_adc_info, x='FGS1_adc_gain', hue='star', ax=ax)
ax.set_xlabel('FGS1_adc_gain')
plt.show()

In [None]:
# FGS1 ADC gain per star as box plots.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=train_adc_info, x='star', y='FGS1_adc_gain', ax=ax)
ax.set_xlabel('')
plt.show()

In [None]:
# AIRS-CH0 ADC offset distribution, split by star.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(data=train_adc_info, x='AIRS-CH0_adc_offset', hue='star', ax=ax)
ax.set_xlabel('AIRS-CH0_adc_offset')
plt.show()

In [None]:
# AIRS-CH0 ADC offset per star as box plots.
fig, ax = plt.subplots(figsize=(20, 5))
sns.boxplot(data=train_adc_info, x='star', y='AIRS-CH0_adc_offset', ax=ax)
ax.set_xlabel('')
plt.show()

In [None]:
# AIRS-CH0 ADC gain distribution, split by star.
fig, ax = plt.subplots(figsize=(20, 5))
sns.histplot(data=train_adc_info, x='AIRS-CH0_adc_gain', hue='star', ax=ax)
ax.set_xlabel('AIRS-CH0_adc_gain')
plt.show()

In [None]:
# AIRS-CH0 ADC gain per star as box plots.
fig, ax = plt.subplots(figsize=(20, 20))
sns.boxplot(data=train_adc_info, x='star', y='AIRS-CH0_adc_gain', ax=ax)
ax.set_xlabel('')
plt.show()

# 🌑 🌒 🌓 🌔 End of EDA 🌕 🌖 🌗 🌘 🌑