In [1]:
%matplotlib inline

# Import Dependencies

In [2]:
# System & OS
import warnings
warnings.filterwarnings('ignore')

# Data Analysis
import numpy as np
import pandas as pd

# Data Visualization
from tqdm.notebook import tqdm

# Utility Functions
from utils import *

# Mount Storage

To start, `cd` to where the data are stored.

In [3]:
# Change the kernel's working directory to the data folder so that the relative
# file names used by later cells (e.g. 'master_file.csv') resolve there.
%cd '../data/'

/Users/zach/Documents/Python/Protostellar-Luminosity/data


# Feature Engineering

Then, read in our previous results from `master_file.csv`.

In [4]:
# Labels assigned to the data columns, in file order
column_names = [
    'Model number',
    'Class',
    'Mass ratio',
    'Timestep',
    'Inclination',
    'Wavelength (cm)',
    'Flux (erg cm^-2 s^-1)',
    'L_int (Lsun)',
]

# skiprows=1 discards the first line of the file so that the labels above are
# used instead.
# NOTE(review): presumably that first line is the file's own header row, and the
# file carries one extra leading column that pandas picks up as the row index
# -- confirm against master_file.csv.
df = pd.read_csv('master_file.csv', skiprows=1, names=column_names)
df

Unnamed: 0,Model number,Class,Mass ratio,Timestep,Inclination,Wavelength (cm),Flux (erg cm^-2 s^-1),L_int (Lsun)
0,1,0,0.034096,2,5,3.600030,1.171193e-18,0.001486
1,1,0,0.034096,2,5,2.400020,4.342741e-18,0.001486
2,1,0,0.034096,2,5,1.600013,1.734933e-17,0.001486
3,1,0,0.034096,2,5,1.300011,3.665945e-17,0.001486
4,1,0,0.034096,2,5,1.100009,6.839343e-17,0.001486
...,...,...,...,...,...,...,...,...
8993595,33,1,0.885142,379,85,0.000025,2.878049e-13,5.9644
8993596,33,1,0.885142,379,85,0.000020,1.755736e-14,5.9644
8993597,33,1,0.885142,379,85,0.000015,-1.782472e-16,5.9644
8993598,33,1,0.885142,379,85,0.000010,-1.308467e-17,5.9644


Next, perform the following feature modifications:

1. Drop the timestep and inclination columns.
2. Typecast columns with `dtype == object` to `float64` or `str`.
3. Convert wavelengths from cm to microns. $$\begin{align*}\lambda=\lambda_0\times10^4\end{align*}$$ where $\lambda_0$ denotes the initial wavelength (in cm), and $\lambda$ the corrected wavelength (in microns).
4. Further subdivide class 1 into 1a and 1b. $$\begin{align*}\text{class}=\begin{cases}0,&M<0.5\\1\text{a},&0.5\leq M<0.75\\1\text{b},&M\geq 0.75\end{cases}\end{align*}$$ where $M$ denotes the internal vs final mass ratio.
5. Compute the logged (`log10`) flux and internal luminosities. $$\begin{align*}\begin{split}F&=\log_{10}{\left(F_0\right)}\\L&=\log_{10}{\left(L_0\right)}\end{split}\end{align*}$$where $F_0$ and $F$ represent the flux values in linear and `log10` space, respectively (and equivalently for $L_0$ and $L$).
6. Replace `Inf` values with `NaN`.
7. Drop rows with internal luminosity $L_0<0.1\,L_\odot$ (the cut is applied to the linear value $L_0$, not to the logged value $L$).

In [5]:
# Remember how many rows we start with so the summary printed below is computed
# from the data actually loaded (previously a hard-coded 8993600).
n_rows_before = len(df)

# Drop irrelevant columns
df = df.drop(columns=['Timestep', 'Inclination'])

# Typecast objects to float64 or str (vectorised cast instead of a per-element lambda)
df = df.astype({'Mass ratio': float, 'L_int (Lsun)': float, 'Class': str})

# Convert wavelengths to microns (1 cm = 1e4 microns), then relabel the column
df['Wavelength (cm)'] = df['Wavelength (cm)'] * 10000.0
df = df.rename(columns={'Wavelength (cm)': 'Wavelength (microns)'})

# Subdivide class 1 into 1a and 1b; rows with mass ratio < 0.5 keep their original label
mass_ratio = df['Mass ratio']
df.loc[(mass_ratio >= 0.5) & (mass_ratio < 0.75), 'Class'] = '1a'
df.loc[mass_ratio >= 0.75, 'Class'] = '1b'

# Add columns with logged data.
# log10 yields NaN for negative inputs and -Inf for zero; those are expected here
# and handled below, so silence the corresponding floating-point warnings locally.
with np.errstate(divide='ignore', invalid='ignore'):
    df['log(Flux)'] = np.log10(df['Flux (erg cm^-2 s^-1)'].values)
    df['log(L_int)'] = np.log10(df['L_int (Lsun)'].values)

# Replace Inf values with NaN
df = df.replace([np.inf, -np.inf], np.nan)

# Drop rows with L_int < 0.1 (cut applied to the linear luminosity).
# .copy() makes the result an independent frame, so later modifications do not
# act on a view of the unfiltered data.
df = df[df['L_int (Lsun)'] >= 0.1].copy()

# Print number of rows dropped from original dataframe
n_rows_dropped = n_rows_before - len(df)
print(f'Number of rows dropped: {n_rows_dropped} ({n_rows_dropped / n_rows_before * 100:.2f}%)')
df

Number of rows dropped: 1097100 (12.20%)


Unnamed: 0,Model number,Class,Mass ratio,Wavelength (microns),Flux (erg cm^-2 s^-1),L_int (Lsun),log(Flux),log(L_int)
15300,1,0,0.360994,36000.295167,1.707756e-19,2.0869,-18.767574,0.319502
15301,1,0,0.360994,24000.196778,1.187698e-18,2.0869,-17.925294,0.319502
15302,1,0,0.360994,16000.131185,8.241456e-18,2.0869,-17.083996,0.319502
15303,1,0,0.360994,13000.106718,2.219241e-17,2.0869,-16.653795,0.319502
15304,1,0,0.360994,11000.090337,4.919182e-17,2.0869,-16.308107,0.319502
...,...,...,...,...,...,...,...,...
8993595,33,1b,0.885142,0.250002,2.878049e-13,5.9644,-12.540902,0.775567
8993596,33,1b,0.885142,0.200002,1.755736e-14,5.9644,-13.755541,0.775567
8993597,33,1b,0.885142,0.150001,-1.782472e-16,5.9644,,0.775567
8993598,33,1b,0.885142,0.100001,-1.308467e-17,5.9644,,0.775567


# Handle Missing Values

Since non-positive flux values are undefined in `log10` space, there are now rows with missing values (as a consequence of steps 5 & 6) that we must deal with accordingly.

In [6]:
# Print percentage of missing values in each column
# NOTE(review): `check_nan` is not defined in this notebook; presumably it is
# provided by the wildcard `from utils import *` -- confirm.
check_nan(df)

Percentage of missing values in each column:

log(Flux)                1.78
Model number             0.00
Class                    0.00
Mass ratio               0.00
Wavelength (microns)     0.00
Flux (erg cm^-2 s^-1)    0.00
L_int (Lsun)             0.00
log(L_int)               0.00
dtype: float64



Notice that relatively few values are missing w.r.t. the full population (about 1.78% of rows, all in the `log(Flux)` column). Hence, we opt to simply drop the rows containing `NaN` values.

In [7]:
# Drop rows containing missing values.
# Rebinding `df` (rather than inplace=True) avoids mutating a frame that was
# produced by boolean filtering and may therefore be a view of the original data.
df = df.dropna(axis=0)

# Data Analysis

Finally, we execute the following analysis steps:

1. Visualize flux as a function of luminosity in `log10` space.
2. Compute the associated linear coefficients (i.e. slope, intercept, and corresponding uncertainties).
3. Evaluate the goodness-of-fit (in both linear and `log10` space) via the reduced chi-squared statistic. $$\begin{align*}\chi_\nu^2=\frac{\chi^2}{\nu}\end{align*}$$ where $\chi^2$ represents the weighted sum of squared deviations, and $\nu$ the number of degrees of freedom. Further reading available [here](https://en.wikipedia.org/wiki/Reduced_chi-squared_statistic).
4. Calculate the correlation coefficients (in both linear and `log10` space) between flux and luminosity. $$\begin{align*}r=\frac{\sum_i(L_i-\bar{L})(F_i-\bar{F})}{\sqrt{\sum_i(L_i-\bar{L})^2\sum_i(F_i-\bar{F})^2}}\end{align*}$$ where $\bar{L}$ and $\bar{F}$ denote the mean luminosity and flux values, respectively. Further reading available [here](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).

Note that these steps are repeated for each of the 100 wavelengths where data are available, and at each wavelength for 4 class groupings (0, 1a, 1b, and the combined 0 & 1 sample). Additional master plots containing data at all wavelengths were created as well.

In [8]:
# Write column headers for lin_coef.csv & corr_coef.csv
# Mode 'w' truncates any existing file, so each run starts from header-only files.
# The `with` statement closes both handles on exit, so no explicit close() is needed.
with open('lin_coef.csv', 'w') as coef, open('corr_coef.csv', 'w') as corr:
    coef.write('Class, Wavelength (microns), Slope, Unc_slope, Intercept, Unc_intercept\n')
    corr.write('Class, Wavelength (microns), Reduced chi-squared (linear space), Reduced chi-squared (log10 space), Correlation coefficient (linear space), Correlation coefficient (log10 space)\n')

In [9]:
# Columns handed to make_plot, in positional order:
# linear luminosity, linear flux, logged luminosity, logged flux
PLOT_COLUMNS = ['L_int (Lsun)', 'Flux (erg cm^-2 s^-1)', 'log(L_int)', 'log(Flux)']


def _plot_partition(frame, nclass, wavelength, pbar):
    """Call make_plot for one partition of the data and advance the progress bar.

    Parameters
    ----------
    frame : DataFrame holding the rows to plot; must contain PLOT_COLUMNS.
    nclass : class label forwarded to make_plot ('0', '1a', '1b' or '0 & 1').
    wavelength : wavelength value forwarded to make_plot, or 'All' for a master plot.
    pbar : progress bar that is advanced by one step after the call.

    NOTE(review): make_plot is not defined in this notebook (presumably it comes
    from `utils`); what it does with these arguments is not visible here.
    """
    make_plot(*(frame[column] for column in PLOT_COLUMNS), nclass=nclass, wavelength=wavelength)
    pbar.update(1)


# Partition data based on class
df_0 = df[df['Class'] == '0']
df_1a = df[df['Class'] == '1a']
df_1b = df[df['Class'] == '1b']

# Class label -> rows belonging to it; '0 & 1' is the full, unpartitioned sample.
# Insertion order fixes the order in which the plots are made.
frames_by_class = {'0': df_0, '1a': df_1a, '1b': df_1b, '0 & 1': df}

# Wavelengths in order of first appearance
wavelengths = df['Wavelength (microns)'].unique()

# Initialize progress bar: one step per (wavelength, class) plot plus one per master plot
with tqdm(total=len(wavelengths) * len(frames_by_class) + len(frames_by_class), leave=True) as pbar:
    for wavelength in wavelengths:
        for nclass, frame in frames_by_class.items():
            # Further partition data based on wavelength
            features = frame[frame['Wavelength (microns)'] == wavelength]
            _plot_partition(features, nclass, wavelength, pbar)
    # Master plots containing data at all wavelengths
    for nclass, frame in frames_by_class.items():
        _plot_partition(frame, nclass, 'All', pbar)

  0%|          | 0/404 [00:00<?, ?it/s]

The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript backend does not support transparency; partially transparent artists will be rendered opaque.
The PostScript back

KeyboardInterrupt: 

<Figure size 432x288 with 0 Axes>