# Importing modules

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from nca_implementation.torchnca import nca

# Loading data

In [2]:
# Rows with a fidelity_v2 value below this threshold are discarded
MIN_FIDELITY = 0.7

# Select only relevant columns
SELECTED_COLS = ['ra', 'dec', 'pmra', 'pmdec', 'r_med_geo', 'fidelity_v2', 'cluster_flag']

# Combined cluster candidate labels used after joining both catalogues:
#   0 - M67_NOCLUSTER
#   1 - M67_CLUSTER
#   2 - M44_NOCLUSTER
#   3 - M44_CLUSTER
# Each mapping translates a file's own 0/1 cluster_flag into the combined label.
M67_LABELS = {0: 0, 1: 1}
M44_LABELS = {0: 2, 1: 3}


def load_candidates(path, label_map):
    """Load one candidate pickle and prepare it for joining.

    Parameters
    ----------
    path : str
        Location of the pickled DataFrame.
    label_map : dict
        Maps the file's original ``cluster_flag`` values to the combined
        labels. Values that are not keys of the mapping are left unchanged.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only ``SELECTED_COLS`` for the rows with
        ``fidelity_v2 >= MIN_FIDELITY``, with ``cluster_flag`` relabelled.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    candidates = pd.read_pickle(path)

    # Filter rows and select columns in one step, then take an explicit copy so
    # the relabelling below never writes into a slice of the loaded frame.
    keep = candidates['fidelity_v2'] >= MIN_FIDELITY
    candidates = candidates.loc[keep, SELECTED_COLS].copy()

    # replace() applies the whole mapping at once, so the result does not
    # depend on the order in which the individual labels are rewritten.
    candidates['cluster_flag'] = candidates['cluster_flag'].replace(label_map)
    return candidates


# Read, filter and relabel both catalogues
df_m67 = load_candidates('./data/oc_Messier_67_candidates.pickle', M67_LABELS)
df_m44 = load_candidates('./data/oc_Messier_44_candidates.pickle', M44_LABELS)

# Join the data (the original row labels of both frames are kept, so index
# labels may repeat in the combined frame)
df = pd.concat([df_m67, df_m44])

In [3]:
# Show dtypes, non-null counts and memory usage of the combined frame
banner = "-------- COLUMN INFORMATION --------"
print(banner)
df.info()

-------- COLUMN INFORMATION --------
<class 'pandas.core.frame.DataFrame'>
Index: 171955 entries, 1 to 110831
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ra            171955 non-null  float64
 1   dec           171955 non-null  float64
 2   pmra          171955 non-null  float64
 3   pmdec         171955 non-null  float64
 4   r_med_geo     171955 non-null  float32
 5   fidelity_v2   171955 non-null  float32
 6   cluster_flag  171955 non-null  int64  
dtypes: float32(2), float64(4), int64(1)
memory usage: 9.2 MB


In [4]:
# Number of rows carrying each combined cluster label
print(
    "-------- CLUSTER CANDIDATE INFORMATION --------",
    df['cluster_flag'].value_counts(),
    sep="\n",
)

-------- CLUSTER CANDIDATE INFORMATION --------
cluster_flag
0    87711
2    83565
3      679
Name: count, dtype: int64


# Train-test split

In [5]:
# Separate the frame into a float feature tensor (X) and a label tensor (y)
feature_frame = df.drop(columns='cluster_flag')
X = torch.tensor(feature_frame.to_numpy(), dtype=torch.float)
y = torch.tensor(df['cluster_flag'].to_numpy())

# Hold out 20% of the rows for testing; stratifying on y keeps the label
# proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Shapes of the train/test features and labels, in that order
print(*(split.shape for split in (X_train, y_train, X_test, y_test)))

torch.Size([137564, 6]) torch.Size([137564]) torch.Size([34391, 6]) torch.Size([34391])


In [7]:
def report_label_percentages(split_name, labels):
    """Print the percentage of each distinct label in one dataset split.

    Parameters
    ----------
    split_name : str
        Name inserted into the banner line, e.g. ``"TRAIN"`` or ``"TEST"``.
    labels : torch.Tensor
        Label tensor of the split.

    Returns
    -------
    tuple
        ``(uniques, counts, total)``: the distinct labels, how often each one
        occurs, and the total number of samples as a Python number.
    """
    print(f"-------- {split_name} DATASET LABEL PERCENTAGES --------")
    uniques, counts = labels.unique(return_counts=True)
    total = torch.sum(counts).item()

    for item, count in zip(uniques, counts):
        print(f"{item}: {(count / total) * 100:.4f}%")

    print("Total Samples: " + str(total))
    return uniques, counts, total


# The same report is produced for both splits; the results keep their
# original notebook-level names.
y_train_uniques, y_train_unique_counts, y_train_total = report_label_percentages("TRAIN", y_train)

print()

y_test_uniques, y_test_unique_counts, y_test_total = report_label_percentages("TEST", y_test)

-------- TRAIN DATASET LABEL PERCENTAGES --------
0: 51.0083%
2: 48.5970%
3: 0.3947%
Total Samples: 137564

-------- TEST DATASET LABEL PERCENTAGES --------
0: 51.0075%
2: 48.5970%
3: 0.3955%
Total Samples: 34391


In [8]:
# Seed torch's global RNG so the random initialisation reported by the
# "using random init" message is repeatable on a fresh kernel run.
# NOTE(review): assumes the NCA implementation draws from torch's global RNG — confirm.
torch.manual_seed(42)

# Fit NCA on the training split with dim=2 and max_iters=500.
# NOTE(review): the recorded output stops right after "using random init";
# if training on the full training set at once exhausts memory, check whether
# train() offers a batch-size option — TODO confirm against the implementation.
nca_pipeline = nca.NCA(dim=2, max_iters=500)
nca_pipeline.train(X_train, y_train)

using random init


: 