# Notebook for data exploration and dividing the original dataset into training and test datasets.

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Load the star classification dataset from the local Data directory and
# display it.
data = pd.read_csv("./Data/star_classification.csv")
data

In [None]:
# Count how often each distinct value occurs in 'rerun_ID': in the table
# displayed above, every sample appears to have the same value for this
# column/feature, so check whether that holds for the whole dataset.
data['rerun_ID'].value_counts()

In [None]:
# Print a summary of the dataframe: its columns, the non-null count and
# data type of each column, and the memory usage.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns.
data.describe()

In [None]:
# Check, column by column, whether it contains at least one NaN value.
data.isna().any()

In [None]:
# Remove, in place, the columns that carry no information for the analysis:
#   obj_ID   - object ID number; a different identification value for each sample
#   rerun_ID - has the same value for all rows
for uninformative_column in ('obj_ID', 'rerun_ID'):
    data.drop(columns=uninformative_column, inplace=True)
data

In [None]:
# Encode the string class labels as integer categories.
map_to_num = {'GALAXY': 0, 'QSO': 1, 'STAR': 2}

# Series.map() silently converts every value that is not a key of the dict
# into NaN, so fail loudly on unexpected labels instead. This also guards
# against re-running this cell: the already-encoded 0/1/2 values are not keys
# of the mapping and would otherwise all be replaced by NaN.
unmapped_labels = set(data['class'].dropna().unique()) - set(map_to_num)
if unmapped_labels:
    raise ValueError(
        f"Cannot encode 'class': unexpected label(s) "
        f"{sorted(unmapped_labels, key=str)}; expected only {list(map_to_num)}"
    )

data['class'] = data['class'].map(map_to_num)
data

In [None]:
# Separate the label column from the feature columns and report the
# distinct labels and the names of the features.
feature_columns = data.drop(columns='class')  # everything except the label column
feature_names = feature_columns.columns
label_names = data['class'].unique()
print("label_names: ", label_names)
print("feature_names: ", feature_names)

In [None]:
# Split the dataset into model inputs and outputs:
#   X - feature vector (all feature columns)
#   y - target variable (the 'class' labels)
X, y = data[feature_names], data['class']

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target labels.
y

In [None]:
# Hold out 30% of the samples as the test set and keep the remaining 70% for
# training. random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Report the shape of each partition and persist it as CSV (without the
# index column), in the order: X_train, X_test, y_train, y_test.
split_outputs = [
    ("X_train: ", X_train, "./Data/X_train.csv"),
    ("X_test: ", X_test, "./Data/X_test.csv"),
    ("y_train: ", y_train, "./Data/y_train.csv"),
    ("y_test: ", y_test, "./Data/y_test.csv"),
]
for shape_label, partition, csv_path in split_outputs:
    print(shape_label, partition.shape)
    partition.to_csv(csv_path, index=False)