In [None]:
!pip install autogluon --quiet

## 1. Data Loading

In this step we will load the .csv file containing the data we want to use in this example.

In [None]:
import pandas as pd

# read the raw input table from the CSV file
DATA_PATH = "Skyserver_df.csv"
df = pd.read_csv(DATA_PATH)

# preview the first rows to see what our input data looks like
df.head(5)

## 2. Basic Data Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One redshift histogram per object class, drawn side by side.
# NOTE: this cell filters on the original string labels in df['class'], so it
# must run before the label-encoding step of section 3 replaces them with integers.
class_titles = {'STAR': 'Star', 'GALAXY': 'Galaxy', 'QSO': 'QSO'}

fig, axes = plt.subplots(nrows=1, ncols=len(class_titles), figsize=(16, 4))
for ax, (class_name, title) in zip(axes, class_titles.items()):
    # histplot is the replacement for the deprecated distplot(..., kde=False)
    sns.histplot(df.loc[df['class'] == class_name, 'redshift'], bins=30, ax=ax)
    ax.set_title(title)

## 3. Data Processing

Data processing is a crucial step in building effective machine learning models. This step involves transforming raw data into a format that is suitable for analysis and modeling. It typically includes tasks such as data cleaning, normalization, feature engineering, and data augmentation.

Data processing is important because the quality of the data used to train a model can have a significant impact on the accuracy of its predictions. By cleaning and preparing the data, we can remove noise and inconsistencies, and highlight relevant patterns and features that the model can learn from.

In this step we will take the data processing routine from this notebook and transfer it into our template.

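The cell below contains the DS notebook implementation of the data processing step. It does the following:
- cleans the data by removing columns that are not needed
- label-encodes our target column - "class"
- splits the data into train and test parts and saves them as `train.csv` and `test.csv`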

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA

from pickle import dump
from sklearn.model_selection import train_test_split

# fixed seed so the train/test split (and therefore train.csv / test.csv)
# is the same on every run
RANDOM_STATE = 42

# dropping not needed columns
# (drop() returns a new frame, so the object loaded by read_csv is not mutated in place)
df = df.drop(columns=['objid', 'run', 'rerun', 'camcol', 'field', 'specobjid'])

# work on an explicit copy; a plain `df_temp = df` would only alias the same frame
df_temp = df.copy()

# encode class labels to integers
# `le` keeps the fitted mapping, so le.inverse_transform() can recover the class names
le = LabelEncoder()
y_encoded = le.fit_transform(df_temp['class'])
df_temp['class'] = y_encoded

# from here on `df` is the processed frame (columns dropped, integer-encoded 'class')
df = df_temp

# split data into train and test part
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='class'),
    df['class'],
    test_size=0.33,
    random_state=RANDOM_STATE,
)

# setting up our target column as first one
# NOTE: the training cells below pass the target to AutoGluon by name
# (label="class"), so the column position looks like a convention rather than
# a requirement - TODO confirm before relying on it.
# After these two lines X_train / X_test hold the target AND the predictor columns.
X_train.insert(0, "class", y_train)
X_test.insert(0, "class", y_test)
X_train.to_csv("train.csv", index=False)
X_test.to_csv("test.csv", index=False)

# NOTE: nothing is uploaded to S3 in this cell; only the two CSV files above are
# written locally. TODO: persist the fitted encoder `le` if predictions need to
# be mapped back to class names outside this notebook.

In [None]:
from autogluon.tabular import TabularPredictor

In [None]:
# preview only the first rows of the training frame
# (the target column "class" was inserted at position 0 in the processing cell)
X_train.head()

In [None]:
# baseline run: fit AutoGluon on the training frame with no extra options,
# predicting the "class" column
predictor = (
    TabularPredictor(label="class")
    .fit(X_train)
)

# show the leaderboard of the models from this run
predictor.leaderboard()

In [None]:
# second run: same training frame and label, now with the "high_quality" preset
# (this rebinds `predictor`, replacing the one from the previous cell)
predictor = (
    TabularPredictor(label="class")
    .fit(X_train, presets=["high_quality"])
)

# show the leaderboard of the models from this run
predictor.leaderboard()