In [1]:
#Importing necessary libraries
import tensorflow as tf 
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import tensorflow_hub as hub
import os
from tensorflow import keras
from keras.preprocessing import image
from matplotlib.pyplot import imread
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, classification_report , accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
#Mounting Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
#Loading the train dataset
df_train = pd.read_csv('/content/gdrive/MyDrive/train_NoNTTqq.csv')
print(df_train)

            id  ultraviolet_filter  green_filter  red_filter  \
0            1            17.44385      15.71196    16.14848   
1            2            22.02806      24.01481    21.16334   
2            3            23.07242      21.79252    20.51945   
3            4            23.45985      23.41583    20.36645   
4            5            23.89627      23.18005    21.12911   
...        ...                 ...           ...         ...   
134906  134907            19.24538      18.80673    16.41091   
134907  134908            23.41124      22.59072    22.50731   
134908  134909            21.76064      20.16531    20.07795   
134909  134910            18.55473      17.70518    16.67601   
134910  134911            22.07739      20.55631    20.97874   

        near_infrared_filter       alpha      delta  redshift  stellar  
0                  15.647619  158.167937  29.746275  0.094857        1  
1                  20.214615  145.916931  38.083063  0.361631        1  
2           

In [4]:
#Dropping the id column from the train dataframe
df_train.drop(columns='id', inplace=True)

In [5]:
#Replacing each value of stellar column to be in line with SparseCategoricalCrossentropy
df_train['stellar'].replace({
    1:0,
    2:1,
    3:2
}, inplace=True)

In [6]:
#Loading the test dataset
df_test = pd.read_csv('/content/gdrive/MyDrive/test_SxgqOdc.csv')
df_test

Unnamed: 0,id,ultraviolet_filter,green_filter,red_filter,near_infrared_filter,alpha,delta,redshift
0,134912,19.63144,17.88840,16.45195,16.620047,336.501421,2.415351,0.000290
1,134913,25.74819,22.10760,19.97196,19.179141,210.286161,-1.336858,0.513781
2,134914,22.38767,20.85446,20.75418,20.073627,262.914770,46.025803,0.985297
3,134915,22.03212,23.15455,21.86528,20.746343,146.381732,38.368224,0.712391
4,134916,25.01815,24.53933,22.03248,22.377272,246.456081,30.515558,0.495552
...,...,...,...,...,...,...,...,...
89936,224848,22.41481,21.55370,19.60544,18.617794,1.108111,30.528644,0.000479
89937,224849,25.69069,22.74517,21.85320,21.419104,155.468306,37.207024,0.649914
89938,224850,20.79857,19.29775,16.87349,16.802314,186.069454,23.731177,0.370957
89939,224851,22.15261,20.10221,19.14552,18.521452,193.814800,29.493972,0.000959


In [7]:
#Storing the columns of train dataframe other than stellar in X and storing the stellar column of train dataframe in Y
X = df_train.drop('stellar', axis='columns')
Y = df_train.stellar

In [8]:
#Train test split
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X,Y,test_size=0.00001)

In [9]:
#Dropping the id colum of test dataframe
df_test_p = df_test.drop('id', axis='columns')
df_test_p

Unnamed: 0,ultraviolet_filter,green_filter,red_filter,near_infrared_filter,alpha,delta,redshift
0,19.63144,17.88840,16.45195,16.620047,336.501421,2.415351,0.000290
1,25.74819,22.10760,19.97196,19.179141,210.286161,-1.336858,0.513781
2,22.38767,20.85446,20.75418,20.073627,262.914770,46.025803,0.985297
3,22.03212,23.15455,21.86528,20.746343,146.381732,38.368224,0.712391
4,25.01815,24.53933,22.03248,22.377272,246.456081,30.515558,0.495552
...,...,...,...,...,...,...,...
89936,22.41481,21.55370,19.60544,18.617794,1.108111,30.528644,0.000479
89937,25.69069,22.74517,21.85320,21.419104,155.468306,37.207024,0.649914
89938,20.79857,19.29775,16.87349,16.802314,186.069454,23.731177,0.370957
89939,22.15261,20.10221,19.14552,18.521452,193.814800,29.493972,0.000959


In [10]:
#Standard Scaling
from sklearn.preprocessing import StandardScaler    
st_x= StandardScaler()  
x_train_scaled= st_x.fit_transform(x_train)    
x_test_scaled= st_x.transform(df_test_p)
x_val_scaled = st_x.transform(x_val)

In [11]:
#RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(n_estimators=150)
model1.fit(x_train_scaled, y_train) 
model1.score(x_val_scaled, y_val) 

1.0

In [12]:
#Storing the predicted probabilities obtained using RandomForest Classifier for test data
pred_prob_test_rf = np.array(model1.predict_proba(x_test_scaled))
print(pred_prob_test_rf)

[[0.22       0.78       0.        ]
 [0.99333333 0.00666667 0.        ]
 [0.35333333 0.01333333 0.63333333]
 ...
 [0.96666667 0.02666667 0.00666667]
 [0.         1.         0.        ]
 [0.98       0.         0.02      ]]


In [13]:
#Adaboost Classifier
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier(n_estimators=150, base_estimator= None,learning_rate=0.1, random_state = 1)
adaboost.fit(x_train_scaled,y_train)

print(adaboost.score(x_val_scaled, y_val))

1.0


In [14]:
#Storing the predicted probabilities obtained using Adaboost Classifier for test data
pred_prob_test_ada = np.array(adaboost.predict_proba(x_test_scaled))
print(pred_prob_test_ada)

[[0.32664471 0.39799928 0.27535601]
 [0.40629524 0.31226769 0.28143707]
 [0.35767788 0.28638941 0.3559327 ]
 ...
 [0.39292808 0.30797366 0.29909826]
 [0.31345414 0.4008653  0.28568057]
 [0.39364782 0.30710555 0.29924663]]


In [15]:
#LGBM Classifier
import lightgbm as ltb
model2 = ltb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)
model2.fit(x_train_scaled,y_train)
model2.score(x_val_scaled, y_val)

1.0

In [16]:
#Storing the predicted probabilities obtained using LGBM Classifier for test data
pred_prob_test_lgbm = np.array(model2.predict_proba(x_test_scaled))
print(pred_prob_test_lgbm)

[[0.09556193 0.90211772 0.00232035]
 [0.98986189 0.00832446 0.00181365]
 [0.25528232 0.01595383 0.72876385]
 ...
 [0.968851   0.02014417 0.01100483]
 [0.00582342 0.99279998 0.0013766 ]
 [0.98854122 0.0047164  0.00674238]]


In [17]:
#Installing the catboost library
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1-cp37-none-manylinux1_x86_64.whl (76.8 MB)
[K     |████████████████████████████████| 76.8 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1


In [18]:
#CatBoost Classifier
import catboost as cb
model3 = cb.CatBoostClassifier()
model3.fit(x_train_scaled,y_train)
model3.score(x_val_scaled, y_val)

Learning rate set to 0.101614
0:	learn: 0.9390627	total: 113ms	remaining: 1m 52s
1:	learn: 0.8203587	total: 173ms	remaining: 1m 26s
2:	learn: 0.7268789	total: 241ms	remaining: 1m 20s
3:	learn: 0.6526309	total: 302ms	remaining: 1m 15s
4:	learn: 0.5903536	total: 362ms	remaining: 1m 12s
5:	learn: 0.5391234	total: 421ms	remaining: 1m 9s
6:	learn: 0.4955855	total: 492ms	remaining: 1m 9s
7:	learn: 0.4591122	total: 552ms	remaining: 1m 8s
8:	learn: 0.4274576	total: 630ms	remaining: 1m 9s
9:	learn: 0.4000714	total: 689ms	remaining: 1m 8s
10:	learn: 0.3769582	total: 761ms	remaining: 1m 8s
11:	learn: 0.3568394	total: 822ms	remaining: 1m 7s
12:	learn: 0.3388272	total: 883ms	remaining: 1m 7s
13:	learn: 0.3234323	total: 944ms	remaining: 1m 6s
14:	learn: 0.3099223	total: 1.01s	remaining: 1m 6s
15:	learn: 0.2981292	total: 1.07s	remaining: 1m 5s
16:	learn: 0.2878836	total: 1.13s	remaining: 1m 5s
17:	learn: 0.2789289	total: 1.25s	remaining: 1m 8s
18:	learn: 0.2708382	total: 1.38s	remaining: 1m 11s
19:	l

1.0

In [19]:
#Storing the predicted probabilities obtained using CatBoost Classifier for test data
pred_prob_test_cat = np.array(model3.predict_proba(x_test_scaled))
print(pred_prob_test_cat)

[[8.70985822e-02 9.10666336e-01 2.23508135e-03]
 [9.92229636e-01 7.09587824e-03 6.74485733e-04]
 [1.90510033e-01 1.39359317e-02 7.95554035e-01]
 ...
 [9.68434976e-01 1.95496934e-02 1.20153305e-02]
 [4.69348071e-03 9.94569890e-01 7.36629674e-04]
 [9.95532230e-01 1.19074301e-03 3.27702708e-03]]


In [20]:
#GradientBoosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=100, max_features=2, max_depth=2, random_state=0)
gb_clf.fit(x_train_scaled,y_train)
gb_clf.score(x_val_scaled, y_val)

1.0

In [21]:
#Storing the predicted probabilities obtained using GradientBoosting Classifier for test data
pred_prob_test_gb = np.array(gb_clf.predict_proba(x_test_scaled))
print(pred_prob_test_gb)

[[0.06921404 0.92023109 0.01055487]
 [0.98181162 0.01262198 0.00556639]
 [0.34209762 0.04653027 0.61137211]
 ...
 [0.96016945 0.02973871 0.01009184]
 [0.01360844 0.98141334 0.00497822]
 [0.97767409 0.01117307 0.01115284]]


In [22]:
#ExtraTrees Classifier
from sklearn.ensemble import ExtraTreesClassifier
etf = ExtraTreesClassifier(n_estimators = 100,criterion ='entropy')
etf.fit(x_train_scaled,y_train)
etf.score(x_val_scaled, y_val)

1.0

In [23]:
#Storing the predicted probabilities obtained using ExtraTrees Classifier for test data
pred_prob_test_etf = np.array(etf.predict_proba(x_test_scaled))
print(pred_prob_test_etf)

[[0.31 0.69 0.  ]
 [0.99 0.   0.01]
 [0.34 0.03 0.63]
 ...
 [0.94 0.03 0.03]
 [0.   1.   0.  ]
 [0.97 0.01 0.02]]


In [24]:
n_epochs_per_model = 40
lr_max = 0.001

In [None]:
from keras.callbacks import Callback
from keras import backend
from keras.models import load_model
from numpy import pi
import math

class SnapshotEnsemble(Callback):
    
    __snapshot_name_fmt = "snapshot_%d.hdf5"
    
    def __init__(self, n_models, n_epochs_per_model, lr_max, verbose=1):
        
        # n_models: number of snapshots
        # n_epochs_per_model: epochs per snapshot
        # lr_max: maximum learning rate
        
        self.n_epochs_per_model = n_epochs_per_model
        self.n_models = n_models
        self.n_epochs_total = self.n_models * self.n_epochs_per_model
        self.lr_max = lr_max
        self.verbose = verbose
        self.lrs = []

    def cosine_annealing(self, epoch):
        cos_inner = (math.pi * (epoch % self.n_epochs_per_model)) / self.n_epochs_per_model
        return self.lr_max / 2 * (math.cos(cos_inner) + 1)

    def on_epoch_begin(self, epoch, logs={}):
        lr = self.cosine_annealing(epoch)
        backend.set_value(self.model.optimizer.lr, lr)
        self.lrs.append(lr)

    def on_epoch_end(self, epoch, logs={}):
        if (epoch + 1) % self.n_epochs_per_model == 0:
            filename = self.__snapshot_name_fmt % ((epoch + 1) // self.n_epochs_per_model)
            self.model.save(filename)
            if self.verbose:
                print('Epoch %d: snapshot saved to %s' % (epoch, filename))
                
    def load_ensemble(self):
        models = []
        for i in range(self.n_models):
            models.append(load_model(self.__snapshot_name_fmt % (i + 1)))
        return models

In [26]:
Snapshot_Ensemble_callback = SnapshotEnsemble(n_models=3, n_epochs_per_model=20, lr_max=0.01)

In [27]:
#ANN
import tensorflow as tf
from tensorflow import keras


model_ann = keras.Sequential([
    keras.layers.Dense(7, input_shape=(7,), activation='relu'),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(80, activation='relu'), 
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(40, activation='relu'),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(3, activation='softmax')
])

model_ann.compile(optimizer='adam',
              loss='SparseCategoricalCrossentropy',
              metrics=['accuracy'])

model_ann.fit(x_train_scaled, y_train, callbacks = [Snapshot_Ensemble_callback],validation_data=(x_val_scaled, y_val), epochs= 40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fb240f73f90>

In [28]:
#Storing the predicted probabilities obtained using ANN for test data
pred_prob_test_ann = model_ann.predict(x_test_scaled)
print(pred_prob_test_ann)

[[6.2942110e-02 9.3620700e-01 8.5094135e-04]
 [9.8847586e-01 1.0635775e-02 8.8825234e-04]
 [3.1274137e-01 7.7202404e-03 6.7953837e-01]
 ...
 [9.5337498e-01 4.2527270e-02 4.0977774e-03]
 [2.1943133e-02 9.7703892e-01 1.0179415e-03]
 [9.8134065e-01 1.0600814e-02 8.0584576e-03]]


In [29]:
#Ensembling with experimental weights
pred_ensem_test_final = (4*pred_prob_test_lgbm + 3*pred_prob_test_rf + 20*pred_prob_test_cat + 2*pred_prob_test_etf + 5*pred_prob_test_gb + pred_prob_test_ada+3*pred_prob_test_ann)/38
print(pred_ensem_test_final)

[[0.11225686 0.87762035 0.0101228 ]
 [0.97486372 0.01585527 0.009281  ]
 [0.25204526 0.02591409 0.72204065]
 ...
 [0.94942116 0.03146894 0.01910991]
 [0.01485498 0.97635912 0.00878591]
 [0.9729171  0.01203826 0.01504464]]


In [30]:
pred_ensem_test = []
for i in pred_ensem_test_final:
  pred_ensem_test.append(np.argmax(i)+1)
pred_ensem_test = np.array(pred_ensem_test)
print(pred_ensem_test)

[2 1 3 ... 1 2 1]


In [31]:
#Converting to dataframe
dfx = pd.DataFrame(list(zip(pred_ensem_test)),
               columns = ['stellar'])

In [32]:
dfx.to_csv('SOS_ensemble_final.csv')