### Based on all features train logistic regression model

In [1]:
from imports import *
from params import data_path

In [2]:
# Load the data
df = pd.read_csv(data_path)

# Separate the features (X) and the target variable (y)
X = df.drop(['class'], axis=1)
y = df['class']

# Create a Lasso model for feature selection
lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, max_iter=5000)

# Perform feature selection
X = X.to_numpy()
X = StandardScaler().fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = lasso.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy * 100))
print('f1 score: %.2f' % (f1(y_test, yhat, average='weighted') * 100))

## Based on selected features

In [3]:
# Feature selection based on manual study and RandomForest classifier (see feature-selection.ipynb)
X_df = df[['u', 'g', 'i', 'z', 'redshift']]
X_df = X_df[(X_df['redshift'] > 0.001)] # most of the values are very close to zero (<0.00005)

# Remove corresponding Y labels
Y_df = df.loc[X_df.index]['class']

X = X_df.to_numpy()
X = (X - X.mean(axis=0)) / X.std(axis=0)

Y = pd.get_dummies(Y_df.to_numpy())
Y = Y.to_numpy()
Y = np.sum(Y * np.arange(3), axis=1)

In [5]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:

model = lasso.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy * 100))
print('f1 score: %.2f' % (f1(y_test, yhat, average='weighted') * 100))

In [6]:
# train adaboost model
model = AdaBoostClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_train)
# evaluate predictions
accuracy = accuracy_score(y_train, yhat)
print('Accuracy: %.2f' % (accuracy * 100))

Accuracy: 93.90


In [12]:
def preprocess_labels(labels):
    labels = labels.map({'GALAXY': 0, 'QSO': 1, 'STAR': 2})
    return labels.to_numpy()

def data_load(root_dir):
    # Load data
    df = pd.read_csv(root_dir)

    # Split the data into three datasets
    datasets = []
    for i in range(3):
        subset = df[df['class'] == 'GALAXY'].sample(frac=1 / 3)
        subset = pd.concat([subset, df[df['class'] == 'QSO'], df[df['class'] == 'STAR']])
        datasets.append(subset)

    # Preprocess features and labels for each dataset
    features = []
    labels = []
    scaler = StandardScaler()
    for dataset in datasets:
        labels.append(preprocess_labels(dataset['class']))
        subset_features = dataset[['u', 'g', 'i', 'z', 'redshift']].to_numpy()
        subset_features = scaler.fit_transform(subset_features)
        features.append(subset_features)

    # Assign variables for each dataset
    features1, features2, features3 = features
    labels1, labels2, labels3 = labels

    datasets_divided = [(features1, labels1), (features2, labels2), (features3, labels3)]
    return datasets_divided

In [23]:

# train adaboost model
model = AdaBoostClassifier(n_estimators=50, algorithm='SAMME')
model.fit(x_train, y_train)
# evaluate the model
yhat = model.predict(x_train)
# evaluate predictions
accuracy = accuracy_score(y_train, yhat)
print('Accuracy: %.2f' % (accuracy * 100))

Accuracy: 95.52


In [25]:
import json
from params import logs_path

# Load the JSON data for each model
xgb_results = json.load(open(f'{logs_path}/XGBClassifier_results.json'))
catboost_results = json.load(open(f'{logs_path}/CatBoostClassifier_results.json'))
rf_results = json.load(open(f'{logs_path}/RandomForestClassifier_results.json'))

# Function to find the best parameters and test score for a given model
def get_best_parameters(model_results):
    # Extract the test scores and parameters
    test_scores = model_results['mean_test_score']
    parameters = model_results['params']

    # Find the index of the best test score
    best_index = max(range(len(test_scores)), key=test_scores.__getitem__)

    # Get the parameters of the model with the highest test score
    best_parameters = parameters[best_index]

    # Return the best parameters and test score
    return best_parameters, test_scores[best_index]

# Get the best parameters and test scores for each model
xgb_best_parameters, xgb_best_score = get_best_parameters(xgb_results)
catboost_best_parameters, catboost_best_score = get_best_parameters(catboost_results)
rf_best_parameters, rf_best_score = get_best_parameters(rf_results)

# Print the results for each model
print("XGBoost - Best Parameters:")
print(xgb_best_parameters)
print("XGBoost - Best Test Score:")
print(xgb_best_score)
print()

print("CatBoost - Best Parameters:")
print(catboost_best_parameters)
print("CatBoost - Best Test Score:")
print(catboost_best_score)
print()

print("Random Forest - Best Parameters:")
print(rf_best_parameters)
print("Random Forest - Best Test Score:")
print(rf_best_score)

XGBoost - Best Parameters:
{'colsample_bytree': 0.9, 'gamma': 0.3, 'learning_rate': 0.05, 'max_depth': 20, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.6}
XGBoost - Best Test Score:
0.9733145602120259

CatBoost - Best Parameters:
{'l2_leaf_reg': 5, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 300}
CatBoost - Best Test Score:
0.9730495279112141

Random Forest - Best Parameters:
{'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Random Forest - Best Test Score:
0.9740765280768593


In [None]:
ds = data_load(data_path)
x, y = ds[0]

In [27]:
import json
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the JSON data for each model
xgb_results = json.load(open(f'{logs_path}/XGBClassifier_results.json'))
catboost_results = json.load(open(f'{logs_path}/CatBoostClassifier_results.json'))
rf_results = json.load(open(f'{logs_path}/RandomForestClassifier_results.json'))

# Function to train the model with the best parameters and return the model instance
def train_model_with_best_parameters(model_class, best_parameters, x_train, y_train):
    model = model_class(**best_parameters)
    model.fit(x_train, y_train)
    return model

# Function to perform cross-validation and return the train and validation scores
def perform_cross_validation(model, x_train, y_train):
    train_scores = cross_val_score(model, x_train, y_train, cv=5)
    return train_scores

# Load the data
ds = data_load(data_path)
x, y = ds[0]

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train XGBoost with best parameters
xgb_best_parameters = xgb_results['params'][xgb_results['rank_test_score'].index(1)]
xgb_model = train_model_with_best_parameters(XGBClassifier, xgb_best_parameters, x_train, y_train)

# Train CatBoost with best parameters
catboost_best_parameters = catboost_results['params'][catboost_results['rank_test_score'].index(1)]
catboost_model = train_model_with_best_parameters(CatBoostClassifier, catboost_best_parameters, x_train, y_train)

# Train Random Forest with best parameters
rf_best_parameters = rf_results['params'][rf_results['rank_test_score'].index(1)]
rf_model = train_model_with_best_parameters(RandomForestClassifier, rf_best_parameters, x_train, y_train)

0:	learn: 0.9335234	total: 472ms	remaining: 2m 21s
1:	learn: 0.8070905	total: 857ms	remaining: 2m 7s
2:	learn: 0.7081563	total: 1.21s	remaining: 1m 59s
3:	learn: 0.6264191	total: 1.64s	remaining: 2m 1s
4:	learn: 0.5598679	total: 2.02s	remaining: 1m 59s
5:	learn: 0.5022960	total: 2.39s	remaining: 1m 56s
6:	learn: 0.4534296	total: 2.75s	remaining: 1m 55s
7:	learn: 0.4116157	total: 2.76s	remaining: 1m 40s
8:	learn: 0.3750065	total: 3.13s	remaining: 1m 41s
9:	learn: 0.3436681	total: 3.49s	remaining: 1m 41s
10:	learn: 0.3164616	total: 3.85s	remaining: 1m 41s
11:	learn: 0.2928332	total: 4.23s	remaining: 1m 41s
12:	learn: 0.2716606	total: 4.6s	remaining: 1m 41s
13:	learn: 0.2528299	total: 4.96s	remaining: 1m 41s
14:	learn: 0.2363637	total: 5.33s	remaining: 1m 41s
15:	learn: 0.2214596	total: 5.69s	remaining: 1m 41s
16:	learn: 0.2082959	total: 6.06s	remaining: 1m 40s
17:	learn: 0.1965384	total: 6.43s	remaining: 1m 40s
18:	learn: 0.1863720	total: 6.8s	remaining: 1m 40s
19:	learn: 0.1771131	total

In [29]:
# Perform cross-validation on the train set
xgb_train_scores = perform_cross_validation(xgb_model, x_train, y_train)
catboost_train_scores = perform_cross_validation(catboost_model, x_train, y_train)
rf_train_scores = perform_cross_validation(rf_model, x_train, y_train)

# Make predictions on the test data
xgb_predictions = xgb_model.predict(x_test)
catboost_predictions = catboost_model.predict(x_test)
rf_predictions = rf_model.predict(x_test)

# Print the train and validation scores
print("Train and Validation Scores:")
print("XGBoost - Train Scores:", xgb_train_scores)
print("CatBoost - Train Scores:", catboost_train_scores)
print("Random Forest - Train Scores:", rf_train_scores)

# Print the predictions on the test data
print("Predictions on Test Data:")
print("XGBoost Predictions:", xgb_predictions)
print("CatBoost Predictions:", catboost_predictions)
print("Random Forest Predictions:", rf_predictions)

0:	learn: 0.9344390	total: 682ms	remaining: 3m 24s
1:	learn: 0.8092977	total: 1.12s	remaining: 2m 46s
2:	learn: 0.7115882	total: 1.47s	remaining: 2m 25s
3:	learn: 0.6288261	total: 1.83s	remaining: 2m 15s
4:	learn: 0.5621094	total: 2.2s	remaining: 2m 9s
5:	learn: 0.5050435	total: 2.58s	remaining: 2m 6s
6:	learn: 0.4562010	total: 2.94s	remaining: 2m 3s
7:	learn: 0.4148271	total: 3.31s	remaining: 2m
8:	learn: 0.3791420	total: 3.66s	remaining: 1m 58s
9:	learn: 0.3483340	total: 4.03s	remaining: 1m 56s
10:	learn: 0.3204454	total: 4.4s	remaining: 1m 55s
11:	learn: 0.2959068	total: 4.77s	remaining: 1m 54s
12:	learn: 0.2744242	total: 5.14s	remaining: 1m 53s
13:	learn: 0.2554393	total: 5.51s	remaining: 1m 52s
14:	learn: 0.2386635	total: 5.87s	remaining: 1m 51s
15:	learn: 0.2241940	total: 6.24s	remaining: 1m 50s
16:	learn: 0.2110494	total: 6.6s	remaining: 1m 49s
17:	learn: 0.1991650	total: 6.96s	remaining: 1m 49s
18:	learn: 0.1885322	total: 7.35s	remaining: 1m 48s
19:	learn: 0.1793198	total: 7.77

In [30]:
# get test accuracy
from sklearn.metrics import accuracy_score

xgb_accuracy = accuracy_score(y_test, xgb_predictions)
catboost_accuracy = accuracy_score(y_test, catboost_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)
    
print("Test Accuracy:")
print("XGBoost - Test Accuracy:", xgb_accuracy)
print("CatBoost - Test Accuracy:", catboost_accuracy)
print("Random Forest - Test Accuracy:", rf_accuracy)

Test Accuracy:
XGBoost - Test Accuracy: 0.9728341891668048
CatBoost - Test Accuracy: 0.9729170117608084
Random Forest - Test Accuracy: 0.9744078184528739


In [31]:
# get test f1 score
from sklearn.metrics import f1_score

xgb_f1 = f1_score(y_test, xgb_predictions, average='weighted')
catboost_f1 = f1_score(y_test, catboost_predictions, average='weighted')
rf_f1 = f1_score(y_test, rf_predictions, average='weighted')

print("Test F1 Score:")
print("XGBoost - Test F1 Score:", xgb_f1)
print("CatBoost - Test F1 Score:", catboost_f1)
print("Random Forest - Test F1 Score:", rf_f1)

Test F1 Score:
XGBoost - Test F1 Score: 0.9728159578610427
CatBoost - Test F1 Score: 0.9728821199105614
Random Forest - Test F1 Score: 0.9743808243846758


In [32]:
# save train, test, f1 scores to json
import json
from params import logs_path

# calculate average train scores
xgb_train_scores = np.mean(xgb_train_scores)
catboost_train_scores = np.mean(catboost_train_scores)
rf_train_scores = np.mean(rf_train_scores)

# Create a dictionary of the results
results = {
    'xgb_train_scores': xgb_train_scores,
    'catboost_train_scores': catboost_train_scores,
    'rf_train_scores': rf_train_scores,
    'xgb_accuracy': xgb_accuracy,
    'catboost_accuracy': catboost_accuracy,
    'rf_accuracy': rf_accuracy,
    'xgb_f1': xgb_f1,
    'catboost_f1': catboost_f1,
    'rf_f1': rf_f1
}

# Save the results to a JSON file
with open(f'{logs_path}/best_retrained_results.json', 'w') as file:
    json.dump(results, file, indent=4)