In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from tensorflow.keras.optimizers import Adam

print("Libraries loaded!")

Libraries loaded!


In [None]:
# Read the binary-star table from the CSV file (path is relative to the
# notebook's working directory) into the DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="binarystars.csv")

In [None]:
# Columns treated as numeric: cast each of them to float so the later
# scaling step operates on a uniform dtype.
numeric_columns = ['period', 'period_err', 'bjd0', 'bjd0_err', 'kmag', 'Teff']
df = df.astype({column: float for column in numeric_columns})

In [None]:
# Cast the columns assumed to be binary flags ('morph', 'SC') to bool.
# NOTE(review): astype(bool) turns every non-zero number and every non-empty
# string into True -- confirm both columns are really stored as 0/1 or booleans.
binary_columns = ['morph', 'SC']
for flag_column in binary_columns:
    df[flag_column] = df[flag_column].astype(bool)

# Provisional list of categorical columns; the following cell redefines it
# and additionally treats 'morph' as categorical.
categorical_columns = ['KIC', 'GLon', 'GLat']  # Add other categorical columns if needed

In [None]:
# Final grouping of the DataFrame columns used by the cells below:
#   binary_columns      -> the 0/1 target flag
#   numeric_columns     -> columns that get min-max scaled
#   categorical_columns -> columns that get one-hot encoded
binary_columns = ['SC']
numeric_columns = [
    'period',
    'period_err',
    'bjd0',
    'bjd0_err',
    'kmag',
    'Teff',
]
categorical_columns = [
    'KIC',
    'morph',
    'GLon',
    'GLat',
]

In [None]:
# Apply one-hot encoding for categorical columns, dropping the first category
# of every column so the indicator columns are not perfectly collinear.
# NOTE(review): every distinct value of 'KIC', 'GLon' and 'GLat' becomes its own
# column here. If these are identifiers / continuous coordinates (TODO confirm),
# this inflates the feature count massively and is unlikely to generalise.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    one_hot_encoder = OneHotEncoder(drop='first', sparse=False)
encoded_categorical_columns = one_hot_encoder.fit_transform(df[categorical_columns])

# Categories learned per input column, in the order of categorical_columns
categories = one_hot_encoder.categories_

# Name each output column "<source column>_<category>". The first category of
# every source column is skipped because drop='first' removed it from the output.
column_names = [
    f"{col}_{category}"
    for col, col_categories in zip(categorical_columns, categories)
    for category in col_categories[1:]
]



In [None]:
# Wrap the encoded array in a DataFrame that shares df's row index and carries
# the generated column names, then swap it in for the raw categorical columns.
# Note: this cell is not re-runnable on its own, because the raw categorical
# columns no longer exist in df after the first run.
encoded_categorical_columns = pd.DataFrame(
    data=encoded_categorical_columns,
    index=df.index,
    columns=column_names,
)
df = pd.concat(
    objs=[df.drop(columns=categorical_columns), encoded_categorical_columns],
    axis="columns",
)

In [None]:
# Separate the label from the predictors: 'SC' (cast to int) is the target,
# every other column of df is a feature.
y = df['SC'].astype(int)  # Target
X = df.drop(columns='SC')  # Features

In [None]:
# Feature scaling
# Fit the scaler on the training rows only, so that the validation rows'
# minima/maxima do not leak into preprocessing. The actual train/validation
# split happens in the next cell; the training rows are recovered here by
# splitting the row index with the SAME test_size and random_state, which
# yields the same row assignment. Keep the two calls in sync.
# Assumes X.index is unique (true for the default index from read_csv).
train_index, val_index = train_test_split(X.index, test_size=0.2, random_state=42)
scaler = MinMaxScaler()
scaler.fit(X.loc[train_index, numeric_columns])
# Validation values beyond the training range can land slightly outside [0, 1].
X[numeric_columns] = scaler.transform(X[numeric_columns])

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# row assignment reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Print each split under its own heading as a quick visual sanity check.
for heading, split in (
    ("X_train:", X_train),
    ("\nX_val:", X_val),
    ("\ny_train:", y_train),
    ("\ny_val:", y_val),
):
    print(heading)
    print(split)

X_train:
        period  period_err      bjd0  bjd0_err      kmag      Teff  \
2297  0.014644    0.985569  0.962985  0.230111  0.719940  0.451468   
2281  0.014163    0.985566  0.962189  0.230853  0.617009  0.682844   
1093  0.001177    0.985515  0.962965  0.258070  0.670716  0.000000   
542   0.000361    0.985513  0.962078  0.230267  0.621107  0.538503   
2858  0.257627    0.988267  0.965305  0.229588  0.814001  0.466870   
...        ...         ...       ...       ...       ...       ...   
1638  0.003542    0.985520  0.962337  0.231650  0.772153  0.502059   
1095  0.001177    0.985571  0.962990  0.234151  0.772539  0.591332   
1130  0.001248    0.985515  0.962299  0.227117  0.810674  0.332915   
1294  0.001786    0.985516  0.962087  0.237011  0.633208  0.578976   
860   0.000710    0.985514  0.962284  0.228085  0.644682  0.487822   

      KIC_1026957  KIC_1161345  KIC_1295531  KIC_1432214  ...  GLat_20.4345  \
2297          0.0          0.0          0.0          0.0  ...          

In [None]:
# Reshape data for GRU input, which expects (samples, timesteps, features).
# Each row is a single feature vector with no temporal ordering between its
# columns, so it is fed as ONE timestep carrying all features. The previous
# (samples, n_features, 1) layout made the recurrent layers unroll over one
# step per column -- with the one-hot columns that is a very long sequence,
# which makes each epoch extremely slow.
# The cast to float32 also makes bool/int columns explicit numeric input.
X_train_reshaped = X_train.to_numpy(dtype='float32').reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val_reshaped = X_val.to_numpy(dtype='float32').reshape((X_val.shape[0], 1, X_val.shape[1]))

# Build the GRU model: three stacked GRU layers (128 -> 64 -> 32 units), a
# dense hidden layer and a single sigmoid unit for the binary target.
# Dropout follows every trainable layer except the output.
model = Sequential([
    GRU(units=128, return_sequences=True, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    Dropout(0.5),  # Increase dropout rate
    GRU(units=64, return_sequences=True),
    Dropout(0.5),
    GRU(units=32),  # last recurrent layer returns only its final state
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # probability of the positive class
])

In [None]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# Adam with a learning rate of 0.001, and accuracy tracked as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Fit for 20 epochs in mini-batches of 32, evaluating on the validation split
# after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train_reshaped,
    y_train,
    validation_data=(X_val_reshaped, y_val),
    batch_size=32,
    epochs=20,
)

Epoch 1/20

KeyboardInterrupt: 

In [None]:
# Loss curves: one line per recorded series (training and validation), per epoch.
for history_key, curve_label in (
    ('loss', 'Training Loss'),
    ('val_loss', 'Validation Loss'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Accuracy curves: one line per recorded series (training and validation), per epoch.
for history_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Evaluate the model on the validation split.
# `Sequential.predict_classes` no longer exists in current Keras and
# `X_val_scaled` is never defined in this notebook, so the sigmoid outputs for
# the already-reshaped validation array are thresholded at 0.5 instead
# (the same rule predict_classes applied to a single sigmoid unit).
y_prob = model.predict(X_val_reshaped)
y_pred = (y_prob > 0.5).astype('int32').ravel()  # flat array of 0/1 labels

accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Confusion matrix of validation predictions vs. true labels.
# The previous scatter of two binary vectors collapsed into at most four
# overlapping points and was titled as an F1 / true-skill-score plot, which is
# not what was drawn. Counting each (true, predicted) pair shows the same
# information readably.
confusion = pd.crosstab(
    pd.Series(np.asarray(y_val).ravel(), name='True Values'),
    pd.Series(np.asarray(y_pred).ravel(), name='Predictions'),
).reindex(index=[0, 1], columns=[0, 1], fill_value=0)  # always 2x2, even if a class is never predicted

plt.imshow(confusion.values, cmap='Blues')
plt.colorbar(label='Count')
plt.xticks([0, 1], confusion.columns)
plt.yticks([0, 1], confusion.index)
# Write the count into each cell
for row_pos in range(confusion.shape[0]):
    for col_pos in range(confusion.shape[1]):
        plt.text(col_pos, row_pos, int(confusion.iat[row_pos, col_pos]), ha='center', va='center')
plt.xlabel('Predictions')
plt.ylabel('True Values')
plt.title('Confusion Matrix (Validation Set)')
plt.show()

In [None]:
# Per-class precision / recall / F1 / support summary for the validation split
print(classification_report(y_true=y_val, y_pred=y_pred))

In [None]:
# Making predictions
# `predict_classes` has been removed from Keras and `X_val_scaled` is never
# defined in this notebook; threshold the sigmoid probabilities for the
# already-reshaped validation array at 0.5 to obtain 0/1 class labels.
predictions = (model.predict(X_val_reshaped) > 0.5).astype('int32').ravel()