In [2]:
from params import data_path, plots_path
from imports import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [2]:
df = pd.read_csv(data_path)

In [3]:
X = df.drop(['class'], axis=1)
y = df['class']

In [4]:
model = RandomForestClassifier(n_estimators=100, random_state=1)

In [5]:
model.fit(X, y)

In [6]:
importance_scores = model.feature_importances_
selector = SelectFromModel(model, threshold=0.05)  # Adjust threshold as needed
X_selected = selector.fit_transform(X, y)
selected_feature_names = X.columns[selector.get_support()]
print(selected_feature_names)

Index(['u', 'g', 'i', 'z', 'redshift'], dtype='object')


In [7]:
importance_scores_normalized = importance_scores / importance_scores.sum()

In [26]:
import plotly.graph_objects as go

# Plot feature importance using plotly
fig = go.Figure()
colorscale = 'Sunset'

fig.add_trace(go.Bar(
    x=importance_scores_normalized,
    y=X.columns,
    orientation='h',
    marker=dict(
        color=importance_scores_normalized,
        colorscale=colorscale
    )
))

fig.update_layout(
    title='Feature Importance',
    xaxis=dict(title='Normalized Importance Score'),
    width=800,
    height=700,
    font=dict(size=16),# Adjust the font size here
    margin=dict(l=150),
)

fig.write_html(f'{plots_path}/feature_importance.html')
fig.show()

In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
# import train split
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv(data_path)

# Separate the features (X) and the target variable (y)
X = df.drop(['class'], axis=1)
y = df['class']

# Create a Lasso model for feature selection
lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, max_iter=5000)

# Perform feature selection
selector = SelectFromModel(lasso)
cols = X.columns
X = X.to_numpy()
# normalize the data using standard scaler
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=cols)
X_selected = selector.fit_transform(X, y)

# Get the importance scores for each feature
feature_importances = selector.estimator_.coef_[0]

# Create a DataFrame to show the feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Print the feature importance scores
print("Feature Importance Scores:")
print(feature_importance_df)

Feature Importance Scores:
        Feature  Importance
4             g   59.001669
12  spec_obj_ID    0.808460
14        plate    0.290130
11     field_ID    0.069085
0        obj_ID    0.046218
8        run_ID    0.043072
16     fiber_ID    0.038327
1         alpha    0.030709
9      rerun_ID    0.000000
2         delta   -0.048044
10      cam_col   -0.074765
13     redshift   -0.275864
3             u   -0.831429
15          MJD   -1.242692
5             r   -1.410886
6             i   -1.754121
7             z   -3.698285


In [26]:
# plot log of feature importance
import plotly.graph_objects as go

# Plot feature importance using plotly
fig = go.Figure()
colorscale = 'Sunset'


fig.add_trace(go.Bar(
    x=feature_importance_df['Importance'],
    y=feature_importance_df['Feature'],
    orientation='h',
    marker=dict(
        color=feature_importance_df['Importance'],
        colorscale=colorscale
    )
))

fig.update_layout(
    title='Feature Importance',
    xaxis=dict(title='Importance Score'),
    width=800,
    height=700,
    font=dict(size=16),# Adjust the font size here
    margin=dict(l=150),
)

fig.show()

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = lasso.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 95.25


In [7]:
# Feature selection based on manual study and RandomForest classifier (see feature-selection.ipynb)
X_df = df[['u', 'g', 'i', 'z', 'redshift']]
X_df = X_df[(X_df['redshift'] > 0.001)] # most of the values are very close to zero (<0.00005)

# Remove corresponding Y labels
Y_df = df.loc[X_df.index]['class']

X = X_df.to_numpy()
X = (X - X.mean(axis=0)) / X.std(axis=0)

Y = pd.get_dummies(Y_df.to_numpy())
Y = Y.to_numpy()
Y = np.sum(Y * np.arange(3), axis=1)

In [8]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model = lasso.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 95.76


In [16]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv(data_path)

# Separate the features (X) and the target variable (y)
X = df.drop(['class'], axis=1)
y = df['class']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Replace nan values with 0
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# Normalize the data
X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
X_test = (X_test - X_test.mean(axis=0)) / X_test.std(axis=0)

# remove outliers

# Create a logistic regression model with L1 regularization
model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, max_iter=10000)


# Replace nan values with 0
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# Train the model on the training data
model.fit(X_train, y_train)

# Get the coefficients of the trained model
coefs = model.coef_.flatten()

# Sort the coefficients in descending order
sorted_coefs = np.sort(coefs)[::-1]

# Get the feature names
feature_names = X_train.columns

# Rank the features by their importance
ranked_features = []
for i in range(len(sorted_coefs) - 1):
    try:
        ranked_features.append((feature_names[i], sorted_coefs[i]))
    except IndexError:
        pass

# Print the ranked features
print('Ranked features:')
for feature, importance in ranked_features:
    print('{}: {}'.format(feature, importance))

Ranked features:
obj_ID: 68.62113716295444
alpha: 29.988512760908094
delta: 4.640360189127622
u: 4.6013799051700985
g: 1.583743093085412
r: 1.3234606612616076
i: 0.8317148992666437
z: 0.3848779373375154
run_ID: 0.282788641227865
rerun_ID: 0.27600638467497535
cam_col: 0.26348983895987377
field_ID: 0.11223776555176061
spec_obj_ID: 0.08494865575648221
redshift: 0.06934162971978448
plate: 0.06719611562976754
MJD: 0.044435946178511265
fiber_ID: 0.03775345059333046


In [24]:
import numpy as np

# Calculate the mean and standard deviation of each feature 
means = X.mean(axis=0)
stds = X.std(axis=0)

# Transform features to get unit norm (z-score)
X_norm = (X - means) / stds

# Define a threshold for outliers, e.g. 3 standard deviations
threshold = 3

# Mark outliers as True if z-score is above threshold
outliers = (X_norm > threshold).any(axis=1)

# Filter rows with any outliers
X_clean = X.loc[~outliers,:]

# Remove outliers from the target vector as well
y_clean = y.loc[~outliers]

In [25]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv(data_path)

# Separate the features (X) and the target variable (y)
X = df.drop(['class'], axis=1)
y = df['class']

# Replace nan values with 0
X_clean.fillna(0, inplace=True)

# Normalize the data
X_clean = (X_clean - X_clean.mean(axis=0)) / X_clean.std(axis=0)

# remove outliers

# Create a logistic regression model with L1 regularization
model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, max_iter=10000)


# Replace nan values with 0
X_clean.fillna(0, inplace=True)

# Train the model on the training data
model.fit(X_clean, y_clean)

# Get the coefficients of the trained model
coefs = model.coef_.flatten()

# Sort the coefficients in descending order
sorted_coefs = np.sort(coefs)[::-1]

# Get the feature names
feature_names = cols

# Rank the features by their importance
ranked_features = []
for i in range(len(sorted_coefs) - 1):
    try:
        ranked_features.append((feature_names[i], sorted_coefs[i]))
    except IndexError:
        pass

# Print the ranked features
print('Ranked features:')
for feature, importance in ranked_features:
    print('{}: {}'.format(feature, importance))

Ranked features:
obj_ID: 63.8594506271636
alpha: 25.212257816971093
delta: 5.164023221512315
u: 3.822692430229385
g: 1.8551570849125167
r: 1.521360098149611
i: 0.699183128372593
z: 0.5182211464891071
run_ID: 0.40771319003584194
rerun_ID: 0.2894504989625621
cam_col: 0.16182683456292046
field_ID: 0.12555004221594349
spec_obj_ID: 0.09106307219384911
redshift: 0.08460112141634897
plate: 0.057263260743492715
MJD: 0.049549714456508825
fiber_ID: 0.045214164341307636
