In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import xgboost as xgb

from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, confusion_matrix
from sklearn.cluster import KMeans, DBSCAN

from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout, Input

In [None]:
# Read the inputs from disk: the raw table via pandas and the cleaned
# data via NumPy.
df = pd.read_csv("data/data_raw.csv")
data = np.load("data/data_clean.npy")

In [None]:
# Seed NumPy's global random generator so later draws from it are repeatable.
np.random.seed(seed=123)

In [None]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# shuffle repeatable.
# NOTE(review): X and y are not assigned in any cell visible here — confirm
# they are built before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear-regression baseline: fit on the training split, predict the test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report test-set error and coefficient of determination.
print("Mean Squared Error:", mean_squared_error(y_true=y_test, y_pred=y_pred))
print("R-squared:", r2_score(y_true=y_test, y_pred=y_pred))

# Keep the fitted parameters around for later inspection.
coefficients = model.coef_
intercept = model.intercept_

In [None]:
# Logistic-regression classifier: fit on the training split, predict the test split.
# Alternative estimator for a multi-class target:
#   model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
# Second column of the probability matrix, i.e. the probability of model.classes_[1].
y_prob = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
# f1_score uses binary averaging by default; pass average='weighted' for a
# multi-class target.
print("F1 Score:", f1_score(y_true=y_test, y_pred=y_pred))
# Optional diagnostic:
#   print("Confusion Matrix:")
#   print(confusion_matrix(y_test, y_pred))

# First row of the fitted parameters (the only row for a binary target).
coefficients = model.coef_[0]
intercept = model.intercept_[0]

In [None]:
# Hyper-parameter search for XGBoost: exhaustive grid search scored by 5-fold
# cross-validation on the training split.
# Classification alternatives for the estimator:
#   xgb_model = xgb.XGBClassifier(objective='binary:logistic')
#   xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=5)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Candidate values per hyper-parameter. Every combination is evaluated:
# 5 * 3 * 3 * 3 * 4 * 5 * 5 = 13,500 settings, each fitted on 5 folds.
param_grid = dict(
    n_estimators=[10, 25, 50, 100, 200],
    max_depth=[3, 5, 7],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    reg_lambda=[0.01, 0.1, 1, 10, 100],  # L2 penalty
    reg_alpha=[0.01, 0.1, 1, 10, 100],  # L1 penalty
)

# Scores are maximised, hence the negated MSE. For a probabilistic classifier
# scoring='neg_brier_score' can be used instead.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

In [None]:
# Refit XGBoost on the training split with the hyper-parameters selected by
# the grid search, then predict the test split.
xgb_model.set_params(**best_params)
# Fit `xgb_model` itself (not the unrelated `model` from the regression cells)
# so the prediction below comes from the tuned estimator.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
# For a classifier, second-column class probabilities are available via:
#   y_prob = xgb_model.predict_proba(X_test)[:, 1]

In [None]:
# Random-forest baseline with 100 trees and a fixed seed.
# Classification alternative:
#   rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit on the training split and predict the test split.
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# Classifier only:
#   y_prob = rf_model.predict_proba(X_test)[:, 1]

# Optional search space for tuning the forest, e.g. with
# GridSearchCV(rf_model, rf_param_grid, cv=5). It is only defined here, not run.
# It is a real dict rather than a bare string literal so the cell does not echo
# it as output. `max_features='auto'` is no longer accepted by current
# scikit-learn, so 1.0 (all features, the regressor's former 'auto') is used.
rf_param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_features': [1.0, 'sqrt'],  # Number of features to consider for the best split
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False],  # Whether bootstrap samples are used when building trees
}

In [None]:
# Linear SVM on PCA-reduced features: record train/test accuracy as a function
# of the number of leading principal components kept (q = 10, 20, ..., 200).
accuracies = []  # one (train_accuracy, test_accuracy) tuple per value of q

# Fit the projection on the training split only, so no test-set statistics
# leak into the features the classifier is evaluated on.
pca = PCA(n_components=200)
pca.fit(X_train)
print("Portion of explained variance:", pca.explained_variance_ratio_)

# The projection does not depend on q, so transform each split once and slice
# inside the loop instead of re-projecting on every iteration.
X_train_pca_full = pca.transform(X_train)
X_test_pca_full = pca.transform(X_test)

for q in range(10, 201, 10):
    # Keep the first q components.
    X_train_pca = X_train_pca_full[:, :q]
    X_test_pca = X_test_pca_full[:, :q]

    svm = LinearSVC(C=0.01, max_iter=10**4)
    svm.fit(X_train_pca, y_train)

    accuracies.append((svm.score(X_train_pca, y_train), svm.score(X_test_pca, y_test)))

In [None]:
# SVM with hyper-parameters chosen by 5-fold cross-validated grid search over
# C and gamma, followed by a fresh fit with the winning pair.
param_grid = dict(C=[1, 10, 100], gamma=[0.001, 0.01, 0.1])

grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
C, gamma = (grid_search.best_params_[key] for key in ("C", "gamma"))

# Kernel is left at its default (rbf).
svm = SVC(C=C, gamma=gamma).fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [None]:
# Multi-layer perceptron: two 32-unit ReLU hidden layers, each followed by 30%
# dropout, and a 5-unit softmax output.
# Seed the NumPy and TensorFlow random generators before building the model.
np.random.seed(42)
tf.random.set_seed(42)

model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation="relu"),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dropout(0.3),
    Dense(5, activation="softmax"),
    # Output-layer alternatives:
    #   Dense(1)                        # regression
    #   Dense(1, activation='sigmoid')  # binary classification
])

# NOTE(review): categorical_crossentropy presumably requires y_train / y_test to
# be one-hot encoded with 5 columns — confirm against how y is built.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
# Compile alternatives:
#   model.compile(loss="mean_squared_error", optimizer="adam")  # regression
#   model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])  # binary classification

# Train for 10 epochs; the test split is passed as validation data.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)
y_pred = model.predict(X_test)

In [None]:
# Partition the samples into four clusters with k-means.
kmeans = KMeans(n_clusters=4).fit(X)
labels = kmeans.labels_
centers = kmeans.cluster_centers_

# Scatter the first two feature columns coloured by cluster label, then overlay
# the cluster centres as large red markers with black edges.
plt.scatter(X[:, 0], X[:, 1], cmap='viridis', c=labels)
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='red', marker='o', edgecolors='k')
plt.show()

In [None]:
# Density-based clustering with DBSCAN (eps=1, min_samples=5).
dbscan = DBSCAN(eps=1, min_samples=5)
dbscan.fit(X)
labels = dbscan.labels_  # DBSCAN assigns the label -1 to noise samples

# DBSCAN exposes no `cluster_centers_` attribute, so accessing it raises
# AttributeError. As a representative centre, take the mean of the members of
# each cluster, skipping the noise label.
X_points = np.asarray(X)
centers = np.array(
    [X_points[labels == k].mean(axis=0) for k in np.unique(labels) if k != -1]
).reshape(-1, X_points.shape[1])  # stays 2-D, (0, n_features), when no cluster is found

In [None]:
# Pickle the collected results to 'results.pkl' for use when creating plots.
# NOTE(review): `results` is not assigned in any cell visible here — confirm
# where it is built before this cell runs.
with open('results.pkl', mode='wb') as fh:
    pickle.dump(results, fh)