In [23]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score


In [16]:
test_data_path = 'test_data.csv'
train_data_path = 'train_data.csv'

df_test = pd.read_csv(test_data_path)
df_train = pd.read_csv(train_data_path)

In [17]:
X = df_train.drop(columns=['label'])
X_test = df_test.drop(columns=['label'])
y = df_train['label']
y_test = df_test['label']
scaler = StandardScaler()
scaler = scaler.fit(X)

X_train = scaler.transform(X)
X_test = scaler.transform(X_test)

In [28]:
pca = PCA(n_components=0.95)
pca = pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

df_train = pd.DataFrame(X_train)
df_train['label'] = y
df_test = pd.DataFrame(X_test)
df_test['label'] = y_test
X_train.shape

X_train = df_train.drop(columns=['label'])
y_train = df_train['label']
X_eval = df_test.drop(columns=['label'])
y_eval = df_test['label']

In [21]:
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}


In [24]:
results = {}
for name, clf in classifiers.items():
    clf = clf.fit(X_train, y)
    predicted_y = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predicted_y)
    results[name] = accuracy
    print(f"{name} accuracy: {accuracy}")


Logistic Regression accuracy: 0.5115740740740741
SVM accuracy: 0.5162037037037037
Random Forest accuracy: 0.5185185185185185
Gradient Boosting accuracy: 0.5254629629629629


In [44]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import optuna
import numpy as np

In [46]:
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'objective': 'multi:softprob',  # for multi-class classification
        'num_class': 2  # update this based on the number of classes in your dataset
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)
    preds = model.predict(X_eval)
    preds = np.argmax(preds, axis=1)
    accuracy = accuracy_score(y_eval, preds)
    return accuracy

# Create the study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=5)


# Print best parameters and lowest RMSE
print("Best parameters found: ", study.best_params)
print("Lowest RMSE found: ", study.best_value)

# Evaluate the best model
best_params = study.best_params
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(X_train, y_train)
preds = best_model.predict(X_eval)
rmse = mean_squared_error(y_eval, preds, squared=False)
print(f"RMSE on evaluation set: {rmse}")


[I 2024-05-31 18:52:35,377] A new study created in memory with name: no-name-e6822f43-46a2-41a8-abac-fdad993a1564
[I 2024-05-31 18:52:49,418] Trial 0 finished with value: 0.5300925925925926 and parameters: {'n_estimators': 399, 'max_depth': 3, 'learning_rate': 0.045084470424204236, 'subsample': 0.8119564028117718, 'colsample_bytree': 0.971389077754093}. Best is trial 0 with value: 0.5300925925925926.
[I 2024-05-31 18:52:54,832] Trial 2 finished with value: 0.5115740740740741 and parameters: {'n_estimators': 306, 'max_depth': 9, 'learning_rate': 0.2796622093028397, 'subsample': 0.6322102855937235, 'colsample_bytree': 0.770694962367921}. Best is trial 0 with value: 0.5300925925925926.
[I 2024-05-31 18:53:03,374] Trial 1 finished with value: 0.5046296296296297 and parameters: {'n_estimators': 424, 'max_depth': 10, 'learning_rate': 0.18264795460949276, 'subsample': 0.7234358481419008, 'colsample_bytree': 0.6540864160133646}. Best is trial 0 with value: 0.5300925925925926.
[I 2024-05-31 18:

Best parameters found:  {'n_estimators': 384, 'max_depth': 7, 'learning_rate': 0.014433149705921128, 'subsample': 0.9858116635911145, 'colsample_bytree': 0.7465559304958369}
Lowest RMSE found:  0.5925925925925926
RMSE on evaluation set: 0.5046265576279116


