In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [5]:
df = pd.read_csv('landmarks_dataset_balanced__M_A_W_H.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
X = df.drop('label', axis=1).values
y = df['label'].values


In [6]:
print(y)
print(len(y))
print(len(X))
print(f"Number of CPUs: {num_cpus}")

[1 0 1 ... 1 1 0]
5854
5854


NameError: name 'num_cpus' is not defined

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameter grids
param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf']
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'weights': ['uniform', 'distance']
}

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['liblinear', 'saga']
}

# Models
svc = SVC()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
lr = LogisticRegression(random_state=42)

models = {
    'SVC': (svc, param_grid_svc),
    'KNN': (knn, param_grid_knn),
    'RandomForest': (rf, param_grid_rf),
    'XGBoost': (xgb, param_grid_xgb),
    'LogisticRegression': (lr, param_grid_lr)
}


In [8]:
results = []
best_models = {}
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_models[model_name] = grid_search.best_estimator_

    # Determine the number of splits
    num_splits = len([key for key in grid_search.cv_results_.keys() if key.startswith('split') and key.endswith('_test_score')])

    for i in range(len(grid_search.cv_results_['mean_test_score'])):
        params = grid_search.cv_results_['params'][i]
        mean_score = grid_search.cv_results_['mean_test_score'][i]
        mean_fit_time = grid_search.cv_results_['mean_fit_time'][i]
        mean_score_time = grid_search.cv_results_['mean_score_time'][i]
        total_training_time = mean_fit_time + mean_score_time

        # Collect split scores
        split_scores = {f'split{split_idx}_test_score': grid_search.cv_results_[f'split{split_idx}_test_score'][i] for split_idx in range(num_splits)}

        result = {
            'model': model_name,
            'params': params,
            'mean_test_score': mean_score,
            'mean_fit_time': mean_fit_time,
            'mean_score_time': mean_score_time,
            'total_training_time': total_training_time
        }
        result.update(split_scores)

        results.append(result)

    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_}")

# Save the results to an Excel file
results_df = pd.DataFrame(results)
results_df.to_excel('grid_search_results_M_A_W_H.xlsx', index=False)

Best parameters for SVC: {'C': 10, 'kernel': 'poly'}
Best cross-validation accuracy for SVC: 0.8054648916589215


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
Best cross-validation accuracy for KNN: 0.8819159696025368
Best parameters for RandomForest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation accuracy for RandomForest: 0.8861840067063949
Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}
Best cross-validation accuracy for XGBoost: 0.8968591110382155


60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Filemon Sitanggang\anaconda3\envs\Skripsi\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Filemon Sitanggang\anaconda3\envs\Skripsi\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Filemon Sitanggang\anaconda3\envs\Skripsi\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, s

Best parameters for LogisticRegression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation accuracy for LogisticRegression: 0.7768529149126164
