In [17]:
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

Setting up MLFlow

In [18]:
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.autolog()

2024/12/11 20:06:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Read the dataset

In [19]:
file_path = '1600g_filtered_labels.csv'
with open(file_path, 'r') as file:
    cleaned_lines = [line.rstrip(';\n') for line in file]
data = pd.read_csv(file_path, header=None, delimiter=',', engine='python')
del data[data.columns[0]]
data.iloc[:, -1] = data.iloc[:, -1].str.replace(';', '', regex=False)

# Clear data, whitespaces and unecessery symbols
data = data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

print(data.head())

   1               2         3         4              5
0  A  90426757696641  0.314944 -1.022277     -0.3099616
1  A  90426807196641  0.387382 -0.618541   -0.048971802
2  A  90426856696641  0.070999 -0.209480     -0.1959783
3  A  90426906196641  0.037975  0.254976     -0.1565635
4  A  90426955696641  0.073129  0.719431  -0.0010349044


In [20]:
X = data.iloc[:, 1:] # Feature : the rest of the columns
y = data.iloc[:, 0] # Label : the first column
print(X.head())
print(y.head())

                2         3         4              5
0  90426757696641  0.314944 -1.022277     -0.3099616
1  90426807196641  0.387382 -0.618541   -0.048971802
2  90426856696641  0.070999 -0.209480     -0.1959783
3  90426906196641  0.037975  0.254976     -0.1565635
4  90426955696641  0.073129  0.719431  -0.0010349044
0    A
1    A
2    A
3    A
4    A
Name: 1, dtype: object


Now we need to turn categorical data to arithmetic

In [21]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(y_encoded)

[0 0 0 ... 2 2 2]


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

Normalization of features

In [23]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Train a model using KNeighborsClassifier

In [24]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

2024/12/11 20:06:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '51cefbe421b7495eb2901d91998a3f45', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run indecisive-dog-404 at: http://127.0.0.1:8080/#/experiments/0/runs/51cefbe421b7495eb2901d91998a3f45
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0


Predict and evaluate model

In [25]:
y_pred = model.predict(X_test)

Results


In [26]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Print Labels
print("Label mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")

Accuracy: 0.8668515950069348

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.82      0.80       716
           1       0.82      0.78      0.80       740
           2       1.00      1.00      1.00       707

    accuracy                           0.87      2163
   macro avg       0.87      0.87      0.87      2163
weighted avg       0.87      0.87      0.87      2163

Label mapping:
0: A
1: B
2: E


Try to finetune the KNeighborsClassifier model 

In [46]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

In [47]:
param_dist = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': uniform(0.001, 100),             # Continuous range for C
    'solver': ['liblinear', 'saga'],
    'l1_ratio': uniform(0, 1),
}# Range for elasticnet mix

In [48]:
# Initialize Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)

# Perform randomized search
random_search = RandomizedSearchCV(model, param_dist, cv=5, scoring='accuracy', n_jobs=-1, n_iter=50, random_state=42)
random_search.fit(X_train, y_train)

2024/12/11 22:23:30 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f5f590d8e31e4065b093afbf7ab9f90e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
65 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
65 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tsaki\PycharmProjects\GyroscopicData\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tsaki\PycharmProjects\GyroscopicData\.venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    retur

🏃 View run clean-sow-506 at: http://127.0.0.1:8080/#/experiments/0/runs/f5f590d8e31e4065b093afbf7ab9f90e
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0


In [53]:
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validated Score:", random_search.best_score_)
best_model = random_search.best_estimator_

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Print Labels
print("Label mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")

Best Parameters: {'C': np.float64(37.455011884736244), 'l1_ratio': np.float64(0.9507143064099162), 'penalty': 'elasticnet', 'solver': 'saga'}
Best Cross-Validated Score: 1.0
Accuracy: 0.994914470642626

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       716
           1       0.99      0.99      0.99       740
           2       1.00      1.00      1.00       707

    accuracy                           0.99      2163
   macro avg       0.99      0.99      0.99      2163
weighted avg       0.99      0.99      0.99      2163

Label mapping:
0: A
1: B
2: E


In [50]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

2024/12/11 22:24:04 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b5b67eefac834f95a04ef673d17b2160', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run stylish-stag-684 at: http://127.0.0.1:8080/#/experiments/0/runs/b5b67eefac834f95a04ef673d17b2160
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0


In [51]:
y_pred = model.predict(X_test)

In [54]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Print Labels
print("Label mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")

Accuracy: 0.994914470642626

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       716
           1       0.99      0.99      0.99       740
           2       1.00      1.00      1.00       707

    accuracy                           0.99      2163
   macro avg       0.99      0.99      0.99      2163
weighted avg       0.99      0.99      0.99      2163

Label mapping:
0: A
1: B
2: E
