Import required libraries and modules

In [1]:
import numpy as np
import pandas as pd
import joblib
import os

from imblearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score

Import training, validation and testing datasets

In [2]:
# Read the train / valid / test CSV of every layer (7-12) into a nested dict,
# addressed as datasets['layer7']['train_data'] and so on.
datasets = {}
for i in range(7, 13):
    layer_frames = datasets[f'layer{i}'] = {}
    for split in ('train', 'valid', 'test'):
        csv_path = f'./../data/layer {i}/layer_{i}_{split}.csv'
        layer_frames[f'{split}_data'] = pd.read_csv(csv_path)

In [3]:
# Feature-only copy of each layer's raw test set: the four label columns are
# dropped, one frame per layer, unpacked in layer order 7..12.
(
    test_data_features_layer7,
    test_data_features_layer8,
    test_data_features_layer9,
    test_data_features_layer10,
    test_data_features_layer11,
    test_data_features_layer12,
) = (
    datasets[f'layer{layer_no}']['test_data'].drop(columns=['label_1', 'label_2', 'label_3', 'label_4'])
    for layer_no in range(7, 13)
)

Method to construct the training and validation datasets for each label in a layer

In [4]:
def create_train_valid_dataset(train_valid_dataset):
    """Split a train/validation frame into one features/label pair per label.

    For each of ``label_1`` .. ``label_4`` the other three label columns are
    removed, rows whose target label is missing are dropped, and any remaining
    missing values are replaced by the column means of the rows that were kept.

    The target column is selected by name, so the result does not depend on
    where the label columns sit in the frame.

    Parameters
    ----------
    train_valid_dataset : pandas.DataFrame
        Frame containing the feature columns plus the columns ``label_1``,
        ``label_2``, ``label_3`` and ``label_4``.

    Returns
    -------
    dict
        ``{'label_N': {'features': DataFrame, 'label': Series}}`` for
        N in 1..4. The input frame is not modified.

    Raises
    ------
    KeyError
        If any of the four label columns is absent from the input frame.
    """
    label_columns = ['label_1', 'label_2', 'label_3', 'label_4']
    dataset = {}

    for target in label_columns:
        other_labels = [name for name in label_columns if name != target]

        # Keep the features plus only the label being modelled.
        frame = train_valid_dataset.drop(columns=other_labels)
        # A row without its target label cannot be used for this label.
        frame = frame.dropna(subset=[target], how='any')
        # Impute what is still missing with means computed on the kept rows.
        frame = frame.fillna(frame.mean())

        dataset[target] = {
            'features': frame.drop(columns=[target]),
            'label': frame[target],
        }

    return dataset

Method to construct the test dataset for a layer

In [5]:
def create_test_dataset(test_dataset):
    """Return a layer's test frame with missing values mean-imputed.

    Parameters
    ----------
    test_dataset : pandas.DataFrame
        Raw test frame of one layer.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every missing value is replaced by the mean of
        its column; all columns of the input are kept.
    """
    column_means = test_dataset.mean()
    return test_dataset.fillna(column_means)

Create datasets

In [6]:
# Preprocess every layer: train and valid frames are split per label,
# the test frame is only mean-imputed.
modified_datasets = {}
for i in range(7, 13):
    raw_layer = datasets[f'layer{i}']
    modified_datasets[f'layer{i}'] = {
        'train_data': create_train_valid_dataset(raw_layer['train_data']),
        'valid_data': create_train_valid_dataset(raw_layer['valid_data']),
        'test_data': create_test_dataset(raw_layer['test_data']),
    }

In [7]:
# Stack the per-layer datasets row-wise into one train / valid / test set per
# label, addressed as combined_dataset['label_3']['train']['features'] etc.
combined_dataset = {}

for label in range(1, 5):
    label_key = f'label_{label}'

    train_features = []
    train_label = []
    valid_features = []
    valid_label = []
    test_features = []
    for layer in range(7, 13):
        layer_data = modified_datasets[f'layer{layer}']
        train_split = layer_data['train_data'][label_key]
        valid_split = layer_data['valid_data'][label_key]

        train_features.append(train_split['features'])
        train_label.append(train_split['label'])
        valid_features.append(valid_split['features'])
        valid_label.append(valid_split['label'])
        # The test frame is not split per label and still holds all four label
        # columns, so indexing it with the label name would return the label
        # column itself. The features are what is left after dropping them.
        test_features.append(
            layer_data['test_data'].drop(columns=['label_1', 'label_2', 'label_3', 'label_4'])
        )

    combined_dataset[label_key] = {
        'train': {
            'features': pd.concat(train_features),
            'label': pd.concat(train_label),
        },
        'valid': {
            'features': pd.concat(valid_features),
            'label': pd.concat(valid_label),
        },
        'test': {
            'features': pd.concat(test_features),
        },
    }

Method to load components

In [8]:
def load_components(label):
    """Load the saved preprocessing objects and model of every layer for a label.

    Each layer (7-12) gets a ``scaler`` and a ``model``. A ``pca`` entry is
    loaded as well, except for layer 7 with label 2 or 3, where no PCA file
    is read.

    Parameters
    ----------
    label : int
        Label number used to build the ``label_{label}`` directory name.

    Returns
    -------
    dict
        ``{'layerN': {'scaler': ..., ['pca': ...,] 'model': ...}}`` for
        N in 7..12.
    """
    # joblib.load unpickles the files, so only trusted artefacts should be loaded.
    components = {}

    for layer in range(7, 13):
        layer_key = f'layer{layer}'
        base_dir = f'./../saved_components/layer{layer}/label_{label}'
        skip_pca = layer == 7 and label in (2, 3)

        loaded = components[layer_key] = {}
        loaded['scaler'] = joblib.load(f'{base_dir}/scaler.pkl')
        if not skip_pca:
            loaded['pca'] = joblib.load(f'{base_dir}/pca.pkl')
        loaded['model'] = joblib.load(f'{base_dir}/model.pkl')

    return components

Method to create pipelines

In [9]:
def make_pipelines(components):
    """Assemble one pipeline per layer from previously loaded components.

    Parameters
    ----------
    components : dict
        Mapping ``'layerN'`` (N in 7..12) to a dict with ``'scaler'`` and
        ``'model'`` entries and an optional ``'pca'`` entry.

    Returns
    -------
    dict
        ``{'layerN': Pipeline}``. Each pipeline has the steps ``scaler``,
        then ``pca`` when one was supplied, then ``classifier``.
    """
    pipelines = {}

    for layer in range(7, 13):
        layer_key = f'layer{layer}'
        parts = components[layer_key]

        steps = [('scaler', parts['scaler'])]
        # The PCA step is optional; layers saved without one skip it.
        if 'pca' in parts:
            steps.append(('pca', parts['pca']))
        steps.append(('classifier', parts['model']))

        pipelines[layer_key] = Pipeline(steps)

    return pipelines

Method to create voting classifier

In [10]:
def make_ensemble(pipelines):
    """Wrap the six per-layer pipelines in a hard-voting classifier.

    Parameters
    ----------
    pipelines : dict
        Mapping ``'layerN'`` to a pipeline for N in 7..12.

    Returns
    -------
    VotingClassifier
        Unfitted ensemble whose estimators are named ``pipeline7`` ..
        ``pipeline12`` and which uses ``voting='hard'``.
    """
    voters = [
        (f'pipeline{layer}', pipelines[f'layer{layer}'])
        for layer in range(7, 13)
    ]
    return VotingClassifier(estimators=voters, voting='hard')

Method to evaluate a model on the validation data and make predictions on the test data

In [11]:
def validate_and_test(model, data):
    """Print validation metrics for a fitted model and predict the test set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    data : dict
        Must provide ``data['valid']['features']``, ``data['valid']['label']``
        and ``data['test']['features']``.

    Returns
    -------
    The value returned by ``model.predict`` for the test features.
    """
    # Predict the validation data (not the test data) for scoring.
    y_valid_pred = model.predict(data['valid']['features'])
    y_valid_true = data['valid']['label']

    # Precision and recall are macro-averaged; precision counts an undefined
    # class score as 1, recall keeps the scorer's default handling.
    scores = (
        ('Accuracy', accuracy_score(y_valid_true, y_valid_pred)),
        ('Precision', precision_score(y_valid_true, y_valid_pred, average='macro', zero_division=1)),
        ('Recall', recall_score(y_valid_true, y_valid_pred, average='macro')),
    )
    for metric_name, value in scores:
        print(f"{metric_name}: {value:.3f}")

    # Predictions for the unlabeled test features are what the caller gets back.
    return model.predict(data['test']['features'])

# Label 3

Import Pipeline Components

In [16]:
# Load the saved scaler / PCA / model objects of every layer for label 3.
label_3_components = load_components(3)

Create Pipeline

In [17]:
# Turn the loaded components into one pipeline per layer.
label_3_pipilines = make_pipelines(label_3_components)

In [22]:
# Show the steps of each per-layer pipeline.
label_3_pipilines

{'layer7': Pipeline(steps=[('scaler', StandardScaler()),
                 ('classifier', SVC(C=1000, gamma=0.001))]),
 'layer8': Pipeline(steps=[('scaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95, svd_solver='full')),
                 ('classifier', SVC(C=10, class_weight='balanced'))]),
 'layer9': Pipeline(steps=[('scaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95, svd_solver='full')),
                 ('classifier', SVC(C=10, gamma=0.001))]),
 'layer10': Pipeline(steps=[('scaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95, svd_solver='full')),
                 ('classifier', SVC(C=10, gamma=0.001))]),
 'layer11': Pipeline(steps=[('scaler', StandardScaler()),
                 ('pca', PCA(n_components=0.95, svd_solver='full')),
                 ('classifier', SVC(C=100))]),
 'layer12': Pipeline(steps=[('scaler', StandardScaler()),
                 ('pca', PCA(n_components=0.99, svd_solver='full')),
                 ('

## Model Ensemble

Voting classifier

In [None]:
# Combine the six per-layer pipelines for label 3 into one hard-voting ensemble.
ensemble = make_ensemble(label_3_pipilines)

# NOTE: VotingClassifier.fit fits clones of the estimators it was given, so the
# loaded pipelines are trained again here on the combined label 3 training set.
ensemble.fit(combined_dataset['label_3']['train']['features'], combined_dataset['label_3']['train']['label'])

# joblib.dump does not create missing directories, so make sure the target
# folder exists before persisting the fitted ensemble.
ensemble_dir = './../saved_components/complete/label_3'
os.makedirs(ensemble_dir, exist_ok=True)
joblib.dump(ensemble, f'{ensemble_dir}/ensemble.pkl')

Test the model

In [None]:
# Print the validation metrics of the label 3 ensemble and keep its test-set predictions.
y_test_pred_label_3 = validate_and_test(ensemble, combined_dataset['label_3'])