In [32]:
# Importing the required libraries
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn import metrics
import pandas as pd

# grid search shrinkage and distance metric for nearest centroid
from numpy import arange
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from numpy import mean
from numpy import std
from sklearn.metrics import precision_recall_fscore_support as f1_score
import pickle
import wandb

import wandb
import joblib
import numpy as np
import time
from sklearn.metrics import precision_recall_fscore_support
from sklearn.neighbors import NearestCentroid
from memory_profiler import memory_usage
import os
import pandas as pd
from sklearn.model_selection import train_test_split


import os
import pandas as pd
from Bio import SeqIO
from sklearn.model_selection import train_test_split, cross_val_score
from gen_index import GenIndex
from io_processing import IO_processing

# Data Processing
This notebook segment outlines the initial steps of preparing a dataset for training a machine learning model, specifically focusing on location-based data. The code is divided into two main parts: data import and preparation of training and test data.

## Data Import
- Each dataset is imported with Pandas from a CSV file in the `reference_samples` folder whose name ends in `extended_encoded_reference.csv` (one file per reference sequence).
- Any rows with missing values are dropped to ensure data quality and consistency. This is crucial for models that are sensitive to null values.
- A specific preprocessing step is included to address an identified issue: rows where the 'org_location' field is 0.0 are removed. This step is noted as necessary due to a peculiar behavior of some models treating 0.0 as a null value.

## Preparing Training and Test Data
- The dataset is split into features (`x`) and the target variable (`y`), which is 'org_location' in this case.
- The data is then divided into training and test sets using the `train_test_split` method from Scikit-Learn. 20% of the data is reserved for testing, and the split is performed with shuffling to ensure randomization, using a random state of 2 for reproducibility.
- Further, the training set is split again to create a validation set, also using a 20% split with shuffling and the same random state. This additional split is important for model validation during training.
- The training, test, and validation sets are saved into separate CSV files for easy access in later stages of the modeling process.

The print statements at the end of each major section provide a quick summary of the dataset sizes, offering a clear understanding of how much data is available for training, validation, and testing.

In [20]:
import os
import pandas as pd
from io_processing import IO_processing
from gen_index import GenIndex

# Encoding function
def encode(index):
    '''Encode nucleotide text with a 2-bit-per-base code.

    The characters ``A``, ``C``, ``G`` and ``T`` become ``'00'``, ``'01'``,
    ``'10'`` and ``'11'``; ``$`` and ``,`` are removed. Every other character
    is left untouched.

    Args:
        index: Either a single string, or an iterable of strings.

    Returns:
        The translated string when ``index`` is a string; otherwise a list
        holding the translated form of each element, in order.
    '''
    # One translation table serves both the single-string and the row-wise path.
    two_bit_table = str.maketrans(
        {'$': '', ',': '', 'A': '00', 'C': '01', 'G': '10', 'T': '11'}
    )

    if isinstance(index, str):
        return index.translate(two_bit_table)
    return [row.translate(two_bit_table) for row in index]

# Function to extend dataset
def extend_dataset(df, config):
    '''Repeat the rows of each ``org_location`` class to enlarge the dataset.

    For every distinct value of ``df['org_location']`` (in order of first
    appearance) the rows of that class are repeated
    ``max(1, config['training_iterations'] - n_rows)`` times, where ``n_rows``
    is the number of rows the class has in ``df``.

    NOTE(review): a class with ``n_rows`` rows therefore ends up with
    ``n_rows * max(1, training_iterations - n_rows)`` rows, which is not
    ``training_iterations`` in general -- confirm this is the intended rule.

    Args:
        df: DataFrame containing an ``'org_location'`` column.
        config: Mapping providing the integer ``'training_iterations'``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. An empty, column-less
        DataFrame is returned when ``df`` has no classes.
    '''
    # Collect the pieces and concatenate once; concatenating onto a growing
    # frame inside the loop re-copies all accumulated rows on every class.
    class_frames = []

    for cls in df['org_location'].unique():
        cls_df = df[df['org_location'] == cls]
        repeat_times = max(1, config['training_iterations'] - cls_df.shape[0])
        class_frames.extend([cls_df] * repeat_times)

    if not class_frames:
        # pd.concat rejects an empty list; keep the original empty result.
        return pd.DataFrame()

    return pd.concat(class_frames, ignore_index=True)

# Data import and indexing script
# Settings shared by the indexing helpers below.
config = dict(
    seed_length=28,          # forwarded to pharse_reference and generate_index
    read_length=100,
    training_iterations=20,  # drives the per-class repeat count in extend_dataset
)

def process_fasta_file(file_path, config):
    '''Parse one reference FASTA file through the project's ``IO_processing``.

    Args:
        file_path: Path of the FASTA file handed to ``pharse_reference``.
        config: Mapping providing ``'seed_length'``, forwarded as the second
            argument of ``pharse_reference``.

    Returns:
        The tuple ``(sequence_id, sequence_string, sequence_length)`` as
        unpacked from the value ``pharse_reference`` returns.
    '''
    parsed = IO_processing().pharse_reference(file_path, config['seed_length'])
    # Unpack explicitly so a result that is not a 3-item sequence fails here.
    seq_id, seq_text, seq_len = parsed
    return seq_id, seq_text, seq_len

def index_sequence(sequence_string, seed_length):
    '''Build an index for a sequence with the project's ``GenIndex`` class.

    Args:
        sequence_string: Sequence text passed to ``GenIndex.generate_index``.
        seed_length: Seed length passed through unchanged.

    Returns:
        Whatever ``GenIndex.generate_index`` returns for these arguments.
    '''
    return GenIndex().generate_index(sequence_string, seed_length)

def save_index_as_csv(index, file_name):
    '''Write ``index`` to ``file_name`` as CSV without the row-index column.

    Args:
        index: Any data accepted by the ``pd.DataFrame`` constructor.
        file_name: Destination path of the CSV file.
    '''
    pd.DataFrame(index).to_csv(file_name, index=False)

def encode_dataframe(df):
    '''Return a copy of ``df`` with string cells passed through ``encode``.

    Every column except ``'org_location'`` is processed cell by cell: string
    values are replaced by ``encode(value)``; all other values are kept as is.
    The input frame is not modified.

    Args:
        df: DataFrame to encode.

    Returns:
        A new DataFrame with the same columns as ``df``.
    '''
    def _encode_cell(value):
        # Only text is encoded; numbers, NaN, etc. pass through unchanged.
        if isinstance(value, str):
            return encode(value)
        return value

    result = df.copy()
    for name in result.columns:
        if name == 'org_location':
            continue
        result[name] = result[name].apply(_encode_cell)
    return result

folder_path = 'reference_samples'
# os.listdir returns entries in arbitrary order; sort so that the S{i} number
# given to each FASTA file is the same on every run and platform.
fasta_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.fasta'))

# For each FASTA file: parse it, build the index, encode it, extend it and
# save the result next to the input as S{i}_extended_encoded_reference.csv.
for i, fasta_file in enumerate(fasta_files, start=1):
    file_path = os.path.join(folder_path, fasta_file)
    try:
        sequence_id, sequence_string, sequence_length = process_fasta_file(file_path, config)
        index = index_sequence(sequence_string, config['seed_length'])

        # Encode the generated index
        encoded_index = encode_dataframe(pd.DataFrame(index))

        # Extend the dataset
        extended_encoded_index = extend_dataset(encoded_index, config)

        # Save the extended encoded index
        extended_output_file_name = os.path.join(folder_path, f'S{i}_extended_encoded_reference.csv')
        save_index_as_csv(extended_encoded_index, extended_output_file_name)

        print(f'Processed {fasta_file}, extended, encoded index, and saved as {extended_output_file_name}\n')
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f'Error processing {fasta_file}: {e}')


... Pharsing input reference genome

pharsing reference sequence reference_samples\s1.fasta
sequence ID: CM001012.3
sequence Length: 10000
Making rotations
Building suffix array column
Building BWT
Populating FM index character counts
Generating k-length seeds
Processed s1.fasta, extended, encoded index, and saved as reference_samples\S1_extended_encoded_reference.csv

... Pharsing input reference genome

pharsing reference sequence reference_samples\s2.fasta
sequence ID: ref|NC_000021.9|:25851491-26200185
sequence Length: 10000
Making rotations
Building suffix array column
Building BWT
Populating FM index character counts
Generating k-length seeds
Processed s2.fasta, extended, encoded index, and saved as reference_samples\S2_extended_encoded_reference.csv

... Pharsing input reference genome

pharsing reference sequence reference_samples\s3.fasta
sequence ID: CM007607.1
sequence Length: 10000
Making rotations
Building suffix array column
Building BWT
Populating FM index character coun

In [6]:
# TODO: modify the training data generation function to include the details as listed.

In [22]:
folder_path = 'reference_samples'
# Sorted so the files are processed (and reported) in a reproducible order;
# os.listdir returns entries in arbitrary order.
reference_files = sorted(f for f in os.listdir(folder_path) if f.endswith('extended_encoded_reference.csv'))

# NOTE(review): these CSVs were written after extend_dataset repeated the rows
# of each class, so identical rows can land in the train, validation and test
# splits below. Scores measured on these splits may therefore be optimistic.
for ref_file in reference_files:
    # Data import
    file_path = os.path.join(folder_path, ref_file)
    df = pd.read_csv(file_path)
    df = df.dropna()
    df = df[df['org_location'] != 0.0]  # Handle the error with models treating 0.0 as null
    df = df.drop(columns=['suffix_array', 'k-seed-extend'])

    # Preparing training and test data
    x = df.loc[:, df.columns != 'org_location']
    y = df.loc[:, 'org_location']
    # 80/20 train/test split, then 80/20 train/validation split of the training part.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=2)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=2)

    # Saving the datasets in the same folder
    base_filename = ref_file.split('_reference.csv')[0]
    # "X_train" with an upper-case X: the loading cell reads "..._X_train.csv",
    # and a lower-case name is a different file on case-sensitive filesystems.
    X_train.to_csv(os.path.join(folder_path, f"{base_filename}_X_train.csv"), index=False)
    X_test.to_csv(os.path.join(folder_path, f"{base_filename}_X_test.csv"), index=False)
    y_train.to_csv(os.path.join(folder_path, f"{base_filename}_y_train.csv"), index=False)
    y_test.to_csv(os.path.join(folder_path, f"{base_filename}_y_test.csv"), index=False)
    X_val.to_csv(os.path.join(folder_path, f"{base_filename}_X_val.csv"), index=False)
    y_val.to_csv(os.path.join(folder_path, f"{base_filename}_y_val.csv"), index=False)

    print(f"Processed {ref_file}:")
    print(f"  X training dataset length: {len(X_train)}")
    print(f"  y training dataset length: {len(y_train)}")


Processed S1_extended_encoded_reference.csv:
  X training dataset length: 505548
  y training dataset length: 505548
Processed S2_extended_encoded_reference.csv:
  X training dataset length: 505548
  y training dataset length: 505548
Processed S3_extended_encoded_reference.csv:
  X training dataset length: 505548
  y training dataset length: 505548
Processed S4_extended_encoded_reference.csv:
  X training dataset length: 505548
  y training dataset length: 505548


# Model Training
This section of the notebook focuses on the training of multiple machine learning models using Scikit-Learn version 0.24.2. The models selected represent different archetypes in machine learning, including Nearest Neighbour, Averaging Methods, Naïve Bayes, and Gradient Boosting. The specific models and their roles are as follows:

## Models Overview

1. **Nearest Centroid (Nearest Neighbour Archetype)**: 
   - This model is a simplistic yet effective approach for classification, based on the concept of the nearest centroid. It can be particularly useful for baseline comparisons.

2. **KNeighbours Classifier (Nearest Neighbour Archetype)**: 
   - A versatile and widely-used model that classifies data based on the closest training examples in the feature space. It's effective for datasets where the decision boundary is irregular.

3. **Random Forest (Averaging Methods Archetype)**: 
   - A robust ensemble learning method, Random Forest constructs a multitude of decision trees during training and outputs the class that is the mode of the classes (classification) of the individual trees.

4. **Extra Trees Classifier (Averaging Methods Archetype)**: 
   - Similar to Random Forest, this model fits a number of randomized decision trees on various sub-samples of the dataset and uses averaging to improve predictive accuracy and control over-fitting.

5. **Decision Tree (Averaging Methods Archetype)**: 
   - A decision tree is a flowchart-like structure where an internal node represents a feature (or attribute), the branch represents a decision rule, and each leaf node represents the outcome.

6. **Gaussian Naïve Bayes (Naïve Bayes Archetype)**: 
   - This model applies the Bayes theorem with the “naive” assumption of independence between every pair of features. Gaussian Naïve Bayes is particularly suited when the features have continuous values.

7. **LGBM Classifier (Gradient Boosting Archetype)**: 
   - Light Gradient Boosting Machine is a gradient boosting framework that uses tree-based learning algorithms. It's designed for distributed and efficient training, particularly on large datasets.

8. **XGB Classifier (Gradient Boosting Archetype)**: 
   - XGBoost (Extreme Gradient Boosting) is an optimized distributed gradient boosting library. It is highly efficient, flexible, and portable, often delivering state-of-the-art performance in many machine learning tasks.

## Training Process
Each model is trained on the prepared dataset, and the performance is evaluated using the validation set. The training involves tuning model parameters to find the optimal configuration for each model. Key metrics like accuracy, precision, recall, and F1 score are used to assess each model's performance. The diversity in the selected models ensures a comprehensive examination of the dataset, as each model type brings its strengths and weaknesses to different types of data. This approach allows for a thorough understanding of which models are best suited for the specific characteristics of the dataset in question. The results from these models can be used to benchmark performance and guide the selection of the most suitable model for deployment.

## Nearest Neighbour
Nearest Neighbor models are a fundamental class of algorithms in the field of machine learning, predominantly used for classification tasks, though they can also be employed for regression. These models operate on the principle of similarity, identifying the closest data points in the training set to make predictions for new, unseen data. The simplicity, intuitiveness, and effectiveness of these models make them an essential part of any data scientist's toolkit.

#### Key Concepts

1. **Basic Principle**: The core idea behind nearest neighbor models is that similar data points are close to each other in the feature space. Therefore, the label or value of a new data point can be predicted based on the labels or values of its nearest neighbors in the training set.

2. **Distance Metrics**: These models rely on distance metrics to determine the closeness of data points. Common metrics include Euclidean, Manhattan, and Hamming distances. The choice of metric can significantly impact the model's performance and is often dictated by the nature of the data.

3. **K-Nearest Neighbors (KNN)**: One of the most popular variants is the K-Nearest Neighbors algorithm. KNN uses a predefined number 'K' to determine the number of neighboring data points to consider for making a prediction. The optimal value of 'K' is typically selected through cross-validation.

4. **Nearest Centroid Classifier**: This variant classifies data points based on the closest centroid of the training samples in the feature space. It's particularly useful when the data is well-separated into clusters.

5. **Applications**: Nearest neighbor models are used in a wide range of applications, including image recognition, recommendation systems, and medical diagnosis, where the assumption that similar instances have similar outcomes holds true.

6. **Strengths and Limitations**:
   - **Strengths**: These models are easy to implement, interpret, and don't require assumptions about the underlying data distribution. They're also highly adaptable to changes in the input data.
   - **Limitations**: Nearest neighbor models can suffer from high computational costs, especially with large datasets, and can be sensitive to irrelevant or redundant features.

#### Practical Implementation

In practical scenarios, implementing nearest neighbor models involves careful preprocessing of data, selection of an appropriate distance metric, and tuning of parameters like 'K' in KNN. It's also crucial to scale or normalize the data, as these models are sensitive to the scale of the input features.

Moreover, modern applications might require optimizations for handling large datasets, such as using approximate nearest neighbor search algorithms or efficient data structures like KD-trees and Ball Trees.

In summary, nearest neighbor models are a versatile and straightforward tool for both classification and regression tasks in machine learning. Their ability to adapt to complex, real-world datasets makes them a valuable component of the machine learning workflow.


In [24]:
def _load_sample_splits(sample):
    '''Read the four split CSVs of one sample from ``reference_samples/``.

    Args:
        sample: Sample prefix such as ``'S1'``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of DataFrames, read in
        that order.
    '''
    return tuple(
        pd.read_csv(f'reference_samples/{sample}_extended_encoded_{part}.csv')
        for part in ('X_train', 'X_test', 'y_train', 'y_test')
    )


# Feature and label frames for each of the four reference samples.
S1_X_train, S1_X_test, S1_y_train, S1_y_test = _load_sample_splits('S1')
S2_X_train, S2_X_test, S2_y_train, S2_y_test = _load_sample_splits('S2')
S3_X_train, S3_X_test, S3_y_train, S3_y_test = _load_sample_splits('S3')
S4_X_train, S4_X_test, S4_y_train, S4_y_test = _load_sample_splits('S4')


### Nearest Centroid
Nearest Centroid model classifies data by assigning each observation to the class of the nearest centroid, simplifying computation and making it particularly effective for large, well-separated datasets.

In [26]:
# Train and evaluate a Nearest Centroid model on each sample's train/test
# split, log per-sample and averaged metrics to Weights & Biases, then save
# the model fitted on the last sample.

# Initialize Weights & Biases
wandb.init(project='seed_generation', notes='multi-model training', tags=['multimodel', 'seed generation'])

datasets = [(S1_X_train, S1_X_test, S1_y_train, S1_y_test),
            (S2_X_train, S2_X_test, S2_y_train, S2_y_test),
            (S3_X_train, S3_X_test, S3_y_train, S3_y_test),
            (S4_X_train, S4_X_test, S4_y_train, S4_y_test)]

aggregate_metrics = {'Precision': [], 'Recall': [], 'F1 Score': [], 'Memory (MB)': [], 'Execution Time (s)': []}

try:
    for X_train, X_test, y_train, y_test in datasets:
        # Labels are loaded as one-column frames; squeezing along the column
        # axis gives a 1-D Series even when a frame has a single row.
        y_train = y_train.squeeze(axis='columns')
        y_test = y_test.squeeze(axis='columns')

        # Memory snapshots bracket the fit but stay outside the timed regions,
        # so the reported time covers only fit + predict.
        mem_usage_start = memory_usage(-1)
        fit_start = time.perf_counter()
        model = NearestCentroid(metric='manhattan').fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start
        mem_usage_end = memory_usage(-1)
        mem_usage = abs(mem_usage_end[0] - mem_usage_start[0])

        predict_start = time.perf_counter()
        predicted = model.predict(X_test)
        execution_time = fit_seconds + (time.perf_counter() - predict_start)

        # Metrics Calculation
        precision, recall, fscore, support = precision_recall_fscore_support(y_test, predicted, average='weighted')
        # Accuracy from the predictions already made (no second pass over X_test).
        test_accuracy = np.mean(predicted == np.asarray(y_test))

        # Log metrics to wandb
        wandb.log({'Training score': model.score(X_train, y_train) * 100,
                   'Test score': test_accuracy * 100,
                   'Precision': precision,
                   'Recall': recall,
                   'F1 Score': fscore,
                   'Memory (MB)': mem_usage,
                   'Execution Time (s)': execution_time
                  })

        # Aggregating metrics
        aggregate_metrics['Precision'].append(precision)
        aggregate_metrics['Recall'].append(recall)
        aggregate_metrics['F1 Score'].append(fscore)
        aggregate_metrics['Memory (MB)'].append(mem_usage)
        aggregate_metrics['Execution Time (s)'].append(execution_time)

    # Final Aggregation
    for metric, values in aggregate_metrics.items():
        wandb.log({f'Aggregate {metric}': np.mean(values)})
except BaseException:
    # Close the run as failed instead of leaving it open, then re-raise.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

# Save model (only the model trained on the last dataset is kept)
filename = 'model_weights/centroid_model.joblib'
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

0,1
Aggregate Execution Time (s),▁
Aggregate F1 Score,▁
Aggregate Memory (MB),▁
Aggregate Precision,▁
Aggregate Recall,▁
Execution Time (s),█▁▄▂
F1 Score,▁▁▁▁
Memory (MB),█▁▁▁
Precision,▁▁▁▁
Recall,▁▁▁▁

0,1
Aggregate Execution Time (s),27.45161
Aggregate F1 Score,1.0
Aggregate Memory (MB),0.0625
Aggregate Precision,1.0
Aggregate Recall,1.0
Execution Time (s),24.09222
F1 Score,1.0
Memory (MB),0.00391
Precision,1.0
Recall,1.0


['model_weights/centroid_model.joblib']

### Nearest Neighbour
The Nearest Neighbor algorithm is a simple and intuitive classification method that predicts the label of a new data point based on the most common label among its closest neighbors in the feature space.

In [27]:
from sklearn.neighbors import KNeighborsClassifier

# Train and evaluate a K-Nearest Neighbors model on each sample's train/test
# split, log per-sample and averaged metrics to Weights & Biases, then save
# the model fitted on the last sample.

# Initialize Weights & Biases
wandb.init(project='seed_generation', notes='K-Nearest Neighbors training', tags=['knn', 'seed generation'])

datasets = [(S1_X_train, S1_X_test, S1_y_train, S1_y_test),
            (S2_X_train, S2_X_test, S2_y_train, S2_y_test),
            (S3_X_train, S3_X_test, S3_y_train, S3_y_test),
            (S4_X_train, S4_X_test, S4_y_train, S4_y_test)]

aggregate_metrics = {'Precision': [], 'Recall': [], 'F1 Score': [], 'Memory (MB)': [], 'Execution Time (s)': []}

try:
    for X_train, X_test, y_train, y_test in datasets:
        # Labels are loaded as one-column frames; squeezing along the column
        # axis gives a 1-D Series even when a frame has a single row.
        y_train = y_train.squeeze(axis='columns')
        y_test = y_test.squeeze(axis='columns')

        # Memory snapshots bracket the fit but stay outside the timed regions,
        # so the reported time covers only fit + predict.
        mem_usage_start = memory_usage(-1)
        fit_start = time.perf_counter()
        model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', weights='uniform').fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start
        mem_usage_end = memory_usage(-1)
        mem_usage = abs(mem_usage_end[0] - mem_usage_start[0])

        predict_start = time.perf_counter()
        predicted = model.predict(X_test)
        execution_time = fit_seconds + (time.perf_counter() - predict_start)

        # Metrics Calculation
        precision, recall, fscore, support = precision_recall_fscore_support(y_test, predicted, average='weighted')
        # Accuracy from the predictions already made (no second pass over X_test).
        test_accuracy = np.mean(predicted == np.asarray(y_test))

        # Log metrics to wandb
        wandb.log({'Training score': model.score(X_train, y_train) * 100,
                   'Test score': test_accuracy * 100,
                   'Precision': precision,
                   'Recall': recall,
                   'F1 Score': fscore,
                   'Memory (MB)': mem_usage,
                   'Execution Time (s)': execution_time
                  })

        # Aggregating metrics
        aggregate_metrics['Precision'].append(precision)
        aggregate_metrics['Recall'].append(recall)
        aggregate_metrics['F1 Score'].append(fscore)
        aggregate_metrics['Memory (MB)'].append(mem_usage)
        aggregate_metrics['Execution Time (s)'].append(execution_time)

    # Final Aggregation
    for metric, values in aggregate_metrics.items():
        wandb.log({f'Aggregate {metric}': np.mean(values)})
except BaseException:
    # Close the run as failed instead of leaving it open, then re-raise.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

# Save the last trained KNN model
filename = 'model_weights/knn_model.joblib'
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

0,1
Aggregate Execution Time (s),▁
Aggregate F1 Score,▁
Aggregate Memory (MB),▁
Aggregate Precision,▁
Aggregate Recall,▁
Execution Time (s),▁█▃▁
F1 Score,▁▁▁▁
Memory (MB),█▁▁▁
Precision,▁▁▁▁
Recall,▁▁▁▁

0,1
Aggregate Execution Time (s),4.09961
Aggregate F1 Score,1.0
Aggregate Memory (MB),11.33105
Aggregate Precision,1.0
Aggregate Recall,1.0
Execution Time (s),3.87623
F1 Score,1.0
Memory (MB),0.99609
Precision,1.0
Recall,1.0


['model_weights/knn_model.joblib']

### Random Forest

Random Forest is an ensemble learning method that constructs a multitude of decision trees during training and outputs the mode of the classes (for classification) or mean prediction (for regression) of the individual trees, thereby enhancing predictive accuracy and robustness against overfitting.

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Train and evaluate a Random Forest model on each sample's train/test split,
# log per-sample and averaged metrics to Weights & Biases, then save the model
# fitted on the last sample.

# Initialize Weights & Biases
wandb.init(project='seed_generation', notes='Random Forest training', tags=['random_forest', 'seed generation'])

datasets = [(S1_X_train, S1_X_test, S1_y_train, S1_y_test),
            (S2_X_train, S2_X_test, S2_y_train, S2_y_test),
            (S3_X_train, S3_X_test, S3_y_train, S3_y_test),
            (S4_X_train, S4_X_test, S4_y_train, S4_y_test)]

aggregate_metrics = {'Precision': [], 'Recall': [], 'F1 Score': [], 'Memory (MB)': [], 'Execution Time (s)': []}

try:
    for X_train, X_test, y_train, y_test in datasets:
        # Labels are loaded as one-column frames; squeezing along the column
        # axis gives a 1-D Series even when a frame has a single row.
        y_train = y_train.squeeze(axis='columns')
        y_test = y_test.squeeze(axis='columns')

        # Memory snapshots bracket the fit but stay outside the timed regions,
        # so the reported time covers only fit + predict.
        mem_usage_start = memory_usage(-1)
        fit_start = time.perf_counter()
        # Fixed seed so the forest (and its metrics) are reproducible across runs.
        model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start
        mem_usage_end = memory_usage(-1)
        mem_usage = abs(mem_usage_end[0] - mem_usage_start[0])

        predict_start = time.perf_counter()
        predicted = model.predict(X_test)
        execution_time = fit_seconds + (time.perf_counter() - predict_start)

        # Metrics Calculation
        precision, recall, fscore, support = precision_recall_fscore_support(y_test, predicted, average='weighted')
        # Accuracy from the predictions already made (no second pass over X_test).
        test_accuracy = np.mean(predicted == np.asarray(y_test))

        # Log metrics to wandb
        wandb.log({'Training score': model.score(X_train, y_train) * 100,
                   'Test score': test_accuracy * 100,
                   'Precision': precision,
                   'Recall': recall,
                   'F1 Score': fscore,
                   'Memory (MB)': mem_usage,
                   'Execution Time (s)': execution_time
                  })

        # Aggregating metrics
        aggregate_metrics['Precision'].append(precision)
        aggregate_metrics['Recall'].append(recall)
        aggregate_metrics['F1 Score'].append(fscore)
        aggregate_metrics['Memory (MB)'].append(mem_usage)
        aggregate_metrics['Execution Time (s)'].append(execution_time)

    # Final Aggregation
    for metric, values in aggregate_metrics.items():
        wandb.log({f'Aggregate {metric}': np.mean(values)})
except BaseException:
    # Close the run as failed instead of leaving it open, then re-raise.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

# Save the last trained RandomForest model
filename = 'model_weights/random_forest_model.joblib'
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)


## Averaging Methods
Averaging Methods, including algorithms like Random Forest and Ensemble Methods, are a critical class of machine learning models known for their robustness and accuracy. These models work by combining the predictions from multiple models to improve the overall performance, especially in terms of variance reduction.

#### Key Concepts

1. **Basic Principle**: Averaging methods involve training multiple models and combining their predictions. The final output is typically the mean (for regression tasks) or mode (for classification tasks) of the predictions from all models. This approach helps in mitigating the effects of overfitting and improving the model's generalization capabilities.

2. **Random Forest**: A popular example of averaging methods, Random Forest builds numerous decision trees and merges their outcomes. It is a form of 'bagging' where each tree is trained on a subset of the data and features, offering a diversified model performance.

3. **Ensemble Learning**: Averaging methods are a subset of ensemble learning, where the objective is to combine the strengths of various models to achieve better accuracy and stability. This includes methods like bagging and boosting.

4. **Boosting**: Another form of ensemble learning where models are trained sequentially with each model learning from the errors of its predecessors, thus focusing more on the challenging parts of the dataset.

5. **Applications**: These methods are extremely versatile and have been successfully applied in various domains, such as finance for risk assessment, healthcare for disease prediction, and natural language processing tasks.

6. **Strengths and Limitations**:
   - **Strengths**: Averaging methods are known for their high accuracy, ability to handle large datasets and feature spaces, and robustness against overfitting. They also work well with non-linear data.
   - **Limitations**: These models can be complex, computationally intensive, and less interpretable compared to simpler models like linear regression or decision trees.

#### Practical Implementation

Implementing averaging methods requires selecting the right base models and determining how to combine their predictions effectively. The choice of base models and the method of combination (like voting or averaging) can significantly impact the performance. It is also crucial to ensure diversity among the base models to maximize the benefits of averaging.

Additionally, hyperparameter tuning plays a significant role in optimizing these models. Techniques like cross-validation are essential for determining the optimal settings for parameters like the number of trees in a Random Forest or the learning rate in boosting algorithms.

In conclusion, averaging methods are a powerful set of tools in the machine learning arsenal, offering enhanced predictive performance and robustness. Their ability to combine multiple models' strengths makes them suitable for a wide range of complex real-world problems.


### Extra Trees classifier

The Extra Trees Classifier is an ensemble machine learning algorithm that operates similarly to a Random Forest but with randomization at the level of individual tree splits, offering increased variance reduction and potentially faster training at the cost of slightly higher bias.

In [30]:
from sklearn.ensemble import ExtraTreesClassifier

# Train and evaluate an Extra Trees model on each sample's train/test split,
# log per-sample and averaged metrics to Weights & Biases, then save the model
# fitted on the last sample.

# Initialize Weights & Biases
wandb.init(project='seed_generation', notes='Extra Trees Classifier training', tags=['extra_trees', 'seed generation'])

datasets = [(S1_X_train, S1_X_test, S1_y_train, S1_y_test),
            (S2_X_train, S2_X_test, S2_y_train, S2_y_test),
            (S3_X_train, S3_X_test, S3_y_train, S3_y_test),
            (S4_X_train, S4_X_test, S4_y_train, S4_y_test)]

aggregate_metrics = {'Precision': [], 'Recall': [], 'F1 Score': [], 'Memory (MB)': [], 'Execution Time (s)': []}

try:
    for X_train, X_test, y_train, y_test in datasets:
        # Labels are loaded as one-column frames; squeezing along the column
        # axis gives a 1-D Series even when a frame has a single row.
        y_train = y_train.squeeze(axis='columns')
        y_test = y_test.squeeze(axis='columns')

        # Memory snapshots bracket the fit but stay outside the timed regions,
        # so the reported time covers only fit + predict.
        mem_usage_start = memory_usage(-1)
        fit_start = time.perf_counter()
        # Fixed seed so the ensemble (and its metrics) are reproducible across runs.
        model = ExtraTreesClassifier(random_state=0).fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start
        mem_usage_end = memory_usage(-1)
        mem_usage = abs(mem_usage_end[0] - mem_usage_start[0])

        predict_start = time.perf_counter()
        predicted = model.predict(X_test)
        execution_time = fit_seconds + (time.perf_counter() - predict_start)

        # Metrics Calculation
        precision, recall, fscore, support = precision_recall_fscore_support(y_test, predicted, average='weighted')
        # Accuracy from the predictions already made (no second pass over X_test).
        test_accuracy = np.mean(predicted == np.asarray(y_test))

        # Log metrics to wandb
        wandb.log({'Training score': model.score(X_train, y_train) * 100,
                   'Test score': test_accuracy * 100,
                   'Precision': precision,
                   'Recall': recall,
                   'F1 Score': fscore,
                   'Memory (MB)': mem_usage,
                   'Execution Time (s)': execution_time
                  })

        # Aggregating metrics
        aggregate_metrics['Precision'].append(precision)
        aggregate_metrics['Recall'].append(recall)
        aggregate_metrics['F1 Score'].append(fscore)
        aggregate_metrics['Memory (MB)'].append(mem_usage)
        aggregate_metrics['Execution Time (s)'].append(execution_time)

    # Final Aggregation
    for metric, values in aggregate_metrics.items():
        wandb.log({f'Aggregate {metric}': np.mean(values)})
except BaseException:
    # Close the run as failed instead of leaving it open, then re-raise.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

# Save the last trained Extra Trees model
filename = 'model_weights/extra_trees_model.joblib'
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)


### Decision trees classifier
The Decision Trees Classifier is a versatile and interpretable machine learning algorithm that classifies data by splitting it based on feature values, creating a tree-like model of decisions and their possible consequences.

In [31]:
import wandb
import joblib
import numpy as np
import time
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from memory_profiler import memory_usage

import os  # used below to create the output directory before saving

# Train and evaluate a Decision Tree on every dataset split, logging per-split
# metrics to Weights & Biases and then the mean of each metric over the splits.
datasets = [(S1_X_train, S1_X_test, S1_y_train, S1_y_test),
            (S2_X_train, S2_X_test, S2_y_train, S2_y_test),
            (S3_X_train, S3_X_test, S3_y_train, S3_y_test),
            (S4_X_train, S4_X_test, S4_y_train, S4_y_test)]

aggregate_metrics = {'Precision': [], 'Recall': [], 'F1 Score': [], 'Memory (MB)': [], 'Execution Time (s)': []}

# Initialize Weights & Biases
# The run is opened as a context manager so that it is always closed, even if
# training or evaluation raises part-way through the sweep.
with wandb.init(project='seed_generation', notes='Decision Tree training', tags=['decision_tree', 'seed generation']):
    for X_train, X_test, y_train, y_test in datasets:
        # Convert y_train and y_test to 1D array
        y_train = y_train.squeeze()
        y_test = y_test.squeeze()

        # Memory is sampled outside the timed regions so that the profiler's
        # own overhead is not counted as model execution time.
        mem_usage_start = memory_usage(-1)
        fit_start = time.perf_counter()
        model = DecisionTreeClassifier(max_depth=10, random_state=0).fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start
        mem_usage_end = memory_usage(-1)
        # Absolute net change between the two memory samples taken around fit.
        mem_usage = abs(mem_usage_end[0] - mem_usage_start[0])

        predict_start = time.perf_counter()
        predicted = model.predict(X_test)
        # Execution time = fit time + predict time (seconds).
        execution_time = fit_seconds + (time.perf_counter() - predict_start)

        # Metrics Calculation (the 4th return value, support, is not used)
        precision, recall, fscore = precision_recall_fscore_support(
            y_test, predicted, average='weighted')[:3]

        # Log metrics to wandb
        wandb.log({'Training score': model.score(X_train, y_train) * 100,
                   'Test score': model.score(X_test, y_test) * 100,
                   'Precision': precision,
                   'Recall': recall,
                   'F1 Score': fscore,
                   'Memory (MB)': mem_usage,
                   'Execution Time (s)': execution_time
                  })

        # Aggregating metrics
        aggregate_metrics['Precision'].append(precision)
        aggregate_metrics['Recall'].append(recall)
        aggregate_metrics['F1 Score'].append(fscore)
        aggregate_metrics['Memory (MB)'].append(mem_usage)
        aggregate_metrics['Execution Time (s)'].append(execution_time)

    # Final Aggregation: mean of each metric over all dataset splits
    for metric, values in aggregate_metrics.items():
        wandb.log({f'Aggregate {metric}': np.mean(values)})

# Save the last trained Decision Tree model
filename = 'model_weights/decision_tree_model.joblib'
# joblib.dump does not create missing parent directories.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)


## Naïve Bayes
Naïve Bayes algorithms represent a family of simple yet effective probabilistic classifiers based on applying Bayes' theorem with strong (naïve) independence assumptions between the features. Widely used in various applications, they are particularly known for their efficiency and ease of implementation.

#### Key Concepts

1. **Bayesian Theory**: The core principle of Naïve Bayes is Bayes' theorem, which describes the probability of an event (here, a class label) based on prior knowledge of conditions related to that event (here, the observed feature values).

2. **Feature Independence**: Naïve Bayes classifiers assume that the value of a particular feature is independent of the value of any other feature, given the class variable. This assumption simplifies the computation, hence the term "naïve."

3. **Variants of Naïve Bayes**:
   - **Gaussian Naïve Bayes**: Assumes that the continuous values associated with each class are distributed according to a Gaussian distribution.
   - **Multinomial Naïve Bayes**: Typically used for document classification, where the features are the frequencies of the words or tokens.
   - **Bernoulli Naïve Bayes**: Designed for binary (boolean) features, e.g. text classification with a binary 'bag of words' model that records only whether each word is present.

4. **Applications**: Naïve Bayes classifiers are widely used in spam filtering, sentiment analysis, and document classification. They are also employed in medical diagnosis and weather prediction.

5. **Strengths and Limitations**:
   - **Strengths**: They are easy to implement, can handle both continuous and discrete data, and perform well in multi-class prediction. When the independence assumption holds, a Naïve Bayes classifier performs better compared to other models and requires much less training data.
   - **Limitations**: Their strong feature independence assumptions can lead to poor performance if this assumption does not hold. In practice, they are often outperformed by models like Random Forest or Gradient Boosting.

#### Practical Implementation

Implementing Naïve Bayes models involves careful preprocessing of data. For text data, techniques like bag-of-words or TF-IDF are common. Feature scaling is not required as the classifiers are not sensitive to the magnitude of data. Tuning involves choosing the right variant of Naïve Bayes and adjusting parameters like the smoothing factor in Multinomial and Bernoulli Naïve Bayes.

In summary, Naïve Bayes classifiers, with their basis in probability theory, offer a straightforward and efficient approach for building fast and scalable machine learning models. Their simplicity and the ability to make probabilistic predictions make them useful, especially in the initial stages of a modeling pipeline.


### Gaussian Naïve Bayes classifier

The Gaussian Naive Bayes classifier is a probabilistic machine learning model particularly suited for continuous data, assuming that the features of each class are normally distributed.

In [70]:
import wandb
import joblib
import numpy as np
import time
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
from memory_profiler import memory_usage

import os  # used below to create the output directory before saving

# Train and evaluate Gaussian Naive Bayes on every dataset split, logging
# per-split metrics to Weights & Biases and then the mean of each metric.
datasets = [(S1_X_train, S1_X_test, S1_y_train, S1_y_test),
            (S2_X_train, S2_X_test, S2_y_train, S2_y_test),
            (S3_X_train, S3_X_test, S3_y_train, S3_y_test),
            (S4_X_train, S4_X_test, S4_y_train, S4_y_test)]

aggregate_metrics = {'Precision': [], 'Recall': [], 'F1 Score': [], 'Memory (MB)': [], 'Execution Time (s)': []}

# Initialize Weights & Biases
# The run is opened as a context manager so that it is always closed, even if
# training or evaluation raises part-way through the sweep.
with wandb.init(project='seed_generation', notes='Gaussian Naive Bayes training', tags=['gaussian_nb', 'seed generation']):
    for X_train, X_test, y_train, y_test in datasets:
        # Convert y_train and y_test to 1D array
        y_train = y_train.squeeze()
        y_test = y_test.squeeze()

        # Memory is sampled outside the timed regions so that the profiler's
        # own overhead is not counted as model execution time.
        mem_usage_start = memory_usage(-1)
        fit_start = time.perf_counter()
        model = GaussianNB().fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start
        mem_usage_end = memory_usage(-1)
        # Absolute net change between the two memory samples taken around fit;
        # abs() keeps the value non-negative, as in the Extra Trees and
        # Decision Tree cells of this notebook.
        mem_usage = abs(mem_usage_end[0] - mem_usage_start[0])

        predict_start = time.perf_counter()
        predicted = model.predict(X_test)
        # Execution time = fit time + predict time (seconds).
        execution_time = fit_seconds + (time.perf_counter() - predict_start)

        # Metrics Calculation (the 4th return value, support, is not used)
        precision, recall, fscore = precision_recall_fscore_support(
            y_test, predicted, average='weighted')[:3]

        # Log metrics to wandb
        wandb.log({'Training score': model.score(X_train, y_train) * 100,
                   'Test score': model.score(X_test, y_test) * 100,
                   'Precision': precision,
                   'Recall': recall,
                   'F1 Score': fscore,
                   'Memory (MB)': mem_usage,
                   'Execution Time (s)': execution_time
                  })

        # Aggregating metrics
        aggregate_metrics['Precision'].append(precision)
        aggregate_metrics['Recall'].append(recall)
        aggregate_metrics['F1 Score'].append(fscore)
        aggregate_metrics['Memory (MB)'].append(mem_usage)
        aggregate_metrics['Execution Time (s)'].append(execution_time)

    # Final Aggregation: mean of each metric over all dataset splits
    for metric, values in aggregate_metrics.items():
        wandb.log({f'Aggregate {metric}': np.mean(values)})

# Save the last trained Gaussian Naive Bayes model
filename = 'model_weights/gaussian_nb_model.joblib'
# joblib.dump does not create missing parent directories.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)


VBox(children=(Label(value='0.001 MB of 0.025 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.047615…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

MemoryError: Unable to allocate 10.3 MiB for an array with shape (223979, 6) and data type float64

## Gradient Boosting Algorithms
Gradient Boosting Algorithms are a group of powerful machine learning techniques that build predictive models in the form of an ensemble of weak prediction models, typically decision trees. They are known for their effectiveness in handling various types of data and their ability to improve the accuracy of predictions by reducing bias and variance.

#### Key Concepts

1. **Sequential Model Building**: Unlike bagging ensembles such as Random Forest, whose trees are built independently of one another, gradient boosting builds one tree at a time, with each new tree fitted to correct the errors of the ensemble built so far.

2. **Loss Function Optimization**: These algorithms focus on minimizing a loss function iteratively. Each new model incrementally decreases the loss function of the entire system using the gradient descent method.

3. **Types of Gradient Boosting**:
   - **Gradient Boosting Machines (GBM)**: The traditional form of gradient boosting that sequentially adds predictors and corrects previous models.
   - **XGBoost (Extreme Gradient Boosting)**: An optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable.
   - **LightGBM**: A gradient boosting framework that uses tree-based learning algorithms, designed for distributed and efficient training, particularly on large datasets.
   - **CatBoost**: An algorithm that can handle categorical data naturally and is robust to overfitting, making it particularly effective for a wide range of data science problems.

4. **Applications**: Gradient boosting models are used for a wide range of applications, including but not limited to ranking (like search engines), classification, regression, and many other machine learning tasks where high accuracy is desired.

5. **Strengths and Limitations**:
   - **Strengths**: They are highly accurate, can handle different types of data, and provide feature importance scores, which can be insightful for model interpretation.
   - **Limitations**: These models can be prone to overfitting if not tuned properly and are computationally more expensive than simpler models. They also require careful tuning of parameters and aren't as easy to interpret as simpler models.

#### Practical Implementation

Implementing gradient boosting models involves careful tuning of parameters like the number of trees, depth of trees, learning rate, and subsample ratio. The choice and tuning of the loss function are also crucial, depending on the specific problem. Due to their complexity, gradient boosting models often require more computational resources and time to train, especially on large datasets.

In summary, Gradient Boosting Algorithms are highly effective for complex machine learning problems where predictive accuracy is paramount. Their ability to iteratively correct errors and optimize performance makes them a go-to choice for competitive data science and a wide range of business applications.


### LGBM classifier
LightGBM (Light Gradient Boosting Machine) is an efficient and scalable implementation of gradient boosting that excels in handling large datasets and high-dimensional features, due to its novel approach of building trees leaf-wise rather than level-wise.

In [31]:
from lightgbm import LGBMClassifier


import os  # used below to create the output directory before saving

# Train and evaluate LightGBM on every dataset split, logging per-split
# metrics to Weights & Biases and then the mean of each metric over the splits.
datasets = [(S1_X_train, S1_X_test, S1_y_train, S1_y_test),
            (S2_X_train, S2_X_test, S2_y_train, S2_y_test),
            (S3_X_train, S3_X_test, S3_y_train, S3_y_test),
            (S4_X_train, S4_X_test, S4_y_train, S4_y_test)]

aggregate_metrics = {'Precision': [], 'Recall': [], 'F1 Score': [], 'Memory (MB)': [], 'Execution Time (s)': []}

# Initialize Weights & Biases
# The run is opened as a context manager so that it is always closed, even if
# training or evaluation raises part-way through the sweep.
with wandb.init(project='seed_generation', notes='LightGBM training', tags=['lgbm', 'seed generation']):
    for X_train, X_test, y_train, y_test in datasets:
        # Convert y_train and y_test to 1D array
        y_train = y_train.squeeze()
        y_test = y_test.squeeze()

        # Memory is sampled outside the timed regions so that the profiler's
        # own overhead is not counted as model execution time.
        mem_usage_start = memory_usage(-1)
        fit_start = time.perf_counter()
        model = LGBMClassifier().fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start
        mem_usage_end = memory_usage(-1)
        # Absolute net change between the two memory samples taken around fit;
        # abs() keeps the value non-negative, as in the Extra Trees and
        # Decision Tree cells of this notebook.
        mem_usage = abs(mem_usage_end[0] - mem_usage_start[0])

        predict_start = time.perf_counter()
        predicted = model.predict(X_test)
        # Execution time = fit time + predict time (seconds).
        execution_time = fit_seconds + (time.perf_counter() - predict_start)

        # Metrics Calculation (the 4th return value, support, is not used)
        precision, recall, fscore = precision_recall_fscore_support(
            y_test, predicted, average='weighted')[:3]

        # Log metrics to wandb
        wandb.log({'Training score': model.score(X_train, y_train) * 100,
                   'Test score': model.score(X_test, y_test) * 100,
                   'Precision': precision,
                   'Recall': recall,
                   'F1 Score': fscore,
                   'Memory (MB)': mem_usage,
                   'Execution Time (s)': execution_time
                  })

        # Aggregating metrics
        aggregate_metrics['Precision'].append(precision)
        aggregate_metrics['Recall'].append(recall)
        aggregate_metrics['F1 Score'].append(fscore)
        aggregate_metrics['Memory (MB)'].append(mem_usage)
        aggregate_metrics['Execution Time (s)'].append(execution_time)

    # Final Aggregation: mean of each metric over all dataset splits
    for metric, values in aggregate_metrics.items():
        wandb.log({f'Aggregate {metric}': np.mean(values)})

# Save the last trained LightGBM model
filename = 'model_weights/lgbm_model.joblib'
# joblib.dump does not create missing parent directories.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)


OSError: exception: access violation writing 0x0000000000000000

### XGB Classifier
The XGBoost (Extreme Gradient Boosting) Classifier is a highly efficient and scalable implementation of gradient boosting known for its performance and speed, often delivering state-of-the-art results in a wide range of machine learning tasks.

In [None]:
from xgboost import XGBClassifier

import os  # used below to create the output directory before saving

# Train and evaluate XGBoost on every dataset split, logging per-split
# metrics to Weights & Biases and then the mean of each metric over the splits.
datasets = [(S1_X_train, S1_X_test, S1_y_train, S1_y_test),
            (S2_X_train, S2_X_test, S2_y_train, S2_y_test),
            (S3_X_train, S3_X_test, S3_y_train, S3_y_test),
            (S4_X_train, S4_X_test, S4_y_train, S4_y_test)]

aggregate_metrics = {'Precision': [], 'Recall': [], 'F1 Score': [], 'Memory (MB)': [], 'Execution Time (s)': []}

# Initialize Weights & Biases
# The run is opened as a context manager so that it is always closed, even if
# training or evaluation raises part-way through the sweep.
with wandb.init(project='seed_generation', notes='XGBoost training', tags=['xgboost', 'seed generation']):
    for X_train, X_test, y_train, y_test in datasets:
        # Convert y_train and y_test to 1D array
        y_train = y_train.squeeze()
        y_test = y_test.squeeze()

        # Memory is sampled outside the timed regions so that the profiler's
        # own overhead is not counted as model execution time.
        mem_usage_start = memory_usage(-1)
        fit_start = time.perf_counter()
        model = XGBClassifier().fit(X_train, y_train)
        fit_seconds = time.perf_counter() - fit_start
        mem_usage_end = memory_usage(-1)
        # Absolute net change between the two memory samples taken around fit;
        # abs() keeps the value non-negative, as in the Extra Trees and
        # Decision Tree cells of this notebook.
        mem_usage = abs(mem_usage_end[0] - mem_usage_start[0])

        predict_start = time.perf_counter()
        predicted = model.predict(X_test)
        # Execution time = fit time + predict time (seconds).
        execution_time = fit_seconds + (time.perf_counter() - predict_start)

        # Metrics Calculation (the 4th return value, support, is not used)
        precision, recall, fscore = precision_recall_fscore_support(
            y_test, predicted, average='weighted')[:3]

        # Log metrics to wandb
        wandb.log({'Training score': model.score(X_train, y_train) * 100,
                   'Test score': model.score(X_test, y_test) * 100,
                   'Precision': precision,
                   'Recall': recall,
                   'F1 Score': fscore,
                   'Memory (MB)': mem_usage,
                   'Execution Time (s)': execution_time
                  })

        # Aggregating metrics
        aggregate_metrics['Precision'].append(precision)
        aggregate_metrics['Recall'].append(recall)
        aggregate_metrics['F1 Score'].append(fscore)
        aggregate_metrics['Memory (MB)'].append(mem_usage)
        aggregate_metrics['Execution Time (s)'].append(execution_time)

    # Final Aggregation: mean of each metric over all dataset splits
    for metric, values in aggregate_metrics.items():
        wandb.log({f'Aggregate {metric}': np.mean(values)})

# Save the last trained XGBoost model
filename = 'model_weights/xgboost_model.joblib'
# joblib.dump does not create missing parent directories.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(model, filename)
