# Libraries

In [None]:
!pip install pycobra

Collecting pycobra
  Downloading pycobra-0.2.5-py3-none-any.whl (26 kB)
Installing collected packages: pycobra
Successfully installed pycobra-0.2.5


In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial 

from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier

from imblearn.under_sampling import NearMiss
from pycobra.classifiercobra import ClassifierCobra

# Undersampling Algorithms

## Type - 1: Selecting instances of the Majority Class to keep in the final Training Dataset

### 1. Near Miss Algorithm - v1

In [None]:
def near_miss_v1(X, y, majority_class, k = 3, ratio = 4.5, knn_algorithm = 'auto', knn_metric = 'euclidean'):
  """
  NearMiss-1 style undersampling: keep the majority samples closest to the minority class.

  Every majority sample is scored by the mean distance to its `k` nearest
  minority samples. The `int(ratio * n_minority)` majority samples with the
  smallest score are kept; the rest are dropped. Minority samples are always kept.

  Parameters
  ----------
  X : 2D array of shape (n_samples, n_features)
  y : 1D array of class labels aligned with the rows of X
  majority_class : label of the class to undersample
  k : number of nearest minority neighbours to average over (capped at the
      number of minority samples)
  ratio : desired (majority kept) / (minority) size ratio
  knn_algorithm, knn_metric : forwarded to sklearn's NearestNeighbors

  Returns
  -------
  Boolean mask of length n_samples; True marks a sample to keep.
  """
  X = np.asarray(X)
  y = np.asarray(y)
  n, _ = X.shape
  verdict = np.ones(n, dtype=bool)

  is_majority = (y == majority_class)
  majority_idx = np.flatnonzero(is_majority)
  X_minority = X[~is_majority]

  # Without minority samples there is nothing to measure distances against
  # (and the ratio below would divide by zero), so keep everything.
  if len(X_minority) == 0:
    return verdict

  majority_size = len(majority_idx)
  req_count = max(0, min(majority_size, int(ratio * len(X_minority))))
  print("[Testing]: Count of majority samples after undersampling vs Count of minority samples = ", req_count, "vs", len(X_minority), "or", req_count * 100/len(X_minority), "%")

  # Every majority sample survives: skip the neighbour search altogether.
  if req_count >= majority_size:
    return verdict

  # Step - 1: Find the k nearest minority neighbours of each majority sample.
  # k is capped because NearestNeighbors rejects n_neighbors > n_samples_fit.
  n_neighbors = min(k, len(X_minority))
  nearest_neighbors = NearestNeighbors(n_neighbors = n_neighbors, algorithm = knn_algorithm, metric = knn_metric).fit(X_minority)
  distances, _ = nearest_neighbors.kneighbors(X[majority_idx])

  # Step - 2: Compute the average distance
  mean_distance = np.mean(distances, axis = 1)

  # Step - 3: Keep the req_count closest majority samples. A stable sort keeps
  # the original sample order among ties.
  order = np.argsort(mean_distance, kind = 'stable')
  verdict[majority_idx[order[req_count:]]] = False

  return verdict

### 2. Near Miss Algorithm - v2

In [None]:
def near_miss_v2(X, y, majority_class, k = 3, ratio = 4.5, knn_algorithm = 'auto', knn_metric = 'euclidean'):
  """
  NearMiss-2 style undersampling: keep the majority samples closest to the
  *farthest* minority samples.

  Every majority sample is scored by the mean Euclidean distance to its `k`
  farthest minority samples. The `int(ratio * n_minority)` majority samples
  with the smallest score are kept; the rest are dropped. Minority samples
  are always kept.

  Parameters
  ----------
  X : 2D array of shape (n_samples, n_features)
  y : 1D array of class labels aligned with the rows of X
  majority_class : label of the class to undersample
  k : number of farthest minority samples to average over (capped at the
      number of minority samples)
  ratio : desired (majority kept) / (minority) size ratio
  knn_algorithm, knn_metric : accepted for signature parity with the other
      undersamplers but not used here; distances are always Euclidean.

  Returns
  -------
  Boolean mask of length n_samples; True marks a sample to keep.
  """
  X = np.asarray(X)
  y = np.asarray(y)
  n, _ = X.shape
  verdict = np.ones(n, dtype=bool)

  is_majority = (y == majority_class)
  X_minority = X[~is_majority]

  # Without minority samples there is nothing to measure distances against
  # (and the ratio below would divide by zero), so keep everything.
  if len(X_minority) == 0:
    return verdict

  # Cap k so every majority sample yields the same number of distances.
  n_farthest = min(k, len(X_minority))

  # Step - 1 & 2: Mean distance to the k farthest minority samples, per majority sample
  majority_idx, scores = [], []
  for idx in range(n):
    if idx % 500 == 0:
      print("[Executing]: Processing data point - {} while undersampling...".format(idx + 1))

    if not is_majority[idx]:
      continue

    # Distances from this majority sample to all minority samples at once
    gaps = np.linalg.norm(X_minority - X[idx], axis = 1)
    farthest = np.sort(gaps)[::-1][:n_farthest]

    majority_idx.append(idx)
    scores.append(farthest.mean())

  majority_idx = np.asarray(majority_idx, dtype=int)
  scores = np.asarray(scores, dtype=float)

  # Step - 3: Select the points from majority class depending on the 'ratio' value
  majority_size = len(majority_idx)
  req_count = max(0, min(majority_size, int(ratio * len(X_minority))))

  print("[Testing]: Count of majority samples after undersampling vs Count of minority samples = ", req_count, "vs", len(X_minority), "or", req_count * 100/len(X_minority), "%")

  # Stable sort keeps the original sample order among ties.
  order = np.argsort(scores, kind = 'stable')
  verdict[majority_idx[order[req_count:]]] = False

  return verdict

### 3. Near Miss Algorithm - v3

In [None]:
def near_miss_v3(X, y, majority_class, ratio = 3.5, knn_algorithm = 'auto', knn_metric = 'euclidean'):
  """
  Distance-based undersampling that scores majority samples against *all*
  minority samples.

  Every majority sample is scored by its mean Euclidean distance to every
  minority sample (there is no `k` parameter). The `int(ratio * n_minority)`
  majority samples with the smallest score are kept; the rest are dropped.
  Minority samples are always kept.

  Parameters
  ----------
  X : 2D array of shape (n_samples, n_features)
  y : 1D array of class labels aligned with the rows of X
  majority_class : label of the class to undersample
  ratio : desired (majority kept) / (minority) size ratio
  knn_algorithm, knn_metric : accepted for signature parity with the other
      undersamplers but not used here; distances are always Euclidean.

  Returns
  -------
  Boolean mask of length n_samples; True marks a sample to keep.
  """
  X = np.asarray(X)
  y = np.asarray(y)
  n, _ = X.shape
  verdict = np.ones(n, dtype=bool)

  is_majority = (y == majority_class)
  X_minority = X[~is_majority]

  # Without minority samples there is nothing to measure distances against
  # (and the ratio below would divide by zero), so keep everything.
  if len(X_minority) == 0:
    return verdict

  # Step - 1 & 2: Mean distance to all minority samples, per majority sample.
  # No sorting is needed: a mean does not depend on the order of its terms.
  majority_idx, scores = [], []
  for idx in range(n):
    if idx % 500 == 0:
      print("[Executing]: Processing data point - {} while undersampling...".format(idx + 1))

    if not is_majority[idx]:
      continue

    gaps = np.linalg.norm(X_minority - X[idx], axis = 1)
    majority_idx.append(idx)
    scores.append(gaps.mean())

  majority_idx = np.asarray(majority_idx, dtype=int)
  scores = np.asarray(scores, dtype=float)

  # Step - 3: Select the points from majority class depending on the 'ratio' value
  majority_size = len(majority_idx)
  req_count = max(0, min(majority_size, int(ratio * len(X_minority))))

  print("[Testing]: Count of majority samples after undersampling vs Count of minority samples = ", req_count, "vs", len(X_minority), "or", req_count * 100/len(X_minority), "%")

  # Stable sort keeps the original sample order among ties.
  order = np.argsort(scores, kind = 'stable')
  verdict[majority_idx[order[req_count:]]] = False

  return verdict

### 4. Condensed KNN

In [None]:
def condensed_knn(X, y, majority_class, knn_algorithm = 'auto', knn_metric = 'euclidean'):
  """
  Condensed nearest-neighbour undersampling.

  The "store" starts with every minority sample. Majority samples are scanned
  repeatedly; a majority sample whose single nearest neighbour in the current
  store is a minority sample (i.e. it would be misclassified by 1-NN) is moved
  into the store. Scanning stops after a full pass that moves nothing. The
  returned mask keeps exactly the samples that ended up in the store.

  Parameters
  ----------
  X : 2D numpy array of shape (n_samples, n_features)
  y : 1D array of class labels aligned with the rows of X
  majority_class : label of the class to undersample
  knn_algorithm, knn_metric : forwarded to sklearn's NearestNeighbors

  Returns
  -------
  Boolean mask of length n_samples; True marks a sample to keep.
  """
  n, _ = X.shape
  verdict = np.zeros(n, dtype=bool)

  # Step - 1: Seed the store with the minority class; majority samples wait in the grab-bag
  store, store_y, store_idx = [], [], []
  grab_idx = []
  for idx in range(n):
    if y[idx] != majority_class:
      store.append(X[idx])
      store_y.append(y[idx])
      store_idx.append(idx)
    else:
      grab_idx.append(idx)

  # With an empty store there is no neighbour to compare against, so no
  # condensation is possible: keep every sample.
  if not store:
    verdict[:] = True
    return verdict

  # Step - 2: Absorb misclassified majority samples until a pass changes nothing
  moved = True
  while moved and grab_idx:
    moved = False
    remaining = []
    nearest_neighbors = None   # refit lazily, only after the store has grown

    for idx in grab_idx:
      if nearest_neighbors is None:
        nearest_neighbors = NearestNeighbors(n_neighbors = 1, algorithm = knn_algorithm, metric = knn_metric).fit(store)
      _, nearest_neighbors_idx = nearest_neighbors.kneighbors(X[idx].reshape(1, -1))

      if store_y[nearest_neighbors_idx[0][0]] != majority_class:
        store.append(X[idx])
        store_y.append(y[idx])
        store_idx.append(idx)
        nearest_neighbors = None
        moved = True
      else:
        remaining.append(idx)

    # Absorbed samples leave the grab-bag so they are not scanned again.
    grab_idx = remaining

  verdict[store_idx] = True

  return verdict

## Type - 2: Selecting instances of the Majority Class to delete from the final Training Dataset

### 1. KNN Und

In [None]:
def knn_und(X, y, majority_class, k = 50, t = 2, knn_algorithm = 'auto', knn_metric = 'euclidean'):
  """
  KNN-Und undersampling: drop majority samples that sit among minority samples.

  The k nearest neighbours of every sample are looked up in the full dataset
  (the query set is the fitted set). A majority sample is dropped when at
  least `t` of those neighbours do not belong to the majority class.
  Minority samples are never dropped.

  Returns a boolean mask of length n_samples; True marks a sample to keep.
  """
  sample_count, _ = X.shape
  keep = np.ones(sample_count, dtype=bool)

  # Neighbour indices only; the distances themselves are not needed.
  knn = NearestNeighbors(n_neighbors = k, algorithm = knn_algorithm, metric = knn_metric)
  neighbor_idx = knn.fit(X).kneighbors(X, return_distance = False)

  for row, nbr_rows in enumerate(neighbor_idx):
    # Only majority class samples are candidates for removal
    if y[row] != majority_class:
      continue

    # Count the neighbours that belong to a non-majority class
    minority_neighbors = sum(1 for label in y[nbr_rows] if label != majority_class)

    # Enough minority neighbours => remove the sample
    if minority_neighbors >= t:
      keep[row] = False

  return keep

### 2. ENN

In [None]:
def enn(X, y, majority_class, k = 5, knn_algorithm = 'auto', knn_metric = 'euclidean'):
  """
  Edited-nearest-neighbours style cleaning of the majority class.

  The k nearest neighbours of every sample are looked up in the full dataset
  (the query set is the fitted set). Two rules are then applied, based on the
  share of majority-class samples among those neighbours:

  1. A majority sample is dropped when fewer than 60% of its neighbours
     belong to the majority class.
  2. For a minority sample with more than 60% majority neighbours, those
     majority neighbours are dropped.

  Neighbours are counted by class membership rather than by summing raw label
  values, so the rule does not depend on how the labels are encoded, and the
  share is taken over the actual number of neighbours.

  Parameters
  ----------
  X : 2D array of shape (n_samples, n_features)
  y : 1D array of class labels aligned with the rows of X
  majority_class : label of the class to undersample
  k : number of neighbours (capped at n_samples)
  knn_algorithm, knn_metric : forwarded to sklearn's NearestNeighbors

  Returns
  -------
  Boolean mask of length n_samples; True marks a sample to keep.
  """
  y = np.asarray(y)
  n, _ = X.shape
  verdict = np.ones(n, dtype=bool)
  if n == 0:
    return verdict

  majority_share_threshold = 0.6

  # Step - 1: Find k nearest neighbors of all the data points
  n_neighbors = min(k, n)
  nearest_neighbors = NearestNeighbors(n_neighbors = n_neighbors, algorithm = knn_algorithm, metric = knn_metric).fit(X)
  _, nearest_neighbors_idx = nearest_neighbors.kneighbors(X)

  is_majority = (y == majority_class)
  majority_share = is_majority[nearest_neighbors_idx].mean(axis = 1)

  # Step - 2: Drop majority samples surrounded by too few majority neighbours
  verdict[is_majority & (majority_share < majority_share_threshold)] = False

  # Step - 3: Around minority samples dominated by the majority class, drop
  # the majority neighbours
  crowded_minority = np.flatnonzero(~is_majority & (majority_share > majority_share_threshold))
  for i in crowded_minority:
    neighbor_rows = nearest_neighbors_idx[i]
    verdict[neighbor_rows[is_majority[neighbor_rows]]] = False

  return verdict

### 3. Tomek Links

In [None]:
def tomek_link(X, y, majority_class, knn_algorithm = 'auto', knn_metric = 'euclidean'):
  """
  Tomek-link undersampling: drop the majority member of every Tomek link.

  Two samples form a Tomek link when they are each other's nearest neighbour
  and carry different class labels. For each such pair the majority-class
  sample is dropped; minority samples are never dropped.

  Parameters
  ----------
  X : 2D array of shape (n_samples, n_features)
  y : 1D array of class labels aligned with the rows of X
  majority_class : label of the class to undersample
  knn_algorithm, knn_metric : forwarded to sklearn's NearestNeighbors

  Returns
  -------
  Boolean mask of length n_samples; True marks a sample to keep.
  """
  y = np.asarray(y)
  n, _ = X.shape
  verdict = np.ones(n, dtype=bool)

  # A link needs two samples.
  if n < 2:
    return verdict

  # Step - 1: Find the nearest *other* sample of every data point.
  # Calling kneighbors() without a query set excludes each point from its own
  # neighbour list; querying with X would return the point itself (distance 0).
  nearest_neighbors = NearestNeighbors(n_neighbors = 1, algorithm = knn_algorithm, metric = knn_metric).fit(X)
  _, nearest_neighbors_idx = nearest_neighbors.kneighbors()
  nearest = nearest_neighbors_idx[:, 0]

  # Step - 2: A pair is a link only if the relation is mutual
  is_majority = (y == majority_class)
  mutual = nearest[nearest] == np.arange(n)

  # Step - 3: Remove majority samples whose link partner is not a majority sample
  verdict[is_majority & ~is_majority[nearest] & mutual] = False

  return verdict

# Models

## Parent Model

In [None]:
def execute_model(X, y, num_splits, seed, model, with_undersampling = False, majority_class = 0, undersampling_method = knn_und):
  """
  Cross-validate `model` with stratified K-fold splits and print the averaged metrics.

  Parameters
  ----------
  X, y : numpy arrays with the features and the class labels
  num_splits : number of stratified folds
  seed : random state used to shuffle the folds
  model : callable `model(X_train, y_train, X_test) -> y_pred`
  with_undersampling : when truthy, the training fold is undersampled first
  majority_class : label handed to the undersampling method
  undersampling_method : callable `f(X_train, y_train, majority_class)` that
      returns a boolean keep-mask over the training rows

  Prints accuracy and macro-averaged precision / recall averaged over the
  folds, plus an F1 score derived from those averaged precision and recall
  values. Returns nothing.
  """
  K_folds = StratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
  metrics_list = []

  for iteration, (train_idx, test_idx) in enumerate(K_folds.split(X, y), start = 1):
    print("\n****************  Executing iteration - {} of KFold Data split  ****************".format(iteration))

    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Feature Scaling: the scaler is fit on the training fold only, so no
    # statistics of the held-out fold leak into training.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Execute Undersampling on training data
    if with_undersampling:
      print("[Testing]: Count of training data before Undersampling = ", X_train.shape[0])
      verdict = undersampling_method(X_train, y_train, majority_class)

      X_train = X_train[verdict, :]
      y_train = y_train[verdict]

      # In-built near miss algorithm
      # nr = NearMiss()
      # X_train, y_train = nr.fit_sample(X_train, y_train)

      # Note: Be careful while plotting, make sure same features are being compared
      # plt.scatter(X_train[:, 0], X_train[:, 1], marker = '.', c = y_train)
      # plt.show()

      print("[Testing]: Count of training data after Undersampling = ", X_train.shape[0])

    # Model Fitting & Predictions on test dataset
    y_pred = model(X_train, y_train, X_test)

    # Evaluation Metrics
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision, recall, F1_score, _ = metrics.precision_recall_fscore_support(y_test, y_pred, beta = 1.0, average = 'macro')
    metrics_list.append([accuracy, precision, recall, F1_score])

  metrics_list = np.mean(metrics_list, axis = 0)

  print("\n---------------  Cross-validated Evaluation Metrics  ---------------\n")
  print("Accuracy \t= \t", metrics_list[0])
  print("Precision \t= \t", metrics_list[1])
  print("Recall \t\t= \t", metrics_list[2])
  # Reported F1 is the harmonic mean of the averaged precision and recall
  # (the per-fold F1 in metrics_list[3] is collected but not printed).
  print("F1 score \t= \t", 2 * metrics_list[1] * metrics_list[2] / (metrics_list[1] + metrics_list[2]))

## 1. Logistic Regression

In [None]:
def logistic_regression(X_train, y_train, X_test):
  """
  Train a multinomial logistic regression (lbfgs solver, random_state 0) on
  the training data and return its predictions for X_test.
  """
  print("[Executing]: Running Logistic Regression model ...\n")

  classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state = 0)

  # Fit on the training fold, then predict the held-out fold
  return classifier.fit(X_train, y_train).predict(X_test)

## 2. AdaBoost

In [None]:
# Decision stump used as weak classifier
class DecisionStump:
  """
  One-feature threshold classifier producing labels in {-1, +1}.

  Attributes
  ----------
  polarity : 1 predicts -1 below the threshold and +1 otherwise; any other
      value predicts the exact opposite
  feature_idx : column of X the stump looks at
  threshold : split value on that column
  alpha : weight of this stump in the boosted ensemble
  """
  def __init__(self):
    self.polarity = 1
    self.feature_idx = None
    self.threshold = None
    self.alpha = None

  def predict(self, X):
    """Return an array of -1.0 / +1.0 predictions, one per row of X."""
    X_column = X[:, self.feature_idx]
    predictions = np.where(X_column < self.threshold, -1.0, 1.0)

    # A flipped stump is the exact negation of the polarity-1 stump. This is
    # what the `1 - error` flip during training assumes, so samples lying
    # exactly on the threshold are flipped as well.
    if self.polarity != 1:
      predictions = -predictions

    return predictions


class Adaboost:
  """
  AdaBoost ensemble of decision stumps.

  Stump outputs are -1 / +1 and are compared and multiplied with `y` during
  fitting, so `y` is expected to be encoded as -1 / +1.
  """
  def __init__(self, n_clf=5):
    self.n_clf = n_clf
    self.clfs = []

  def fit(self, X, y):
    """Fit `n_clf` stumps one after another, re-weighting the samples after each."""
    sample_count, feature_count = X.shape
    EPS = 1e-10

    # Every sample starts with the same weight 1/N
    weights = np.full(sample_count, (1 / sample_count))

    self.clfs = []

    for _ in range(self.n_clf):
      stump = DecisionStump()
      best_error = float("inf")

      # Exhaustive search over (feature, threshold) pairs
      for col in range(feature_count):
        column = X[:, col]

        for cut in np.unique(column):
          # Polarity-1 guess: -1 below the cut, +1 otherwise
          guess = np.ones(sample_count)
          guess[column < cut] = -1

          # Weighted error = total weight of the misclassified samples
          err = sum(weights[y != guess])

          # Worse than chance: flip the stump instead
          polarity = 1
          if err > 0.5:
            err = 1 - err
            polarity = -1

          # Remember the best configuration seen so far
          if err < best_error:
            best_error = err
            stump.polarity = polarity
            stump.threshold = cut
            stump.feature_idx = col

      # Stump weight; EPS keeps the ratio finite when the error is 0 or 1
      stump.alpha = 0.5 * np.log((1.0 - best_error + EPS) / (best_error + EPS))

      # Scale the sample weights by exp(-alpha * y * prediction),
      # then renormalise them to sum to one
      stump_output = stump.predict(X)
      weights *= np.exp(-stump.alpha * y * stump_output)
      weights /= np.sum(weights)

      self.clfs.append(stump)


  def predict(self, X):
    """Return the sign of the alpha-weighted sum of the stump predictions."""
    weighted_votes = [clf.alpha * clf.predict(X) for clf in self.clfs]
    return np.sign(np.sum(weighted_votes, axis=0))

In [None]:
def adaboost_classifier(X_train, y_train, X_test):
  """
  Train scikit-learn's AdaBoostClassifier (default settings) on the training
  data and return its predictions for X_test.
  """
  print("[Executing]: Running Adaboost ...\n")

  # The from-scratch alternative would be: Adaboost()
  booster = AdaBoostClassifier()

  # Fit on the training fold, then predict the held-out fold
  return booster.fit(X_train, y_train).predict(X_test)

## 3. COBRA

**References**


1.   https://github.com/bhargavvader/pycobra
2.   https://github.com/bhargavvader/personal/tree/master/notebooks/pycobra



### a. In-built

In [None]:
def cobra_classifier(X_train, y_train, X_test):
  """
  Train pycobra's ClassifierCobra with the 'basic' machine list on the
  training data and return its predictions for X_test.
  """
  print("[Executing]: Running Cobra Model ...\n")

  # Available machine lists:
  #   advanced = ['tree', 'knn', 'svm', 'logreg', 'naive_bayes', 'lda', 'neural_network']
  #   basic    = ['sgd', 'tree', 'knn', 'svm']
  cobra = ClassifierCobra(machine_list = 'basic')

  # Model fitting
  cobra.fit(X_train, y_train)

  # Predictions on test dataset
  return cobra.predict(X_test)

### b. Scratch

In [None]:
from sklearn.utils import shuffle

from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn import neighbors, tree, svm
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import GaussianNB

import numpy as np


class CobraClassifier:
    """
    From-scratch COBRA-style aggregation classifier for numeric 0/1 labels.

    The training data is split into D_k, on which the base machines are
    fitted, and D_l, on which their predictions are recorded. A query sample
    is labelled by averaging the labels of those D_l points on which at least
    `alpha` machines predict (within `epsilon`) the same label as they do for
    the query, and comparing that average against `threshold`.

    Parameters
    ----------
    seed : random state for the D_k / D_l shuffle and for the machines that accept one
    epsilon : tolerance when comparing two machine predictions
    threshold : minimum averaged label for predicting class 1
    machines : list of machine names; see `setup_machines` for the known names
    """
    def __init__(self, seed=42, epsilon=0.001, threshold=0.5, machines=None):
        self.seed = seed
        self.epsilon = epsilon
        self.threshold = threshold
        self.machines = machines


    def fit(self, X, y, X_k=None, y_k=None, X_l=None, y_l=None, flag=False):
        """
        Fit the base machines and record their predictions on D_l.

        When `flag` is truthy the caller-supplied (X_k, y_k) / (X_l, y_l)
        split is used as is; otherwise the split is generated from (X, y).
        """
        self.X, self.y = X, y
        self.X_k, self.y_k = X_k, y_k
        self.X_l, self.y_l = X_l, y_l

        self.machine_estimators = {}

        if not flag:
            self.generate_training_data()

        # Train machine estimators on the training data (D_k)
        self.setup_machines()
        self.execute_machines()

        return self


    def _consensus_label(self, query_labels, alpha):
        """
        Aggregate one query sample.

        `query_labels` maps machine name -> that machine's predicted label for
        the query. Returns 0 or 1.
        """
        if not query_labels:
            return 0

        # votes[j] = number of machines whose prediction on D_l[j] matches
        # (within epsilon) their prediction on the query sample
        votes = np.zeros(len(self.X_l), dtype=int)
        for m, label in query_labels.items():
            votes += np.abs(np.asarray(self.machine_predictions[m]) - label) <= self.epsilon

        selected = votes >= alpha
        selected_count = int(selected.sum())
        if selected_count == 0:
            # No D_l point reaches the required agreement: fall back to class 0
            return 0

        score = np.asarray(self.y_l)[selected].sum() / selected_count
        return 1 if score >= self.threshold else 0


    def predict_helper(self, X, alpha):
        """Predict the label (0 or 1) of a single sample given as a (1, n_features) array."""
        query_labels = {m: self.machine_estimators[m].predict(X)[0] for m in self.machines}
        return self._consensus_label(query_labels, alpha)


    def predict(self, X, alpha=None):
        """
        Predict labels for every row of X.

        `alpha` is the number of machines that must agree for a D_l point to
        take part in the vote; it defaults to all machines.
        """
        n = len(X)
        predicted_labels = np.zeros(n)
        if n == 0:
            return predicted_labels

        if alpha is None:
            alpha = len(self.machines)

        # Each machine predicts the whole batch once instead of once per sample
        batch_labels = {m: np.asarray(self.machine_estimators[m].predict(X)) for m in self.machines}

        for i in range(n):
            sample_labels = {m: labels[i] for m, labels in batch_labels.items()}
            predicted_labels[i] = self._consensus_label(sample_labels, alpha)

        return predicted_labels


    @staticmethod
    def set_epsilon():
        """Placeholder; epsilon tuning is not implemented."""
        pass


    def setup_machines(self):
        """Instantiate and fit every known machine named in `self.machines` on D_k."""
        def make_random_forest():
            # Imported here because the surrounding imports do not provide it
            from sklearn.ensemble import RandomForestClassifier
            return RandomForestClassifier(random_state=self.seed)

        factories = {
            'knn': neighbors.KNeighborsClassifier,
            'random_forest': make_random_forest,
            'logistic_regression': lambda: LogisticRegression(random_state=self.seed),
            'svm': svm.SVC,
            'decision_trees': tree.DecisionTreeClassifier,
            'naive_bayes': GaussianNB,
            'stochastic_gradient_decision': SGDClassifier,
            'ridge': RidgeClassifier,
        }

        for m in self.machines:
            factory = factories.get(m)
            # Names without a factory are skipped
            if factory is not None:
                self.machine_estimators[m] = factory().fit(self.X_k, self.y_k)

        return self


    def execute_machines(self):
        """Record every machine's predictions on D_l."""
        self.machine_predictions = {}

        for m in self.machines:
            self.machine_predictions[m] = self.machine_estimators[m].predict(self.X_l)

        return self


    def generate_training_data(self, k=None, l=None):
        """
        Split the data into D_k (machine training) and D_l (aggregation).

        The rows are shuffled with `self.seed` first, so the split does not
        depend on the row order of the input (e.g. data sorted by class).
        Unless both `k` and `l` are given, D_k receives the first 3/4 of the
        shuffled rows. D_l always receives every row after the first `k`;
        `l` itself is not used to size it.
        """
        n = len(self.X)
        if k is None or l is None:
            k = int(3*n/4)

        order = np.random.RandomState(self.seed).permutation(n)
        X, y = np.asarray(self.X), np.asarray(self.y)

        self.X_k, self.y_k = X[order[:k]], y[order[:k]]
        self.X_l, self.y_l = X[order[k:]], y[order[k:]]

        return self

In [None]:
def cobra_classifier_scratch(X_train, y_train, X_test):
  """
  Train the from-scratch CobraClassifier (knn, logistic regression, svm,
  naive bayes and ridge machines) and return its predictions for X_test.
  """
  print("[Executing]: Running Cobra Model ...\n")

  base_machines = ['knn', 'logistic_regression', 'svm', 'naive_bayes', 'ridge']
  cobra = CobraClassifier(machines = base_machines)

  # Model fitting
  cobra.fit(X_train, y_train)

  # Predictions on test dataset
  return cobra.predict(X_test)

# Dataset

In [None]:
def prepare_data(seed, choice=1):
  """
  Load one of the datasets as a binary classification problem.

  Parameters
  ----------
  seed : random state for the synthetic dataset (only used when choice == 1)
  choice : dataset selector
      1 - synthetic data from sklearn's make_classification
      2 - ./winequality-red.csv   (positive class: quality > 7)
      3 - ./winequality-white.csv (positive class: quality > 7)
      4 - ./car.data              (positive class: column 6 == 'vgood')
      5 - ./ecoli.csv             (positive class: SITE == 'imU')
      6 - ./abalone.data          (positive class: column 8 >= 20)
      7 - ./nursery.data          (positive class: column 8 == 'very_recom')

  Returns
  -------
  X : float feature matrix
  y : 0/1 integer label vector
  majority_class_label : 1 when more than half of the labels are 1, else 0

  Raises
  ------
  ValueError : when `choice` is not one of the values listed above
  """
  if choice == 1:
    # Dataset - 1
    # Ref - https://scikit-learn.org/stable/auto_examples/datasets/plot_random_dataset.html#sphx-glr-auto-examples-datasets-plot-random-dataset-py
    X, y = datasets.make_classification(n_samples = 1000, n_classes = 2, weights = [0.2, 0.8], class_sep = 0.9,
                                  n_features = 5, n_redundant = 1, n_informative = 3, n_clusters_per_class = 1, random_state = seed)

  elif choice in (2, 3):
    # Dataset - 2 / Dataset - 3
    path = './winequality-red.csv' if choice == 2 else './winequality-white.csv'
    dataset = pd.read_csv(path, sep=';')

    y = dataset['quality'] > 7
    # 'quality' defines the target, so it must not stay among the features.
    X = dataset.drop(columns=['quality'])

  elif choice == 4:
    # Dataset - 4
    # Ref - https://archive.ics.uci.edu/ml/datasets/car+evaluation
    dataset = pd.read_csv('./car.data', header=None, sep=',')

    # Pre-processing: ordinal encoding; values missing from a mapping take
    # the fallback given to fillna.
    price_levels = {'vhigh': 1, 'high': 2, 'med': 3}
    X = pd.DataFrame({
      0: dataset[0].map(price_levels).fillna(4),
      1: dataset[1].map(price_levels).fillna(4),
      2: pd.to_numeric(dataset[2].replace('5more', '5')),
      3: pd.to_numeric(dataset[3].replace('more', '5')),
      4: dataset[4].map({'small': 1, 'med': 2}).fillna(3),
      5: dataset[5].map({'low': 1, 'med': 2}).fillna(3),
    })
    y = dataset[6] == 'vgood'

  elif choice == 5:
    # Dataset - 5
    # Ref - https://archive.ics.uci.edu/ml/datasets/ecoli, https://www.kaggle.com/kannanaikkal/ecoli-uci-dataset
    dataset = pd.read_csv('./ecoli.csv').drop(columns=['SEQUENCE_NAME'])

    y = dataset['SITE'] == 'imU'
    X = dataset.drop(columns=['SITE'])

  elif choice == 6:
    # Dataset - 6
    # Ref - https://archive.ics.uci.edu/ml/datasets/abalone
    dataset = pd.read_csv('./abalone.data', header=None, sep=',')

    y = dataset[8] >= 20
    X = dataset.drop(columns=[8])
    X[0] = X[0].map({'M': 0, 'F': 1}).fillna(2)

  elif choice == 7:
    # Dataset - 7
    # Ref - https://archive.ics.uci.edu/ml/datasets/nursery
    dataset = pd.read_csv('./nursery.data', header=None, sep=',')

    y = dataset[8] == 'very_recom'
    # One-hot encode the eight categorical attributes
    X = pd.get_dummies(dataset[list(range(8))])

  else:
    raise ValueError("No dataset available for choice = {}".format(choice))

  X = np.asarray(X, dtype=float)
  y = np.asarray(y).astype(int)

  # plt.scatter(X[:, 0], X[:, 1], marker = '.', c = y)
  # plt.show()

  majority_class_label = int(y.sum() > 0.5 * len(y))
  print("[Testing]: Majority class = ", majority_class_label, "\tSum of Target variable = ", y.sum(), "\tLength of Target variable = ", len(y))

  return X, y, majority_class_label

# Driver Code

In [None]:
# Cross-validation settings, then dataset 6
num_splits, seed = 2, 32
X, y, majority_class_label = prepare_data(seed, choice=6)

# Disabled experiment: every model with and without undersampling.
# models = [cobra_classifier_scratch]

# for m in models:
#   print("\n\n#############################  MODEL -", m.__name__, "  #############################")
#   print("\n=======================  Executing without undersampling  =======================")
#   execute_model(X, y, num_splits, seed, m)

#   print("\n\n=======================  Executing with undersampling  =======================")
#   execute_model(X, y, num_splits, seed, m, with_undersampling = True, majority_class = majority_class_label, undersampling_method = near_miss_v1)
  

[Testing]: Majority class =  0 	Sum of Target variable =  62 	Length of Target variable =  4177


In [None]:
# Cross-validation settings, then dataset 5
num_splits, seed = 2, 32
X, y, majority_class_label = prepare_data(seed, choice=5)

# Baseline run: every model is cross-validated without undersampling
models = [
  logistic_regression,
  adaboost_classifier,
  cobra_classifier_scratch,
]

for m in models:
  print("\n\n#############################  MODEL -", m.__name__, "  #############################")
  execute_model(X, y, num_splits, seed, m)

[Testing]: Majority class =  0 	Sum of Target variable =  35 	Length of Target variable =  336


#############################  MODEL - logistic_regression   #############################

****************  Executing iteration - 1 of KFold Data split  ****************
[Executing]: Running Logistic Regression model ...


****************  Executing iteration - 2 of KFold Data split  ****************
[Executing]: Running Logistic Regression model ...


---------------  Cross-validated Evaluation Metrics  ---------------

Accuracy 	= 	 0.9226190476190477
Precision 	= 	 0.8058383609854198
Recall 		= 	 0.7415800978227935
F1 score 	= 	 0.7723750316758925


#############################  MODEL - adaboost_classifier   #############################

****************  Executing iteration - 1 of KFold Data split  ****************
[Executing]: Running Adaboost ...


****************  Executing iteration - 2 of KFold Data split  ****************
[Executing]: Running Adaboost ...


---------------  

  _warn_prf(average, modifier, msg_start, len(result))
