In [1]:
# !sudo apt-get install build-essential swig
# !curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
# !pip install auto-sklearn
# !sudo apt-get install python3.10

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from autosklearn import classification
import pickle
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import os.path as p
from glob import glob

In [3]:
# Mount Google Drive at /content/drive; the model, data and output paths
# used in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Directory on the mounted Drive where fitted classifiers are pickled to
# and later loaded from (one `<data_title>.pkl` file per dataset).
model_path = '/content/drive/My Drive/Colab Notebooks/Models'
# Fraction of each dataset that train_test_split holds out as the test set.
TEST_SIZE = 0.05

def predict(X, y, data_title, test_size=None):
  """Fit an auto-sklearn classifier on the training split and pickle it.

  X and y are split with a fixed ``random_state`` so that the same held-out
  rows can be recreated later for evaluation. Only the training part is
  used here. The fitted classifier is written to
  ``{model_path}/{data_title}.pkl``.

  Args:
      X (numpy or Dataframe): Independent dataset.
      y (numpy or Dataframe): Dependent dataset.
      data_title (str): Dataset name, used as the stem of the pickle file.
      test_size (float, optional): Ratio of test set against the whole
          dataset. Defaults to the module-level ``TEST_SIZE``.

  Returns:
      The fitted ``AutoSklearnClassifier`` (the same object that is saved).
  """
  # Resolve the default at call time so a changed TEST_SIZE is picked up.
  if test_size is None:
    test_size = TEST_SIZE
  # The test part is not needed for fitting; it is discarded here.
  X_train, _, y_train, _ = train_test_split(X, y, random_state=1, test_size=test_size)
  classifier = classification.AutoSklearnClassifier(
    time_left_for_this_task=1500,  # overall search budget (seconds per auto-sklearn docs)
    per_run_time_limit=30,         # budget for a single model fit (seconds per auto-sklearn docs)
    memory_limit=300000            # same value as the former 3e5, as the int the API documents
  )
  # Fit on copies so the caller's objects are never touched by the search.
  classifier.fit(X_train.copy(), y_train.copy())
  # save model
  with open(f'{model_path}/{data_title}.pkl', 'wb') as f:
    pickle.dump(classifier, f)
  return classifier

def save_confusion_matrix(data, labels, output_filename):
  """Plot a confusion matrix as an annotated heatmap and save it to a file.

  Args:
      data (list of list): List of lists with confusion matrix data.
      labels (list): Labels which will be plotted across x and y axis.
          They must be in the same order as the rows/columns of ``data``.
      output_filename (str): Path to output file.

  """
  # Apply the seaborn theme (including the font scale) before the figure is
  # created, so the title, tick labels and annotations all use the same
  # scaled fonts. color_codes=True matches the previous two-call setup.
  sb.set(color_codes=True, font_scale=1.4)
  fig, ax = plt.subplots(figsize=(9, 6))
  try:
    sb.heatmap(data, annot=True, cmap="YlGnBu", cbar_kws={'label': 'Scale'}, ax=ax)
    ax.set_title("Confusion Matrix")

    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    ax.set(ylabel="True Label", xlabel="Predicted Label")

    fig.savefig(output_filename, bbox_inches='tight', dpi=300)
  finally:
    # Always release the figure, even if plotting or saving failed;
    # this function is called once per dataset in a loop.
    plt.close(fig)

In [8]:
data_path = "/content/drive/My Drive/Colab Notebooks/Data/Engineered/*"
conf_path = "/content/drive/My Drive/Colab Notebooks/Confusion_matrix"
# sorted() gives a deterministic processing order; glob's order is arbitrary.
for data in sorted(glob(data_path)):
  # File name up to its first dot; this is the stem the model pickle was
  # saved under, so the expression must stay in sync with predict().
  data_title = p.basename(data).split('.')[0]
  # Read the cleaned data
  df = pd.read_parquet(data)
  y = df['target']
  X = df.drop(columns=['target'])
  # Same random_state and test_size as used for training, so X_test/y_test
  # are exactly the rows the model never saw.
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=TEST_SIZE)
  # predict(X, y, data_title)

  # load model
  # NOTE: pickle.load can execute arbitrary code; only load model files
  # that were produced by this notebook.
  with open(f'{model_path}/{data_title}.pkl', 'rb') as f:
    classifier = pickle.load(f)
  # evaluate
  y_pred = classifier.predict(X_test)
  # Fix one label order and use it for BOTH the matrix and the tick labels.
  # y.unique() alone is in order of appearance, whereas confusion_matrix
  # orders rows/columns by sorted label, and a small test split may miss a
  # class entirely, which would make the matrix smaller than the label list.
  labels = sorted(y.unique())
  save_confusion_matrix(data=confusion_matrix(y_test, y_pred, labels=labels, normalize='true'),
                        labels=labels,
                        output_filename=f'{conf_path}/{data_title}.png')
