In [None]:
import pandas as pd
import numpy as np
import os
import random
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib import dates as d
import datetime
from datetime import date, timedelta
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.lines as mlines
import re
import seaborn as sns
from sklearn.cluster import KMeans
#$pip install kneed
from kneed import KneeLocator
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import recall_score

In [None]:
def isolation_forest_df(df, selected_features):
  """Grid-search IsolationForest settings and score each against the labels.

  For every combination of 5 tree counts (100..300) and 5 contamination
  values (0.07..0.09) an IsolationForest is fitted on
  ``df[selected_features]``. Rows predicted as outliers (-1) are treated as
  "Yellow" predictions and inliers (1) as "Green" predictions, and these are
  compared with the ``"Follow Up Priority"`` column of ``df``.

  Rows whose priority is neither "Yellow" nor "Green" do not contribute to
  any of the four confusion-matrix counts.

  Args:
    df: DataFrame holding the feature columns and a "Follow Up Priority"
      column.
    selected_features: list of column names in ``df`` used as model features.

  Returns:
    DataFrame with one row per (estimator, contamination) combination and
    the columns 'estimator', 'contamination', 'recall' and 'matthews'.
    'recall' is NaN when ``df`` contains no "Yellow" rows.

  Note:
    Relies on ``random_state_val`` and ``matthews_correlation`` being defined
    at module level elsewhere in this file.
  """
  # Ground-truth masks are loop-invariant, so build them once. They are
  # positional arrays aligned with the rows passed to the model.
  priority = df["Follow Up Priority"]
  is_yellow = (priority == "Yellow").to_numpy()
  is_green = (priority == "Green").to_numpy()

  # The feature matrix is never modified inside the loop. Writing the
  # predictions back into it would make every later fit train on the previous
  # run's predictions as an extra feature.
  features = df[selected_features]

  isolation_test_df = pd.DataFrame(columns=['estimator', 'contamination', 'recall', 'matthews'])

  estimators = np.linspace(100, 300, 5)
  contamination = np.linspace(.07, .09, 5)

  for e in estimators:
    # n_estimators is the number of isolation trees in the ensemble.
    n_trees = int(e)
    for c in contamination:
      # contamination is the expected proportion of outliers in the data.
      outlier_share = round(c, 3)

      clf = IsolationForest(n_estimators=n_trees, contamination=outlier_share, random_state=random_state_val)
      clf.fit(features)

      # predict() yields -1 for outliers and 1 for inliers.
      predicted_yellow = clf.predict(features) == -1
      predicted_green = ~predicted_yellow

      # Row-wise confusion counts; counting by position cannot be inflated by
      # repeated (timestamp, well) pairs the way a key-based join can.
      true_positive = int(np.count_nonzero(predicted_yellow & is_yellow))   # predicted yellow, was yellow
      true_negative = int(np.count_nonzero(predicted_green & is_green))     # predicted green, was green
      false_negative = int(np.count_nonzero(predicted_green & is_yellow))   # predicted green, was yellow
      false_positive = int(np.count_nonzero(predicted_yellow & is_green))   # predicted yellow, was green

      # Recall is undefined without any actual yellow rows; report NaN rather
      # than raising ZeroDivisionError.
      actual_yellow = true_positive + false_negative
      recall = true_positive / actual_yellow if actual_yellow else float("nan")

      # Named `mcc` so the imported sklearn `matthews_corrcoef` is not shadowed.
      mcc = matthews_correlation(true_positive, true_negative, false_negative, false_positive)
      isolation_test_df.loc[len(isolation_test_df)] = {'estimator': n_trees, 'contamination': outlier_share, 'recall': recall, 'matthews': mcc}

  return isolation_test_df