In [1]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def sort_by_filters(ma_sorted):
  """Cut every machine's history into runs that end on a failure row.

  Args:
    ma_sorted: iterable of ``(key, DataFrame)`` pairs, e.g. a pandas
      ``groupby`` object. Only element ``[1]`` of each pair is used; it must
      have a ``remainingLifetime`` column.

  Returns:
    A flat list of DataFrames. A frame without any ``remainingLifetime == 0``
    row is returned as is. Otherwise one slice is produced per failure row,
    running from the previous failure row (or the frame's first row) up to
    and including that failure row; slices with two rows or fewer are
    discarded.

  Note:
    Slices are taken by index label with ``.loc``, so a failure row also
    opens the following slice, and rows after the last failure row are not
    returned.
  """
  segments = []
  for group in ma_sorted:
    frame = group[1]
    failure_labels = frame.index[frame["remainingLifetime"] == 0].tolist()
    start_label = frame.index[0]

    if not failure_labels:
      # No failure recorded: keep the history in one piece.
      segments.append(frame)
      continue

    for failure_label in failure_labels:
      segment = frame.loc[start_label:failure_label]
      if len(segment) > 2:
        segments.append(segment)
      # The next slice starts on the failure row that closed this one.
      start_label = failure_label
  return segments
    
    
def remove_insane(possibly_insane):
  """Keep only the frames whose two board temperatures average above zero.

  Args:
    possibly_insane: iterable of DataFrames with ``tempBoardAK0`` and
      ``tempBoardSLAVE`` columns.

  Returns:
    A new list holding the same DataFrame objects, in input order, for which
    both column means are strictly positive.
  """
  kept = []
  for frame in possibly_insane:
    if not (frame["tempBoardAK0"].mean() > 0):
      continue
    if frame["tempBoardSLAVE"].mean() > 0:
      kept.append(frame)
  return kept

# IN PLACE: modifies the DataFrames it is given and returns nothing.
def restrict_remainingLifetime(unrestricted, maximum):
  """Cap ``remainingLifetime`` at ``maximum`` in every given frame.

  Args:
    unrestricted: iterable of DataFrames with a ``remainingLifetime`` column;
      each one is modified in place.
    maximum: upper bound written over every larger value.
  """
  for frame in unrestricted:
    over_limit = frame["remainingLifetime"] > maximum
    frame.loc[over_limit, "remainingLifetime"] = maximum
    
def categorize_remainingLifetime(data, bins=(-1, 7, 30, 10000)):
  """Add an ``urgency`` column that buckets ``remainingLifetime``.

  Args:
    data: iterable of DataFrames with a ``remainingLifetime`` column.
    bins: ascending bin edges handed to ``pandas.cut``. The default yields the
      right-closed intervals (-1, 7], (7, 30] and (30, 10000].

  Returns:
    A list with one new DataFrame per input (``DataFrame.assign`` leaves the
    input untouched) carrying the extra categorical ``urgency`` column.
    Values outside the outermost edges get a missing ``urgency``.
  """
  edges = list(bins)
  return [df.assign(urgency=pd.cut(df["remainingLifetime"], edges)) for df in data]
    
def resample(data, freq="30min"):
  """Resample every frame onto a regular time grid and fill gaps linearly.

  Args:
    data: iterable of DataFrames with a ``timestamp`` column that can serve
      as a datetime index.
    freq: pandas offset alias of the target grid; defaults to 30 minutes.

  Returns:
    A list of new DataFrames indexed by ``timestamp``. Each holds the
    per-interval mean of the numeric columns, with empty intervals filled by
    linear interpolation.
  """
  resampled = []
  for df in data:
    # Averaging is only defined for numbers; text columns such as cpuType
    # make .mean() raise on current pandas, so they are dropped up front.
    numeric = df.set_index("timestamp").select_dtypes(include="number")
    resampled.append(numeric.resample(freq).mean().interpolate(method="linear"))
  return resampled

def get_onehot_cpus(raw):
  """Build a one-hot vector for every distinct ``cpuType`` value.

  Args:
    raw: DataFrame with a ``cpuType`` column.

  Returns:
    A dict mapping each distinct value, in order of first appearance, to a
    list as long as the number of distinct values: zeros everywhere except a
    1 at the value's own position.
  """
  cpu_types = raw["cpuType"].unique()
  width = len(cpu_types)
  encoding = {}
  for position, cpu_type in enumerate(cpu_types):
    vector = [0.0] * width
    vector[position] = 1
    encoding[cpu_type] = vector
  return encoding

def create_feature_vector(data, onehot, window_size, tail_size=1000):
  """Turn the end of every frame into rolling-window samples and labels.

  For each frame only the last ``tail_size`` rows are used. Every complete
  window of ``window_size`` consecutive ``tempBoardAK0`` values gives one
  sample: ``[min, max, mean, std]`` followed by the one-hot vector of the
  ``cpuType`` in the window's last row. The label is the ``urgency`` of that
  same row.

  Args:
    data: iterable of DataFrames with ``tempBoardAK0``, ``cpuType`` and
      ``urgency`` columns.
    onehot: dict mapping a ``cpuType`` value to its one-hot list; a value
      missing from the dict raises ``KeyError``.
    window_size: number of rows per rolling window.
    tail_size: number of trailing rows of each frame to use (default 1000).

  Returns:
    ``(X, Y)``: list of feature lists and list of labels, same length.
  """
  X = []
  Y = []

  for df in data:
    # Slice the whole frame, not just the temperature column, so that the
    # statistics, cpuType and urgency at position i belong to the same row.
    end = df.tail(tail_size)
    windows = end["tempBoardAK0"].rolling(window_size)
    mins = windows.min().tolist()
    maxs = windows.max().tolist()
    means = windows.mean().tolist()
    stds = windows.std().tolist()
    cpus = end["cpuType"].tolist()
    labels = end["urgency"].tolist()

    # The first complete window ends at position window_size - 1; earlier
    # positions only hold NaN statistics and are skipped.
    for i in range(max(window_size - 1, 0), len(mins)):
      X.append([mins[i], maxs[i], means[i], stds[i]] + onehot[cpus[i]])
      Y.append(labels[i])
  return X, Y

In [3]:
# Read the training and validation tables.
raw = pq.read_table("/dbfs/FileStore/tables/c99temp_train_pseudo_snappy-76e66.parquet")
v_raw = pq.read_table("/dbfs/FileStore/tables/c99temp_valid_pseudo_snappy-29080.parquet")

# Convert each table to pandas once; the training frame is needed twice below.
train_df = raw.to_pandas()
valid_df = v_raw.to_pandas()

# Per-machine groups, each in chronological order.
ma_sorted = train_df.sort_values("timestamp").groupby("machineNumberPseudo")
v_ma_sorted = valid_df.sort_values("timestamp").groupby("machineNumberPseudo")

# The encoding is built from the training data only, so a cpuType that occurs
# only in the validation data has no entry in it.
onehot = get_onehot_cpus(train_df)

In [4]:
def _segment_and_label(machine_groups):
  """Run the shared preprocessing chain on one grouped data set.

  Returns a tuple ``(segments, plausible, labelled)``: the output of
  ``sort_by_filters``, of ``remove_insane`` and of
  ``categorize_remainingLifetime`` respectively.
  """
  segments = sort_by_filters(machine_groups)
  plausible = remove_insane(segments)
  # Disabled optional steps, kept for reference:
  # restrict_remainingLifetime(plausible, 100)
  # rsmpld_30min = resample(plausible)
  labelled = categorize_remainingLifetime(plausible)
  return segments, plausible, labelled


# Training data
fi_sorted, fi_sorted_sane, categorized = _segment_and_label(ma_sorted)

# Validation data
v_fi_sorted, v_fi_sorted_sane, v_categorized = _segment_and_label(v_ma_sorted)

In [5]:
from sklearn import svm
from sklearn.metrics import recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Rows per rolling window; the same value is used for both data sets.
WINDOW_SIZE = 20

X, Y = create_feature_vector(categorized, onehot, window_size=WINDOW_SIZE)
v_X, v_Y = create_feature_vector(v_categorized, onehot, window_size=WINDOW_SIZE)

In [7]:
# Weight every class by the inverse of its number of training samples.
classes, counts = np.unique(Y, return_counts=True)
weights = {label: 1 / count for label, count in zip(classes, counts)}
weights

In [8]:
# Alternative model, currently disabled:
# clf = svm.LinearSVC(class_weight=weights)
# clf.fit(X, Y)

# NOTE(review): Y holds the pandas Interval objects from the urgency column;
# confirm the installed scikit-learn accepts them as class labels.
forest_params = dict(
  n_estimators=100,
  max_depth=2,
  random_state=0,
  class_weight='balanced',
)
clf = RandomForestClassifier(**forest_params)
clf.fit(X, Y)

In [9]:
v_pred = clf.predict(v_X)
pred = clf.predict(X)

# Each line prints macro recall, then macro precision:
#   1. training labels against themselves (reference line),
#   2. training labels against training predictions,
#   3. validation labels against validation predictions.
for truth, guess in ((Y, Y), (Y, pred), (v_Y, v_pred)):
  print(recall_score(truth, guess, average='macro'), precision_score(truth, guess, average='macro'))

In [10]:
# The resampling step in the preprocessing cell is commented out, so the
# 30-minute series is built here; without it a fresh kernel stops with a
# NameError on rsmpld_30min.
rsmpld_30min = resample(fi_sorted_sane)

sample_period_s = 30 * 60  # resample() yields one row every 30 minutes
segment = rsmpld_30min[130]

# Transform along the time axis (axis=0). np.fft.fft works on the last axis
# by default, which for a 2-D frame would mix the columns of each single row.
F = np.fft.fft(segment.values, axis=0)
N = int(len(F) / 2)
fa = 1 / sample_period_s  # sampling frequency in Hz
# Exact bin frequencies k * fa / len(F); only the first N (non-negative) bins.
frq = np.fft.fftfreq(len(F), d=sample_period_s)[:N]
# Period in hours. Bin 0 is the constant offset (frequency 0, infinite
# period) and is left out instead of dividing by zero.
T = 1 / (frq[1:] * 60 * 60)
fig, ax = plt.subplots()
ax.set_xlim(0,30)
ax.plot(T, np.absolute(F[1:N]))
ax.set_xlabel("period [h]")
ax.set_ylabel("|FFT|")
ax.legend(list(segment.columns))
# ax.plot(rsmpld_30min[3])
display(fig)

In [11]:
# Plot the first half of the plausible segments, one row per segment.
cnt = int(len(fi_sorted_sane) / 2)
if cnt < 1:
  raise ValueError("fi_sorted_sane needs at least two segments to plot its first half")

# squeeze=False always returns a 2-D array of axes, so a single row
# (cnt == 1) can be indexed like many rows. The rows are stacked vertically,
# hence the figure height, not its width, grows with their number.
fig, axs = plt.subplots(cnt, 1, sharex=True, squeeze=False, figsize=(8, cnt * 5))
axs = axs[:, 0]

for plotidx in range(cnt):
  segment = fi_sorted_sane[plotidx]
  for column in ("tempBoardAK0", "tempBoardSLAVE", "remainingLifetime"):
    axs[plotidx].plot(segment["timestamp"], segment[column])

# display(fig)
fig.savefig("/dbfs/FileStore/tables/fullplot.png")