<a href="https://colab.research.google.com/github/vignesh-0510/SolarFlareExplainableWindowDetection/blob/main/phase_3_localization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install shap imbalanced-learn



In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score, jaccard_score
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
import tqdm
import pickle
from collections import deque

In [3]:
# Mount Google Drive at /content/drive so that the pickled model and metadata
# read later from /content/drive/MyDrive/... are reachable.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [86]:
def extract_start_end_time(filename):
  """Extracts the start and end time encoded in a filename string.

  The name is split on '_'; the third field is expected to be the start time
  with a one-character 's' prefix and the fourth field the end time with a
  one-character 'e' prefix. A trailing '.csv' extension is removed first so
  that it never ends up inside the returned end time.

  Args:
    filename: The filename string, with or without the '.csv' extension, in
      the format 'FQ_ar146_s2010-08-29T17:12:00_e2010-08-30T05:00:00.csv'.

  Returns:
    A (start_time, end_time) tuple of strings, or (None, None) if the
    filename has fewer than four '_'-separated fields.
  """
  extension = '.csv'
  if filename.endswith(extension):
    filename = filename[:-len(extension)]

  parts = filename.split('_')
  if len(parts) < 4:
    return None, None

  # Drop the one-character 's' / 'e' markers in front of the timestamps.
  return parts[2][1:], parts[3][1:]

def process_filename(file_path):
  """Derives label metadata for one sample from its file path.

  Args:
    file_path: Path to a sample file; only the base name is inspected and
      everything from the first '.csv' onwards is discarded.

  Returns:
    A dict with the keys
      'filename':     base name without the '.csv' suffix,
      'category':     'FQ' when the name starts with 'FQ', otherwise the
                      first character of the name,
      'class':        'NF' for categories 'FQ', 'A', 'B' and 'C', 'FL' for
                      everything else,
      'sub_category': 'NA' for 'FQ' files, otherwise the text between the
                      first character and the first '@',
      'start_time' / 'end_time': as returned by extract_start_end_time.
  """
  stem = os.path.basename(file_path).split('.csv')[0]

  if stem.startswith('FQ'):
    label, detail = 'FQ', 'NA'
  else:
    label = stem[0]
    detail = stem.split('@')[0][1:]

  begin, finish = extract_start_end_time(stem)

  return {
      'filename': stem,
      'category': label,
      'class': 'NF' if label in ('FQ', 'A', 'B', 'C') else 'FL',
      'sub_category': detail,
      'start_time': begin,
      'end_time': finish,
  }

def min_max_scaling(data, min_val, max_val):
  """Rescales `data` linearly so that `min_val` maps to 0 and `max_val` to 1.

  Args:
    data: Scalar or NumPy array of values to rescale.
    min_val: Value that is mapped to 0.
    max_val: Value that is mapped to 1.

  Returns:
    (data - min_val) / (max_val - min_val). Where the range is exactly zero
    the divisor is replaced by 1, so a constant feature yields the finite
    value `data - min_val` instead of NaN/inf (arrays) or a
    ZeroDivisionError (Python scalars).
  """
  value_range = max_val - min_val
  if np.ndim(value_range) == 0:
    # Scalar bounds: keep the original scalar type unless it is degenerate.
    if value_range == 0:
      value_range = 1
  else:
    # Per-element bounds: patch only the degenerate entries.
    value_range = np.where(value_range == 0, 1, value_range)
  return (data - min_val) / value_range

def process_file(df, interacting_columns, meta_dict, file_info, frequency_modes=10):
  """Builds a one-row frequency-domain feature frame for one window of data.

  Every column in `interacting_columns` is min-max scaled with the bounds in
  `meta_dict`, transformed with a real FFT, and the real parts of the first
  and last rFFT coefficients are kept as that column's features.

  Args:
    df: DataFrame holding the rows of the window; must contain every column
      named in `interacting_columns`.
    interacting_columns: Names of the columns to turn into features.
    meta_dict: Mapping column name -> mapping with 'min' and 'max' entries
      used for the scaling.
    file_info: Mapping with a 'class' entry that is copied into the result.
    frequency_modes: Number of features produced per column. Half of them
      come from the start of the rFFT output and half from its end; when the
      number is odd the extra coefficient is taken from the start.

  Returns:
    A single-row DataFrame with the columns '<col>_real_<k>' for
    k in range(frequency_modes), in the order of `interacting_columns`,
    followed by a 'class' column.

  Raises:
    ValueError: if `frequency_modes` is smaller than 1, or if the window is
      empty or too short to provide the requested number of coefficients.
  """
  if frequency_modes < 1:
    raise ValueError(f'frequency_modes must be >= 1, got {frequency_modes}')

  # For even values this is the same half/half split as before.
  tail_modes = frequency_modes // 2
  head_modes = frequency_modes - tail_modes

  data_col_list = []
  data_arr = np.zeros((1, frequency_modes * len(interacting_columns)))

  for i, col in enumerate(interacting_columns):
    y = min_max_scaling(df[col].values, meta_dict[col]['min'], meta_dict[col]['max'])
    if len(y) == 0:
      raise ValueError(f'column {col!r} has no rows to transform')

    # rfft of n real samples yields n // 2 + 1 complex coefficients.
    y_f = torch.fft.rfft(torch.tensor(y))
    n_coeffs = y_f.shape[0]
    if n_coeffs < head_modes:
      raise ValueError(
          f'window of {len(y)} rows yields only {n_coeffs} rFFT coefficients, '
          f'but {head_modes} are needed for frequency_modes={frequency_modes}')

    # Index the tail from an explicit start: y_f[-0:] would select everything.
    y_f = torch.cat((y_f[:head_modes], y_f[n_coeffs - tail_modes:]))

    data_col_list.extend(f'{col}_real_{c}' for c in range(frequency_modes))
    data_arr[0, i * frequency_modes:(i + 1) * frequency_modes] = torch.real(y_f).numpy()

  result_df = pd.DataFrame(data_arr, columns=data_col_list)
  result_df['class'] = file_info['class']
  return result_df




In [70]:
# Sample to localize and the physical parameters used as model features.
file_path = '/content/M1.0@10065:Primary_ar5298_s2015-03-14T09:24:00_e2015-03-14T21:12:00.csv'
interacting_columns = ['TOTUSJH', 'TOTPOT', 'TOTUSJZ']

# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only load these files when they come from a trusted source.
with open('/content/drive/MyDrive/ADM/results/part_123/rf_clf.pkl', 'rb') as model_file:
  model = pickle.load(model_file)

with open('/content/drive/MyDrive/ADM/partition1/swan-sf-metadata.pkl', 'rb') as meta_file:
  meta_dict = pickle.load(meta_file)

# Label information parsed from the file name, and the tab-separated series
# restricted to the feature columns.
file_info = process_filename(file_path)
instance_df = pd.read_csv(file_path, sep='\t', usecols=interacting_columns)

In [89]:
class Node:
  """One candidate window in the tail-chopping search tree."""

  def __init__(self, start=None, size=None, confidence=None, parent=None):
    """Stores the window bounds, its score and the node it was derived from.

    Args:
      start: Index of the first row of the window.
      size: Number of rows in the window.
      confidence: Score of the window, or None if it has not been scored yet.
      parent: The Node this window was chopped from, or None for the root.
    """
    self.window_start = start
    self.window_size = size
    # The defaults are None, so the end can only be derived when both parts
    # are known; otherwise `None + None` would raise a TypeError.
    if start is None or size is None:
      self.window_end = None
    else:
      self.window_end = start + size
    self.confidence = confidence
    self.parent = parent

In [125]:
# Configuration of the window search.
# NOTE: freq_modes is defined here but run_tail_chopping_algorithm calls
# run_analysis without it, so run_analysis falls back to its own default.
freq_modes = 10

# Number of rows removed from one end of a window per chopping step
# (a relative alternative that was tried: window_size // 10).
step_size = 10

# The search starts from the complete series
# (a fixed alternative that was tried: window_size = 60).
window_start = 0
window_size = len(instance_df)


In [122]:
def run_analysis(df, interacting_columns, model, file_info, freq_modes=10, metadata=None):
  """Scores one window of data with the classifier.

  Args:
    df: DataFrame with the rows of the window to score.
    interacting_columns: Names of the columns used to build the features.
    model: Fitted classifier exposing `predict_proba`.
    file_info: Mapping with a 'class' entry, forwarded to process_file.
    freq_modes: Number of frequency features per column.
    metadata: Mapping column name -> {'min': ..., 'max': ...} used for
      scaling. Defaults to the notebook-level `meta_dict`.

  Returns:
    The probability in the first column of `model.predict_proba` for the
    window, i.e. the probability of the first entry of the model's class
    ordering.
  """
  if metadata is None:
    # Fall back to the scaling bounds loaded at notebook level.
    metadata = meta_dict

  freq_df = process_file(df, interacting_columns, metadata, file_info, freq_modes)
  features = freq_df.drop(columns='class')
  probabilities = model.predict_proba(features)
  # NOTE(review): column 0 is presumably the 'FL' class - confirm against
  # model.classes_.
  return probabilities[0][0]

In [126]:
def run_tail_chopping_algorithm(min_window_size=8):
  """Localizes a high-confidence sub-window by repeatedly chopping window ends.

  The search starts from the window [0, window_size). Every candidate is
  compared with its two children, obtained by removing `step_size` rows from
  its end (child 1) or from its start (child 2). A child that is at least as
  confident as its parent and not worse than its sibling is explored on the
  next level; a candidate that beats both of its children, or that cannot be
  shrunk any further, is kept as a local optimum. The search ends after the
  first level on which no candidate produced a child, and the most confident
  local optimum of that level is reported.

  Reads the notebook-level names `instance_df`, `interacting_columns`,
  `model`, `file_info`, `window_size` and `step_size`.

  Args:
    min_window_size: Smallest child window, in rows, that is still scored.
      run_analysis is called with its default of 10 frequency modes, for
      which process_file keeps 5 leading rFFT coefficients; a window of n
      rows yields n // 2 + 1 coefficients, so 8 rows is the smallest window
      that can be scored.

  Returns:
    A (window_start, window_size, confidence) tuple for the selected window.

  Raises:
    ValueError: if `step_size` is not positive (the search would never end).
  """
  if step_size <= 0:
    raise ValueError(f'step_size must be positive, got {step_size}')

  root = Node(0, window_size, None, None)
  # Level-order queue; a None sentinel marks the end of each tree level.
  q = deque([root, None])
  # Local optima found on the level that is currently being processed.
  aux_q = deque()
  q_has_changed = False

  while q:
    cur = q.popleft()

    if cur is None:
      if not q_has_changed:
        break
      # At least one candidate of this level was shrunk: drop this level's
      # local optima and continue with the next level.
      q_has_changed = False
      aux_q.clear()
      q.append(None)
      continue

    if cur.confidence is None:
      # Only the root arrives unscored. Score exactly the rows it covers so
      # the confidence matches the reported window bounds.
      root_df = instance_df.iloc[cur.window_start:cur.window_end]
      cur.confidence = run_analysis(root_df, interacting_columns, model, file_info)

    child_window_size = cur.window_size - step_size
    if child_window_size < max(min_window_size, 1):
      # The children would be too small to score; cur is a leaf candidate.
      aux_q.append(cur)
      continue

    # Compare against the node being split, not against the root.
    parent_conf = cur.confidence

    child_1_start = cur.window_start
    child_2_start = cur.window_start + step_size

    child_1_df = instance_df.iloc[child_1_start:child_1_start + child_window_size]
    child_2_df = instance_df.iloc[child_2_start:child_2_start + child_window_size]

    child_1_conf = run_analysis(child_1_df, interacting_columns, model, file_info)
    child_2_conf = run_analysis(child_2_df, interacting_columns, model, file_info)
    print(f'parent_conf[{cur.window_start}-{cur.window_end}]: {cur.confidence}, child_1_confidence[{child_1_start}-{child_1_start + child_window_size}]: {child_1_conf}, child_2_conficence[{child_2_start}-{child_2_start + child_window_size}]: {child_2_conf}')

    if child_1_conf > child_2_conf and child_1_conf >= parent_conf:
      q_has_changed = True
      q.append(Node(child_1_start, child_window_size, child_1_conf, cur))

    elif child_2_conf > child_1_conf and child_2_conf >= parent_conf:
      q_has_changed = True
      q.append(Node(child_2_start, child_window_size, child_2_conf, cur))

    elif child_1_conf == child_2_conf and child_1_conf >= parent_conf:
      # Tie: follow both children.
      q_has_changed = True
      q.append(Node(child_1_start, child_window_size, child_1_conf, cur))
      q.append(Node(child_2_start, child_window_size, child_2_conf, cur))

    elif parent_conf > child_1_conf and parent_conf > child_2_conf:
      aux_q.append(cur)

    else:
      # Only reachable when a comparison is undefined (e.g. NaN confidence).
      print(f'parent: {parent_conf}, child 1: {child_1_conf}, child 2: {child_2_conf}')
      break

  # Ties keep the earliest candidate. If no candidate was recorded, fall back
  # to the root together with its own confidence.
  max_node = max(aux_q, key=lambda node: node.confidence, default=root)
  max_confidence = max_node.confidence
  print(f'Localized window start: {max_node.window_start}, end: {max_node.window_start + max_node.window_size}, confidence: {max_confidence}')
  return max_node.window_start, max_node.window_size, max_confidence

In [127]:
# Run the localization on the loaded sample.
# NOTE: this rebinds the notebook-level `window_start` and `window_size`;
# run_tail_chopping_algorithm reads `window_size` as the size of its root
# window, so re-run the configuration cell before calling it again.
window_start, window_size, confidence = run_tail_chopping_algorithm()

parent_conf[0-60]: 0.7771713436938831, child_1_confidence[0-50]: 0.7371419836543659, child_2_conficence[10-60]: 0.7961788188666321
parent_conf[10-60]: 0.7961788188666321, child_1_confidence[10-50]: 0.7023399253295465, child_2_conficence[20-60]: 0.7524948527677786
Localized window start: 10, end: 60, confidence: 0.7961788188666321
