In [21]:
%load_ext autoreload
%autoreload 2

import datetime

import numpy as np
import pandas as pd
import gudhi

from traffic.core import Traffic, Flight
from functions.data_filtering import ICAO_codes, large_gap_filter
from functions.data_processing import flight_persistence, sublevelset_persistence, remove_outliers_z_score, remove_outliers_dbscan
from functions.data_loading import linkage_cluster_persistances, flights_from_query, get_flight_persistances

from scipy.cluster.hierarchy import fcluster
from matplotlib import pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# load minimum data set
df = pd.read_csv("data/go_arounds_augmented.csv", low_memory=False)
df["time"] = pd.to_datetime(df["time"])

start = datetime.datetime(year=2019, month=1, day=1).replace(
    tzinfo=datetime.timezone.utc
)
stop = datetime.datetime(year=2019, month=3, day=1).replace(
    tzinfo=datetime.timezone.utc
)

In [31]:
with_GA = df.query(f"has_ga & (@start <= time <= @stop)")
without_GA = df.query(f"not has_ga & (@start <= time <= @stop)")

#plt.hist(with_GA["n_approaches"])

In [32]:
without_GA = without_GA.sample(n=with_GA.shape[0], random_state=42)
query = pd.concat([with_GA, without_GA], axis=0)

In [33]:
file_name = "balanced1_2"
flights, labels = flights_from_query(query, file_name, load_results=False)
data = get_flight_persistances(flights, file_name, load_results=False)

100%|██████████| 10620/10620 [01:51<00:00, 95.30it/s] 
100%|██████████| 7906/7906 [01:47<00:00, 73.74it/s] 
100%|██████████| 7906/7906 [01:31<00:00, 86.39it/s] 
100%|██████████| 7906/7906 [02:39<00:00, 49.69it/s]
100%|██████████| 7906/7906 [00:34<00:00, 230.75it/s]


In [26]:
approach_1 = df.query(f"n_approaches==1 & (@start <= time <= @stop)")
approach_2 = df.query(f"n_approaches==2 & (@start <= time <= @stop)")
approach_3 = df.query(f"n_approaches==3 & (@start <= time <= @stop)")
approach_4 = df.query(f"n_approaches==4 & (@start <= time <= @stop)")

count_4 = approach_4.shape[0]
count_3 = min(approach_3.shape[0], count_4*2)
count_2 = min(approach_2.shape[0], count_3*2)
count_1 = min(approach_1.shape[0], count_2*2)

approach_3 = approach_3.sample(n=count_3, random_state=42)
approach_2 = approach_2.sample(n=count_2, random_state=42)
approach_1 = approach_1.sample(n=count_1, random_state=42)
query = pd.concat([approach_1, approach_2, approach_3, approach_4], axis=0)

In [27]:
file_name = "unbalanced1_2_3_4"
flights, labels = flights_from_query(query, file_name, load_results=False)
data = get_flight_persistances(flights, file_name, load_results=False)

100%|██████████| 5580/5580 [00:42<00:00, 131.74it/s]
100%|██████████| 3893/3893 [00:55<00:00, 69.88it/s] 
100%|██████████| 3893/3893 [00:41<00:00, 94.72it/s] 
100%|██████████| 3893/3893 [01:07<00:00, 58.02it/s]
100%|██████████| 3893/3893 [00:16<00:00, 230.87it/s]


In [28]:
approach_1 = df.query(f"n_approaches==1 & (@start <= time <= @stop)")
approach_2 = df.query(f"n_approaches==2 & (@start <= time <= @stop)")
approach_3 = df.query(f"n_approaches==3 & (@start <= time <= @stop)")

count = approach_3.shape[0]

approach_2 = approach_2.sample(n=count, random_state=42)
approach_1 = approach_1.sample(n=count, random_state=42)
query = pd.concat([approach_1, approach_2, approach_3], axis=0)

In [29]:
file_name = "balanced1_2_3"
flights, labe0ls = flights_from_query(query, file_name, load_results=False)
data = get_flight_persistances(flights, file_name, load_results=False)

100%|██████████| 4599/4599 [00:31<00:00, 145.58it/s]
100%|██████████| 3037/3037 [00:45<00:00, 67.31it/s] 
100%|██████████| 3037/3037 [00:40<00:00, 74.66it/s] 
100%|██████████| 3037/3037 [01:06<00:00, 45.36it/s]
100%|██████████| 3037/3037 [00:12<00:00, 248.20it/s]


In [32]:
trees, paths = sublevelset_persistence(flights[:1000], "geoaltitude")

100%|██████████| 1000/1000 [00:06<00:00, 159.82it/s]


In [33]:
path = f"../../data/linkage_clustering/cyclic_dataset.pkl"
Z = linkage_cluster_persistances(trees, path=path, load_results=False)

100%|██████████| 1000/1000 [03:45<00:00,  4.43it/s]


In [34]:
clustering = fcluster(Z, t=10, criterion='maxclust')

for i in range(1, len(np.unique(clustering))+1):
    print(f"{i}: {sum(clustering == i)}")

1: 55
2: 12
3: 554
4: 345
5: 11
6: 6
7: 3
8: 1
9: 2
10: 11


In [38]:
from matplotlib import pyplot as plt

ind = 10

flight = flights[np.where(clustering == ind)[0][0]]
tree = trees[np.where(clustering == ind)[0][0]]
points = paths[np.where(clustering == ind)[0][0]]
before = flight.data[["geoaltitude"]].to_numpy()
before = np.column_stack((np.arange(len(data)), before))

dbscan_paths        = before[remove_outliers_dbscan(before, 25, eps = 0.05)]
z_score_paths       = before[remove_outliers_z_score(before)]

path_filters = [(before, "before"), (z_score_paths, "z-score"), (dbscan_paths, "dbscan")]

fig, axs = plt.subplots(3)#, sharex=True, sharey=True)
for i, (path, name) in enumerate(path_filters):
    axs[i].scatter(path[:, 0], path[:, 1])
    axs[i].set_title(name)

fig.set_figheight(10)
fig.set_figwidth(10)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 10620 and the array at index 1 has size 1800

In [8]:
gudhi.persistence_graphical_tools.plot_persistence_diagram(tree, axes=axs[1])
print(tree)

ModuleNotFoundError: No module named 'matplotlib'.


AttributeError: 'numpy.ndarray' object has no attribute 'add_patch'

In [None]:
fig = plt.scatter(points[:, 1], points[:, 0])
plt.show()