In [None]:
import pandas as pd
import os
import gc
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.style as style

In [None]:
sns.set_context('paper')
matplotlib.rcParams['font.family'] = 'sans-serif'
style.use('ggplot')
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and 3.8
# removed the old names, so use whichever spelling this installation provides.
seaborn_style = 'seaborn' if 'seaborn' in style.available else 'seaborn-v0_8'
plt.style.use(seaborn_style)

In [None]:
# Default number of samples per rolling window; used as the default `window_size`
# argument of the calculate_rolling_* functions in this cell.
window_size = 100

def calculate_derivatives(values):
    """Return the first discrete difference of ``values`` (unit sample spacing).

    Element 0 of the result is 0 and element ``i`` is ``values[i] - values[i - 1]``,
    so the output has the same length as the input.

    Parameters
    ----------
    values : pd.Series or 1-D array-like of numbers

    Returns
    -------
    pd.Series
        float64 series. When ``values`` is a ``pd.Series`` its index is kept so the
        result lines up with the input; otherwise a default RangeIndex is used.
        Empty input gives an empty series.
    """
    samples = np.asarray(values, dtype=float)
    index = values.index if isinstance(values, pd.Series) else None
    if samples.size == 0:
        # Without this guard the padding zero below would yield a length-1 result.
        return pd.Series(samples, index=index)
    # Samples are one step apart, so the difference quotient is just the difference.
    deltas = np.concatenate(([0.0], np.diff(samples)))
    return pd.Series(deltas, index=index)

def calculate_second_derivatives(in_values):
    """Return the second discrete difference of ``in_values``.

    This is ``calculate_derivatives`` applied twice. Each pass puts a 0 in front of
    the differences, so element 0 of the result is 0 and element 1 equals the first
    difference ``in_values[1] - in_values[0]`` (the difference against that padding
    zero); from element 2 onward the values are true second differences.

    Parameters
    ----------
    in_values : pd.Series or 1-D array-like of numbers

    Returns
    -------
    pd.Series
        Whatever ``calculate_derivatives`` returns for the differenced series.
    """
    return calculate_derivatives(calculate_derivatives(in_values))

def calculate_rolling_mean(values, window_size=window_size):
    """Rolling mean of ``values`` with the first ``window_size`` entries forced to 0.

    ``values`` needs pandas' ``rolling`` interface (e.g. a ``pd.Series``). A new
    object is returned; the input itself is not modified.
    """
    windows = values.rolling(window_size)
    rolling_mean = windows.mean()
    # NOTE(review): with pandas' default ``min_periods`` the first non-NaN value sits
    # at position ``window_size - 1``, which this slice zeroes too - confirm intended.
    rolling_mean[:window_size] = 0
    return rolling_mean

def calculate_rolling_variance(values, window_size=window_size):
    """Rolling variance of ``values`` with the first ``window_size`` entries forced to 0.

    ``values`` needs pandas' ``rolling`` interface (e.g. a ``pd.Series``). A new
    object is returned; the input itself is not modified.
    """
    windows = values.rolling(window_size)
    rolling_var = windows.var()
    # NOTE(review): with pandas' default ``min_periods`` the first non-NaN value sits
    # at position ``window_size - 1``, which this slice zeroes too - confirm intended.
    rolling_var[:window_size] = 0
    return rolling_var

def calculate_rolling_skewness(values, window_size=window_size):
    """Rolling skewness of ``values`` with the first ``window_size`` entries forced to 0.

    ``values`` needs pandas' ``rolling`` interface (e.g. a ``pd.Series``). A new
    object is returned; the input itself is not modified.
    """
    windows = values.rolling(window_size)
    rolling_skew = windows.skew()
    # NOTE(review): with pandas' default ``min_periods`` the first non-NaN value sits
    # at position ``window_size - 1``, which this slice zeroes too - confirm intended.
    rolling_skew[:window_size] = 0
    return rolling_skew

def calculate_rolling_kurtosis(values, window_size=window_size):
    """Rolling kurtosis of ``values`` with the first ``window_size`` entries forced to 0.

    ``values`` needs pandas' ``rolling`` interface (e.g. a ``pd.Series``). A new
    object is returned; the input itself is not modified.
    """
    windows = values.rolling(window_size)
    rolling_kurt = windows.kurt()
    # NOTE(review): with pandas' default ``min_periods`` the first non-NaN value sits
    # at position ``window_size - 1``, which this slice zeroes too - confirm intended.
    rolling_kurt[:window_size] = 0
    return rolling_kurt

In [None]:
# Display label -> function that computes that feature from a series of values.
# Insertion order is the order in which the feature cells iterate over the entries.
feature_funcs = dict([
    ('First Derivative', calculate_derivatives),
    ('Second Derivative', calculate_second_derivatives),
    ('Rolling Mean', calculate_rolling_mean),
    ('Rolling Variance', calculate_rolling_variance),
    ('Rolling Skewness', calculate_rolling_skewness),
    ('Rolling Kurtosis', calculate_rolling_kurtosis),
])

## Yahoo

In [None]:
# Directory holding the Yahoo! CSV files. Set the YAHOO_DATASET_PATH environment
# variable to run on another machine without editing the notebook; the original
# location remains the default.
y_dataset_path = os.environ.get(
    'YAHOO_DATASET_PATH',
    '/home/szamani/PycharmProjects/anomaly_detection/dataset/Yahoo',
)
yahoo_file = 'real_{}.csv'  # file-name template, filled with the series number
yagoo_ind = list(range(1, 68))  # series numbers 1..67

In [None]:
all_yahoo_df = []
missing_yahoo_files = []
for i in yagoo_ind:
    csv_path = os.path.join(y_dataset_path, yahoo_file.format(i))
    try:
        all_yahoo_df.append(pd.read_csv(csv_path))
    except FileNotFoundError:
        # A gap in the numbering is tolerated; any other failure (permissions,
        # malformed CSV, ...) is left to surface instead of being hidden.
        missing_yahoo_files.append(csv_path)
if missing_yahoo_files:
    print('Skipped {} missing Yahoo file(s)'.format(len(missing_yahoo_files)))
# Shortest series first; later cells index into this ordering.
all_yahoo_df.sort(key=lambda x: x.shape[0])

In [None]:
# Quick look at one series: the frame itself, its label balance and its spread.
sample_df = all_yahoo_df[1]
print(sample_df)
print(sample_df['is_anomaly'].value_counts())
print(sample_df['value'].std())

### Removing Zero Anomaly datasets

In [None]:
# Share of rows flagged as anomalous in each series, in percent.
anomaly_percentages = [
    (frame['is_anomaly'].sum() / frame.shape[0]) * 100
    for frame in all_yahoo_df
]
# reset_index() exposes the position in all_yahoo_df as an 'index' column.
anomaly_count_df = pd.DataFrame({'anomaly_percentage': anomaly_percentages}).reset_index()
anomaly_count_df

In [None]:
# Series that contain no anomalous rows at all.
anomaly_count_df.loc[anomaly_count_df['anomaly_percentage'] == 0]

In [None]:
ind = 55
selected_df = all_yahoo_df[ind]
plt.figure(figsize=(8,4))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=selected_df, x='timestamp', y='value')

plt.title('Anomaly-Free Yahoo! Dataset', weight='bold')
plt.xlabel('Time', fontsize=12, weight='bold')
plt.ylabel('Value', fontsize=12, weight='bold')

# Rows labelled as anomalies are overlaid as red dots.
anomalous = selected_df[selected_df['is_anomaly'] == 1]
print(anomalous.shape)

plt.plot(anomalous['timestamp'], anomalous['value'],'ro')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/yahoo/yahoo_clean_dataset.png', bbox_inches='tight')

In [None]:
# Anomalies found in the first 95% of each series, as a percentage of its full length.
train_fraction = .95
train_anomaly_pct = [
    (frame.iloc[:int(frame.shape[0] * train_fraction)]['is_anomaly'].sum() / frame.shape[0]) * 100
    for frame in all_yahoo_df
]
no_anomaly_train = pd.DataFrame({'anomaly_train_section': train_anomaly_pct}).reset_index()
# Keep only the series whose leading 95% holds no anomaly at all.
no_anomaly_train = no_anomaly_train[no_anomaly_train['anomaly_train_section'] == 0]
print(no_anomaly_train.shape)
no_anomaly_train

In [None]:
ind = 30
selected_df = all_yahoo_df[ind]
plt.figure(figsize=(8,4))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=selected_df, x='timestamp', y='value')

plt.title('Anomaly Imbalanced Yahoo! Dataset', weight='bold')
plt.xlabel('Time', fontsize=12, weight='bold')
plt.ylabel('Value', fontsize=12, weight='bold')

# Rows labelled as anomalies are overlaid as red dots.
anomalous = selected_df[selected_df['is_anomaly'] == 1]
print(anomalous.shape)

plt.plot(anomalous['timestamp'], anomalous['value'],'ro')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/yahoo/yahoo_unbalanced_dataset.png', bbox_inches='tight')

In [None]:
print(len(all_yahoo_df))
# Keep only the series with at least one anomaly in their first 95% of rows.
# Filtering on the data itself (rather than popping stored list positions) makes
# this cell safe to re-run: a second run leaves the list unchanged instead of
# removing further, unrelated series.
all_yahoo_df = [
    df for df in all_yahoo_df
    if df.iloc[:int(df.shape[0] * .95)]['is_anomaly'].sum() != 0
]
print(len(all_yahoo_df))

### Anomaly Percentage Analysis

In [None]:
# Recompute the per-series anomaly percentage on the filtered list of series.
anomaly_percentages = [
    (frame['is_anomaly'].sum() / frame.shape[0]) * 100
    for frame in all_yahoo_df
]
# reset_index() exposes the position in all_yahoo_df as an 'index' column.
anomaly_count_df = pd.DataFrame({'anomaly_percentage': anomaly_percentages}).reset_index()
anomaly_count_df

In [None]:
plt.figure(figsize=(10,4))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=anomaly_count_df, x='index', y='anomaly_percentage')

plt.title('Yahoo! Datasets Anomaly Percentage Distribution', weight='bold')
# Format with two decimals; slicing str(mean) breaks for values >= 10 or in scientific notation.
print('Anomaly Percentage of Time Series Average: {:.2f}%'.format(anomaly_count_df['anomaly_percentage'].mean()))
plt.xlabel('Time Series', fontsize=12, weight='bold')
plt.ylabel('Anomaly Percentage', fontsize=12, weight='bold')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')

fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/yahoo/yahoo_anomaly_percentage.png', bbox_inches='tight')

### Time Series Length Analysis

In [None]:
# One row per Yahoo! series holding its number of rows.
series_lengths = [frame.shape[0] for frame in all_yahoo_df]
len_df = pd.DataFrame({'len': series_lengths})
len_df.head()

In [None]:
# How many series share each length, ordered from rarest to most common length.
yahoo_len_df = (
    len_df.groupby(['len'])['len']
    .agg(['count'])
    .reset_index()
    .sort_values(by=['count'])
)
print(yahoo_len_df)

In [None]:
plt.figure(figsize=(8, 5))
# NOTE(review): recent seaborn releases appear to deprecate `palette` without `hue`
# for bar plots - revisit this call when upgrading seaborn.
sns.barplot(data=yahoo_len_df, x="len", y="count",
           palette=sns.color_palette("colorblind"))
plt.title('Yahoo! Datasets Length Distribution', weight='bold')
plt.xlabel('Length', fontsize=12, weight='bold')
plt.ylabel('Count', fontsize=12, weight='bold')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')

fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/yahoo/yahoo_length_distro.png', bbox_inches='tight')

### Time Series Plot

In [None]:
last_df = all_yahoo_df[-1]  # last entry of the length-sorted list
plt.figure(figsize=(10,4))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=last_df, x='timestamp', y='value')

plt.title('Sample Yahoo! Dataset', weight='bold')
plt.xlabel('Time', fontsize=12, weight='bold')
plt.ylabel('Value', fontsize=12, weight='bold')

# Rows labelled as anomalies are overlaid as red dots.
anomalous = last_df[last_df['is_anomaly'] == 1]

plt.plot(anomalous['timestamp'], anomalous['value'],'ro')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/yahoo/yahoo_long_anomaly.png', bbox_inches='tight')

In [None]:
# Rows 400-499 of the last series, sliced once and reused for the line and the markers.
zoom_df = all_yahoo_df[-1].iloc[400:500, ]
plt.figure(figsize=(5,3))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=zoom_df, x='timestamp', y='value')

plt.title('Anomalies Zoomed in', weight='bold')
plt.xlabel('Time', fontsize=12, weight='bold')
plt.ylabel('Value', fontsize=12, weight='bold')

# Rows labelled as anomalies are overlaid as red dots.
anomalous = zoom_df[zoom_df['is_anomaly'] == 1]

plt.plot(anomalous['timestamp'], anomalous['value'],'ro')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/yahoo/yahoo_anomaly_zoom.png', bbox_inches='tight')

### Feature Analysis

In [None]:
yahoo_df = all_yahoo_df[36]  # 0 and 36 selected

# Skip the first `window_size` rows, where the rolling features are zero-filled.
# Using the shared constant (instead of a literal 100) keeps this cut in step
# with the rolling helpers if the window is ever changed.
values = {'Value': yahoo_df['value'].iloc[window_size:]}
for feature, function in feature_funcs.items():
    values[feature] = function(yahoo_df['value']).iloc[window_size:]

select_yahoo_features = pd.DataFrame.from_dict(values)
# select_yahoo_features

In [None]:
# Pairwise Pearson correlation (pandas' default method) between value and features.
feature_corr = select_yahoo_features.corr(method='pearson')
# feature_corr

In [None]:
# True on and above the diagonal, i.e. the redundant half of the symmetric matrix.
mask = np.triu(np.ones(feature_corr.shape, dtype=bool))
# Blank those cells out so only the lower triangle is left.
feature_corr = feature_corr.mask(mask)
# feature_corr

In [None]:
ax = sns.heatmap(
    feature_corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# Slant the column labels so they do not run into each other.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
# plt.title('Feature Correlation', weight='bold')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/yahoo/yahoo_feature_corr2.png', bbox_inches='tight')

# Concept Datasets

In [None]:
# Root directory of the concept datasets. Set the CONCEPT_DATASET_PATH environment
# variable to run on another machine without editing the notebook; the original
# location remains the default.
dataset_path = os.environ.get(
    'CONCEPT_DATASET_PATH',
    '/home/szamani/PycharmProjects/anomaly_detection/dataset/concepts',
)
power_dir = 'powers'
light_dir = 'sensor/light'

power_supply = 'power_supply.csv'
# power_transform = 'power_transform.csv'
light_sensor = ['{}.csv'.format(i) for i in range(1, 59)]  # '1.csv' .. '58.csv'

In [None]:
# Load the power-supply series.
power_csv_path = os.path.join(dataset_path, power_dir, power_supply)
power_df = pd.read_csv(power_csv_path)
power_df

In [None]:
# 1-based day number from the hour counter: hours 0-23 -> day 1, 24-47 -> day 2, ...
# Vectorised form of the per-row int(x / 24) + 1; astype(int) truncates like int().
power_df['day'] = (power_df['hour'] / 24).astype(int) + 1
power_df

In [None]:
plt.figure(figsize=(20,7))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=power_df, x='hour', y='supply')

plt.title('Power Dataset Time Series', fontsize=15, weight='bold')
plt.xlabel('Hour', fontsize=15, weight='bold')
plt.ylabel('Power Supply', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/power/power_data.png', bbox_inches='tight', dpi=500)

In [None]:
plt.figure(figsize=(20,7))
one_month_df = power_df.iloc[:744,]  # first 744 rows = 31 days x 24 hourly samples
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=one_month_df, x='hour', y='supply')

plt.title('Power Dataset Time Series (One-Month Period)', fontsize=15, weight='bold')
plt.xlabel('Hour', fontsize=15, weight='bold')
plt.ylabel('Power Supply', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/power/power_month_period.png', bbox_inches='tight')

In [None]:
plt.figure(figsize=(20,7))
one_year_df = power_df.iloc[:8760,]  # first 8760 rows = 365 days x 24 hourly samples
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=one_year_df, x='hour', y='supply')

plt.title('Power Dataset Time Series (One-Year Period)', fontsize=15, weight='bold')
plt.xlabel('Hour', fontsize=15, weight='bold')
plt.ylabel('Power Supply', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/power/power_year_period.png', bbox_inches='tight')

In [None]:
# Mean supply per day, with 'day' kept as an ordinary column.
power_daily_avg = power_df.groupby('day', as_index=False)['supply'].mean()
power_daily_avg

In [None]:
plt.figure(figsize=(20,7))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=power_daily_avg, x='day', y='supply')

plt.title('Power Dataset Time Series (Daily Average)', fontsize=15, weight='bold')
plt.xlabel('Day', fontsize=15, weight='bold')
plt.ylabel('Daily Average Power Supply', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/power/power_daily_average.png', bbox_inches='tight')

### Feature Analysis

In [None]:
# Skip the first `window_size` rows, where the rolling features are zero-filled.
# Using the shared constant (instead of a literal 100) keeps this cut in step
# with the rolling helpers if the window is ever changed.
values = {'Value': power_df['supply'].iloc[window_size:]}
for feature, function in feature_funcs.items():
    values[feature] = function(power_df['supply']).iloc[window_size:]

power_features = pd.DataFrame.from_dict(values)
power_features

In [None]:
# Pairwise Pearson correlation (pandas' default method) between value and features.
feature_corr = power_features.corr(method='pearson')
feature_corr

In [None]:
# True on and above the diagonal, i.e. the redundant half of the symmetric matrix.
mask = np.triu(np.ones(feature_corr.shape, dtype=bool))
# Blank those cells out so only the lower triangle is left.
feature_corr = feature_corr.mask(mask)
feature_corr

In [None]:
ax = sns.heatmap(
    feature_corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# Slant the column labels so they do not run into each other.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
# plt.title('Feature Correlation', weight='bold')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/power/power_feature_corr.png', bbox_inches='tight')

In [None]:
all_light_df = []
missing_light_files = []
for i in light_sensor:
    csv_path = os.path.join(dataset_path, light_dir, i)
    try:
        all_light_df.append(pd.read_csv(csv_path))
    except FileNotFoundError:
        # A gap in the numbering is tolerated; any other failure (permissions,
        # malformed CSV, ...) is left to surface instead of being hidden.
        missing_light_files.append(csv_path)
if missing_light_files:
    print('Skipped {} missing light file(s)'.format(len(missing_light_files)))
# Shortest series first; later cells index into this ordering.
all_light_df.sort(key=lambda x: x.shape[0])
len_df = pd.DataFrame.from_dict({'len': [df.shape[0] for df in all_light_df]})
print(len(all_light_df))
# len_df

In [None]:
# Shortest light series (the list was sorted by row count when it was loaded).
all_light_df[0]

### Time Series Length Analysis

In [None]:
# One row per light series holding its number of rows; show both ends of the range.
light_lengths = [frame.shape[0] for frame in all_light_df]
len_df = pd.DataFrame({'len': light_lengths})
print(len_df.head(5))
print(len_df.tail(5))
# len_df

In [None]:
# Bin edges 0, 10000, ..., 70000.
# NOTE(review): a series longer than 70000 rows falls outside the last edge and is
# therefore not counted in any bin - confirm no light series is that long.
length_bins = np.arange(0, 80000, 10000)
light_range_df = (
    # observed=False keeps empty bins in the result and states explicitly what
    # pandas otherwise only applies as a (deprecated) implicit default.
    len_df.groupby(pd.cut(len_df['len'], length_bins), observed=False)
    .count()
    .rename_axis('Length_Range')
    .reset_index()
)
light_range_df

In [None]:
plt.figure(figsize=(11, 5))
# NOTE(review): recent seaborn releases appear to deprecate `palette` without `hue`
# for bar plots - revisit this call when upgrading seaborn.
sns.barplot(data=light_range_df, x="Length_Range", y="len",
           palette=sns.color_palette("colorblind"))
plt.title('Light Datasets Length Range Distribution', weight='bold')
plt.xlabel('Length Range', fontsize=12, weight='bold')
plt.ylabel('Count', fontsize=12, weight='bold')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/light/light_length_distro.png', bbox_inches='tight')

### Time Series Plot

In [None]:
short_light_df = all_light_df[0]  # first entry of the length-sorted list
print(short_light_df.shape)

plt.figure(figsize=(15,7))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=short_light_df, x='time', y='light')

plt.title('Light Dataset Time Series (Short)', fontsize=15, weight='bold')
plt.xlabel('Time', fontsize=15, weight='bold')
plt.ylabel('Light', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/light/light_short_data.png', bbox_inches='tight')

In [None]:
medium_light_df = all_light_df[13]
print(medium_light_df.shape)

plt.figure(figsize=(17,7))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=medium_light_df, x='time', y='light')

plt.title('Light Dataset Time Series (Medium)', fontsize=15, weight='bold')
plt.xlabel('Time', fontsize=15, weight='bold')
plt.ylabel('Light', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/light/light_medium_data.png', bbox_inches='tight')

In [None]:
long_light_df = all_light_df[-1]  # last entry of the length-sorted list
print(long_light_df.shape)

plt.figure(figsize=(20,7))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=long_light_df, x='time', y='light')

plt.title('Light Dataset Time Series (Long)', fontsize=15, weight='bold')
plt.xlabel('Time', fontsize=15, weight='bold')
plt.ylabel('Light', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/light/light_long_data.png', bbox_inches='tight')

**Because the light sensor records at irregular intervals (anywhere from every 1 to 3 minutes), we cannot produce an accurate daily-average plot or one-week-period plot for it, unlike the power supply dataset, which is recorded every hour.**

### Feature Analysis

In [None]:
light_df = all_light_df[45]  # 3 and 45 selected

# Skip the first `window_size` rows, where the rolling features are zero-filled.
# Using the shared constant (instead of a literal 100) keeps this cut in step
# with the rolling helpers if the window is ever changed.
values = {'Value': light_df['light'].iloc[window_size:]}
for feature, function in feature_funcs.items():
    values[feature] = function(light_df['light']).iloc[window_size:]

select_light_features = pd.DataFrame.from_dict(values)
# select_light_features

In [None]:
# Pairwise Pearson correlation (pandas' default method) between value and features.
feature_corr = select_light_features.corr(method='pearson')
# feature_corr

In [None]:
# True on and above the diagonal, i.e. the redundant half of the symmetric matrix.
mask = np.triu(np.ones(feature_corr.shape, dtype=bool))
# Blank those cells out so only the lower triangle is left.
feature_corr = feature_corr.mask(mask)
# feature_corr

In [None]:
ax = sns.heatmap(
    feature_corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# Slant the column labels so they do not run into each other.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
# plt.title('Feature Correlation', weight='bold')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/light/light_feature_corr2.png', bbox_inches='tight')

In [None]:
# NOTE(review): `moa_dir` and `moa_files` are not assigned in any cell shown in this
# notebook; unless they are defined elsewhere this raises NameError on a fresh run.
moa_csv_path = os.path.join(dataset_path, moa_dir, moa_files[0])
moa_df = pd.read_csv(moa_csv_path)
moa_df

In [None]:
plt.figure(figsize=(20,7))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=moa_df, x='time', y='value')

plt.xlabel('Time', fontsize=15, weight='bold')
plt.ylabel('Value', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/moa/moa_abrupt.png', bbox_inches='tight', dpi=500)

### Feature Analysis

In [None]:
# Skip the first `window_size` rows, where the rolling features are zero-filled.
# Using the shared constant (instead of a literal 100) keeps this cut in step
# with the rolling helpers if the window is ever changed.
values = {'Value': moa_df['value'].iloc[window_size:]}
for feature, function in feature_funcs.items():
    values[feature] = function(moa_df['value']).iloc[window_size:]

moa_features = pd.DataFrame.from_dict(values)
moa_features

In [None]:
# Pairwise Pearson correlation (pandas' default method) between value and features.
feature_corr = moa_features.corr(method='pearson')
feature_corr

In [None]:
# True on and above the diagonal, i.e. the redundant half of the symmetric matrix.
mask = np.triu(np.ones(feature_corr.shape, dtype=bool))
# Blank those cells out so only the lower triangle is left.
feature_corr = feature_corr.mask(mask)
feature_corr

In [None]:
ax = sns.heatmap(
    feature_corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# Slant the column labels so they do not run into each other.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/moa/moa_abrupt_feature_corr.png', bbox_inches='tight')

In [None]:
# NOTE(review): `moa_dir` and `moa_files` are not assigned in any cell shown in this
# notebook; unless they are defined elsewhere this raises NameError on a fresh run.
moa_csv_path = os.path.join(dataset_path, moa_dir, moa_files[1])
moa_df = pd.read_csv(moa_csv_path)
moa_df

In [None]:
plt.figure(figsize=(20,7))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=moa_df, x='time', y='value')

plt.xlabel('Time', fontsize=15, weight='bold')
plt.ylabel('Value', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/moa/moa_gradual.png', bbox_inches='tight', dpi=500)

### Feature Analysis

In [None]:
# Skip the first `window_size` rows, where the rolling features are zero-filled.
# Using the shared constant (instead of a literal 100) keeps this cut in step
# with the rolling helpers if the window is ever changed.
values = {'Value': moa_df['value'].iloc[window_size:]}
for feature, function in feature_funcs.items():
    values[feature] = function(moa_df['value']).iloc[window_size:]

moa_features = pd.DataFrame.from_dict(values)
moa_features

In [None]:
# Pairwise Pearson correlation (pandas' default method) between value and features.
feature_corr = moa_features.corr(method='pearson')
feature_corr

In [None]:
# True on and above the diagonal, i.e. the redundant half of the symmetric matrix.
mask = np.triu(np.ones(feature_corr.shape, dtype=bool))
# Blank those cells out so only the lower triangle is left.
feature_corr = feature_corr.mask(mask)
feature_corr

In [None]:
ax = sns.heatmap(
    feature_corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# Slant the column labels so they do not run into each other.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/moa/moa_gradual_feature_corr.png', bbox_inches='tight')

In [None]:
# NOTE(review): `moa_dir` and `moa_files` are not assigned in any cell shown in this
# notebook; unless they are defined elsewhere this raises NameError on a fresh run.
moa_csv_path = os.path.join(dataset_path, moa_dir, moa_files[2])
moa_df = pd.read_csv(moa_csv_path)
moa_df

In [None]:
plt.figure(figsize=(20,7))
# `palette` is not passed: with no `hue` mapped seaborn ignores it (newer releases also warn).
lp = sns.lineplot(data=moa_df, x='time', y='value')

plt.xlabel('Time', fontsize=15, weight='bold')
plt.ylabel('Value', fontsize=15, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/moa/moa_incremental.png', bbox_inches='tight', dpi=500)

### Feature Analysis

In [None]:
# Skip the first `window_size` rows, where the rolling features are zero-filled.
# Using the shared constant (instead of a literal 100) keeps this cut in step
# with the rolling helpers if the window is ever changed.
values = {'Value': moa_df['value'].iloc[window_size:]}
for feature, function in feature_funcs.items():
    values[feature] = function(moa_df['value']).iloc[window_size:]

moa_features = pd.DataFrame.from_dict(values)
moa_features

In [None]:
# Pairwise Pearson correlation (pandas' default method) between value and features.
feature_corr = moa_features.corr(method='pearson')
feature_corr

In [None]:
# True on and above the diagonal, i.e. the redundant half of the symmetric matrix.
mask = np.triu(np.ones(feature_corr.shape, dtype=bool))
# Blank those cells out so only the lower triangle is left.
feature_corr = feature_corr.mask(mask)
feature_corr

In [None]:
ax = sns.heatmap(
    feature_corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# Slant the column labels so they do not run into each other.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.xticks(fontsize=10, weight='bold')
plt.yticks(fontsize=10, weight='bold')
fig1 = plt.gcf()  # handle taken before show() so the savefig line below keeps working
plt.show()  # the former trailing plt.draw() is dropped: nothing is left to redraw after show()
# fig1.savefig('/home/szamani/Desktop/mcmaster/Thesis/figures/eda/moa/moa_incremental_feature_corr.png', bbox_inches='tight')