In [None]:
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from src.unit_proccessing import  *
from src.utils.stats_utils import *
import pandas as pd
import matplotlib.pyplot as plt
from pyod.models.ecod import ECOD


In [None]:
# Compose the `main.yaml` configuration from the `../configuration` directory
# with Hydra; the resulting `config` object is passed to the data processor below.
with initialize(version_base='1.1', config_path='../configuration'):
    config = compose(config_name='main.yaml')

In [None]:
# Instantiate UnitDataProcessing with the composed configuration. The cells
# below read its `df_item` and `df_unit` attributes.
features_class = UnitDataProcessing(config)

# Answer Hour Set: ITEM-Level Processing

##### Get the feature and process it

In [None]:
# Column holding the item-level feature, and the column the anomaly score
# is written to by later cells.
feature_name = 'f__answer_hour_set'
score_name = 's__answer_hour_set'

df_item = features_class.df_item
df_unit = features_class.df_unit

# Independent working frame: only the rows where the feature is present,
# with the feature values cast to float.
df = (
    df_item
    .dropna(subset=[feature_name])
    .astype({feature_name: float})
)

In [None]:
# Histogram of the raw feature values (48 equal-width bins), with a title and
# axis labels so the figure can be read on its own. The trailing `;` keeps the
# return value of `ax.set` out of the cell output.
ax = df[feature_name].hist(bins=48)
ax.set(title=f'Distribution of {feature_name}', xlabel=feature_name, ylabel='Count');

In [None]:
# Rank every distinct hour by how often it occurs. value_counts() lists the
# most frequent value first, so rank 0 is the most common hour and the
# largest rank is the rarest one.
sorted_hours = df[feature_name].value_counts().index
hour_to_rank = dict(zip(sorted_hours, range(len(sorted_hours))))
# Store each row's hour rank in the 'frequency' column (no sorting of the
# frame takes place) and look at the distribution of the ranks.
df['frequency'] = df[feature_name].map(hour_to_rank)
df['frequency'].hist(bins=48)

##### Find and plot anomalies

In [None]:
# ECOD is a non-parametric detector built on empirical cumulative distribution
# functions; it is used here to flag anomalous hour ranks.
# The contamination is fixed at 0.11, which appeared to be a good threshold
# across several different surveys. Alternatively, a FILTER thresholder
# (e.g. FILTER(method='medfilt', random_state=42)) can be passed as
# `contamination` to estimate it from the data, but be careful: with FILTER
# the fit hangs for a while.
CONTAMINATION = 0.11
model = ECOD(contamination=CONTAMINATION)
model.fit(df[['frequency']])

In [None]:
# Label every row with the fitted detector and keep the labels in the score
# column. The cells below treat 0 as a normal row and 1 as an anomaly.
anomaly_labels = model.predict(df[['frequency']])
df[score_name] = anomaly_labels

In [None]:
# Show the number of rows per label together with the share of each label.
label_counts = df[score_name].value_counts()
label_counts, label_counts / df[score_name].count()

In [None]:
# Lowest and highest frequency rank among the rows labelled 0 (not anomalous).
inlier_ranks = df.loc[df[score_name] == 0, 'frequency']
inlier_ranks.min(), inlier_ranks.max()

In [None]:
# Rows flagged at the "high frequency" end (rank not above the lowest rank of
# any unflagged row) should not be considered anomalies: reset their score to zero.
lowest_inlier_rank = df.loc[df[score_name] == 0, 'frequency'].min()
df.loc[df['frequency'] <= lowest_inlier_rank, score_name] = 0

In [None]:
# Overlay the feature distributions of the rows scored 0 (not anomalous) and
# the rows scored 1 (anomalous). Both histograms use the same bin edges,
# computed on the full column, so they are directly comparable.
bins = np.histogram_bin_edges(df[feature_name], bins=48)
data_true = df[df[score_name]==0][feature_name]   # rows kept as normal
data_false = df[df[score_name]==1][feature_name]  # rows flagged as anomalies

plt.hist(data_true, bins=bins, alpha=0.5, color='blue', label='Not anomalous (score 0)')
plt.hist(data_false, bins=bins, alpha=0.5, color='red', label='Anomalous (score 1)')
plt.xlabel(feature_name)
plt.ylabel('Count')
# The `label=` arguments are only rendered when a legend is requested.
plt.legend()
plt.show()

# Answer Hour Set: UNIT-Level Processing

In [None]:
# One row per interview: the mean of the item-level scores, i.e. the share of
# that interview's items that were flagged.
data = df.groupby('interview__id', as_index=False)[score_name].mean()
data[score_name].hist()

In [None]:

# Units (interviews) whose mean item score is above zero contain at least one
# flagged item.
total_anomalies  = data[data[score_name]>0]['interview__id'].count()
total_unit = data['interview__id'].count()
# The message prints a "%" sign, so scale the ratio to a real percentage
# (previously the raw fraction, e.g. 0.12, was shown as "0.12%").
perc = round(100 * total_anomalies / total_unit, 2)
print(f"UNITS with anomalies: {total_anomalies} of {total_unit}, ({perc}%)")

In [None]:
# One row per (interview, responsible) pair with the share of flagged items.
# The mean is equivalent to the former sum/count but needs a single groupby.
data = (
    df.groupby(['interview__id', 'responsible'])[score_name]
    .mean()
    .reset_index()
)

# For every value of `responsible`, compute the percentage of its units that
# contain at least one flagged item. sort=False keeps the order of first
# appearance, as iterating over `.unique()` did.
records = []
for resp, resp_units in data.groupby('responsible', sort=False):
    total_anomalies = (resp_units[score_name] > 0).sum()
    total_unit = len(resp_units)
    # Scale to a real percentage: the message and the chart both present it with "%".
    perc = round(100 * total_anomalies / total_unit, 2)
    records.append({'responsible': resp, 'perc': perc})
    print(f"{resp} - UNITS with anomalies: {total_anomalies} of {total_unit}, ({perc}%)")

# Same final layout as before: one row per responsible, columns 'responsible' and 'perc'.
resp_perc = pd.DataFrame(records, columns=['responsible', 'perc'])
ax = resp_perc.set_index('responsible')['perc'].plot(kind='bar')
ax.set_ylabel('Units with anomalies (%)');

In [None]:
##########################################################################################################################################################

#### Visually check that the detection keeps working when the hours are shifted to another time zone

In [None]:
# Re-run the whole detection on copies of the data whose hours are shifted by
# every offset from 0 to 23, and plot each result for a visual check.
# The loop works on its own variables so that `df`, `model`, `bins`,
# `data_true` and `data_false` from the cells above are not overwritten.

# Loop-invariant preparation: rows where the feature is present, cast to float.
df_base = df_item[df_item[feature_name].notna()].copy()
df_base[feature_name] = df_base[feature_name].astype(float)

for tz in range(24):
    # Get the feature shifted (vectorised; wraps around at 24)
    df_shifted = df_base.assign(**{feature_name: (df_base[feature_name] - tz) % 24})

    # Create the frequency-rank column from the shifted feature:
    # value_counts() puts the most frequent hour first, so it gets rank 0.
    shifted_hours = df_shifted[feature_name].value_counts().index
    shifted_rank = {hour: rank for rank, hour in enumerate(shifted_hours)}
    df_shifted['frequency'] = df_shifted[feature_name].map(shifted_rank)

    # Train and find anomalies, with the same contamination as the unshifted fit
    shift_model = ECOD(contamination=0.11)
    shift_model.fit(df_shifted[['frequency']])
    df_shifted[score_name] = shift_model.predict(df_shifted[['frequency']])
    # Rows flagged at the high-frequency end are not considered anomalies
    shift_min_rank = df_shifted.loc[df_shifted[score_name] == 0, 'frequency'].min()
    df_shifted.loc[df_shifted['frequency'] <= shift_min_rank, score_name] = 0

    # Plot the anomalies for each time shift
    shift_bins = np.histogram_bin_edges(df_shifted[feature_name], bins=48)
    shift_normal = df_shifted.loc[df_shifted[score_name] == 0, feature_name]
    shift_flagged = df_shifted.loc[df_shifted[score_name] == 1, feature_name]
    plt.hist(shift_normal, bins=shift_bins, alpha=0.5, color='blue', label='Not anomalous (score 0)')
    plt.hist(shift_flagged, bins=shift_bins, alpha=0.5, color='red', label='Anomalous (score 1)')
    plt.title("Time shift +{}".format(str(tz)))
    plt.xlabel(feature_name)
    plt.ylabel('Count')
    # The `label=` arguments are only rendered when a legend is requested.
    plt.legend()
    plt.show()