In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell runs, so edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load the memory_profiler IPython extension (provides the %memit / %mprun magics).
# NOTE(review): none of the cells visible here uses a memory-profiling magic —
# confirm this extension is still needed.
%load_ext memory_profiler

In [None]:
import pandas as pd
from utils.import_utils import *
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from src.feature_processing import *
from src.item_processing import  *
from src.utils.general_utils import *
from src.utils.stats_utils import *
import plotly.express as px
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Compose the project configuration with Hydra: `main.yaml` is read from the
# `../configuration` directory and exposed to the rest of the notebook as `config`.
with initialize(version_base='1.1', config_path='../configuration'):
    config = compose(config_name='main.yaml')

In [None]:
# Instantiate the project's item-level feature processor from the composed config.
# NOTE(review): presumably the constructor also loads the item table exposed
# as `features_class.df_item` — confirm against ItemFeatureProcessing.
features_class = ItemFeatureProcessing(config)

In [None]:
# Alias the processor as `self` so that later cells can call its methods in
# method-body form (e.g. `self.filter_variable_name_by_frequency(...)`).
self = features_class

# Item-level table that the analysis below works on.
df_item = self.df_item

# First Digit ITEM Level Processing

In [None]:
# Feature column analysed in this section and the score column derived from it.
feature_name = 'f__numeric_response'
score_name = 's__numeric_response'

# Work on an independent copy restricted to the rows where the feature is present.
df = df_item.loc[df_item[feature_name].notna()].copy()

# Keep only the variables that pass the frequency filter
# (frequency=100 records, min_unique_values=3 distinct values).
valid_variables = self.filter_variable_name_by_frequency(
    df,
    feature_name,
    frequency=100,
    min_unique_values=3,
)

In [None]:
# Narrow the list further: keep only the variables whose values span at least
# three different orders of magnitude.
valid_variables = filter_variables_by_magnitude(
    df,
    feature_name,
    valid_variables,
    min_order_of_magnitude=3,
)

In [None]:
# Compute the Jensen divergence of the first-digit distribution for each
# (variable_name, responsible) pair.
# The divergence is a value between 0 and 1 describing how much the first-digit
# distribution of one responsible differs from that of all the others:
# the higher the value, the larger the difference.
# The Benford/Jensen divergence is only computed for responsible/variable_name
# pairs that have at least 50 records (minimum_sample=50).
# A responsible is then marked as "anomalous" for a variable when its divergence
# exceeds that variable's median divergence by more than 50%.
benford_jensen_df = apply_benford_tests(df, valid_variables, 'responsible', feature_name, apply_first_digit=True, minimum_sample=50)

variable_list = benford_jensen_df['variable_name'].unique()
for var in variable_list:

    # Rows of the current variable that actually have a divergence value.
    bj_mask = (benford_jensen_df['variable_name'] == var) & benford_jensen_df[feature_name].notna()
    bj_df = benford_jensen_df[bj_mask].copy()
    if bj_df.empty:
        continue

    bj_df = bj_df.sort_values(feature_name, ascending=True)

    # Flag (1) every responsible whose divergence is more than 50% above the median.
    median_value = bj_df[feature_name].median()
    anomaly_threshold = median_value + 50 / 100 * median_value
    bj_df[score_name] = (bj_df[feature_name] > anomaly_threshold).astype(int)

    # Propagate the per-responsible flag back to the item-level rows of this variable.
    # Responsibles without a divergence value (e.g. below the minimum sample) stay NaN.
    var_mask = df['variable_name'] == var
    score_by_responsible = bj_df.set_index('responsible')[score_name]
    df.loc[var_mask, score_name] = df.loc[var_mask, 'responsible'].map(score_by_responsible)

    # One bar chart per variable: blue = within threshold, red = flagged.
    no_anomaly_df = bj_df[bj_df[score_name] == 0]
    anomaly_df = bj_df[bj_df[score_name] == 1]

    fig, ax = plt.subplots()
    for subset, color, label in (
        (no_anomaly_df, 'blue', 'No Anomaly'),
        (anomaly_df, 'red', 'Anomaly'),
    ):
        # Skip empty groups so the legend never has to render an empty bar container.
        if not subset.empty:
            ax.bar(subset['responsible'], subset[feature_name], color=color, label=label)

    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(var)
    ax.set_xlabel('responsible')
    ax.set_ylabel('Jensen divergence (first digit)')
    # The bars carry labels; without a legend the colour coding is never explained.
    ax.legend()
    plt.show()
    # Release the figure: this loop creates one figure per variable.
    plt.close(fig)

# First Digit Responsible Level Processing

In [None]:
# Average first-digit anomaly flag per responsible:
# first the mean flag per (responsible, variable_name) pair ...
data = (
    df.groupby(['responsible', 'variable_name'])[score_name]
    .mean()
    .reset_index()
)

# ... then the mean of those per-variable means for each responsible, as a bar chart.
entropy_ = data.groupby('responsible')[score_name].mean()
entropy_.plot.bar()
plt.show()

In [None]:
# Re-run the Benford tests without the first-digit restriction
# (apply_first_digit=False) and with a minimum sample of one record, then
# display the resulting table.
# Fix: the keyword was misspelled `minimum_sampe`; the parameter is
# `minimum_sample`, as used by the first-digit call earlier in this notebook.
benford_jensen_df = apply_benford_tests(df, valid_variables, 'responsible', feature_name, apply_first_digit=False, minimum_sample=1)
benford_jensen_df