In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Load the memory_profiler IPython extension (memory-usage magics such as %memit).
%load_ext memory_profiler

In [None]:

from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from src.item_processing import  *
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Compose the project configuration from main.yaml, resolved relative to
# the ../configuration directory.
with initialize(version_base='1.1', config_path='../configuration'):
    config = compose(config_name='main.yaml')

In [None]:

# Instantiate the item-level feature processor from the composed configuration.
features_class = ItemFeatureProcessing(config)

In [None]:
# Alias the processor as `self` so its methods can be called below as
# `self.<method>(...)`, then grab the item-level frame it exposes.
self = features_class
df_item = self.df_item

In [None]:
feature_name, score_name = 'f__first_decimal', 's__first_decimal'

# Work on an independent copy restricted to rows where the feature is present.
has_feature = df_item[feature_name].notna()
df = df_item.loc[has_feature].copy()

# Keep only the variables that pass the frequency filter
# (frequency=100, min_unique_values=3).
valid_variables = self.filter_variable_name_by_frequency(
    df,
    feature_name,
    frequency=100,
    min_unique_values=3,
)


In [None]:
# Draw the feature distribution per variable, 50 variables per figure so the
# x axis stays readable.
for index_range in range(0, len(valid_variables), 50):
    variables = valid_variables[index_range:index_range + 50]
    in_batch = df['variable_name'].isin(variables)

    plt.figure(figsize=(15, 6))
    sns.boxplot(data=df[in_batch], x='variable_name', y=feature_name)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Fit one COF outlier model per variable on the single feature column, store
# the predicted label in `score_name`, and plot the feature histogram split
# by that label.
for var in valid_variables:
    mask = (df['variable_name'] == var)

    # NOTE(review): this value is computed but never used; the model below is
    # built with a hard-coded contamination of 0.11. Confirm which is intended.
    contamination = self.get_contamination_parameter(feature_name, method='medfilt', random_state=42)
    model = COF(contamination=0.11)

    # Slice the rows of this variable once and reuse it for fit, predict and binning.
    var_features = df.loc[mask, [feature_name]]
    model.fit(var_features)
    df.loc[mask, score_name] = model.predict(var_features)

    # Rows labelled 0 vs rows labelled 1 by the model for this variable.
    data_true = df[(df[score_name] == 0) & mask][feature_name]
    data_false = df[(df[score_name] == 1) & mask][feature_name]

    # Shared bin edges so both histograms are directly comparable.
    bins = np.histogram_bin_edges(var_features[feature_name], bins=10)
    plt.hist(data_true, bins=bins, alpha=0.5, color='blue', label='True')
    plt.hist(data_false, bins=bins, alpha=0.5, color='red', label='False')
    plt.title(var)
    # Without a legend the `label=` arguments above are never displayed.
    plt.legend()
    plt.show()

In [None]:
# Same batched boxplots as before (50 variables per figure), now split by the
# score column via `hue`.
for index_range in range(0, len(valid_variables), 50):
    variables = valid_variables[index_range:index_range + 50]
    in_batch = df['variable_name'].isin(variables)

    plt.figure(figsize=(15, 6))
    sns.boxplot(
        data=df[in_batch],
        x='variable_name',
        y=feature_name,
        hue=score_name,
    )
    plt.xticks(rotation=90)
    plt.show()

# First Decimal UNIT Level Processing

In [None]:
# Average the item-level score within each interview to get one value per unit,
# then look at how those unit-level averages are distributed.
data = (
    df.groupby(['interview__id'])
    .agg({score_name: 'mean'})
    .reset_index()
)
data[score_name].hist()
plt.title(score_name)
plt.show()

In [None]:
# Overall summary: number of units and the mean of their unit-level scores.
unit_scores = data[score_name]
total_unit = data['interview__id'].count()
mean_value1 = unit_scores.mean()
# mean_value2 duplicates mean_value1 (same column, same statistic).
mean_value2 = mean_value1
print(f" Total UNITS: {total_unit}, with an average of anomalies in selected items {mean_value1}")

In [None]:
# Mean score per interview unit, keeping the responsible alongside each unit.
data = df.groupby(['interview__id', 'responsible']).agg({score_name: 'mean'})
data = data.reset_index()

# Single pass over the units: unit count and mean unit score per responsible.
# sort=False keeps responsibles in order of first appearance for the printout.
per_responsible = data.groupby('responsible', sort=False).agg(
    total_unit=('interview__id', 'count'),
    mean_value1=(score_name, 'mean'),
)

for resp, total_unit, mean_value1 in per_responsible.itertuples(index=True, name=None):
    print(f"{resp} - Total UNITS: {total_unit}, with an average of anomalies in selected items {mean_value1}")

# Tidy two-column frame, ordered by responsible, used for the bar chart.
resp_df = per_responsible[['mean_value1']].sort_index().reset_index()
resp_df.columns = ['responsible', 'mean_value1']
resp_df.set_index('responsible')['mean_value1'].plot(kind='bar')
plt.title(score_name)
plt.show()
