In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%load_ext memory_profiler

In [None]:
import pandas as pd
from utils.import_utils import *
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from src.feature_processing import *
from src.unit_proccessing import  *
from src.utils.stats_utils import *
import plotly.express as px
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Compose the Hydra config `main.yaml`; `initialize` is pointed at the
# '../configuration' directory and pinned to version_base '1.1'.
# The resulting `config` object is what the processing class below is built from.
with initialize(config_path='../configuration', version_base='1.1'):
    config = compose(config_name='main.yaml')

In [None]:
# Build the unit-level processor from the composed config; the following cells
# read their dataframes from attributes of this object.
features_class = UnitDataProcessing(config)

In [None]:
# Bind the processor's `df_paradata` attribute to a notebook-level name.
# NOTE(review): the attribute may be computed lazily on first access — confirm in UnitDataProcessing.
df_paradata = features_class.df_paradata

In [None]:
# Bind the processor's `df_item` attribute to a notebook-level name.
df_item = features_class.df_item
# `self` is an alias for the processor instance: later cells access
# `self.df_unit_score` through it. NOTE(review): presumably kept so method
# bodies can be pasted into cells unchanged — confirm before renaming.
self = features_class

In [None]:
# Bind the processor's `df_unit` attribute to a notebook-level name.
df_unit = features_class.df_unit

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, AffinityPropagation, MeanShift
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

def clustering_comparison(df, random_state=None):
    """Fit several clustering algorithms on ``df`` and rank them by silhouette score.

    Algorithms compared: KMeans (2 clusters), DBSCAN, agglomerative clustering
    (2 clusters), affinity propagation, mean shift and a 2-component Gaussian
    mixture. All are fitted on ``df`` as given (no scaling is applied here).

    Parameters:
    - df: Numeric feature matrix (DataFrame or array-like) without missing values.
    - random_state: Optional seed forwarded to KMeans and GaussianMixture so
      repeated runs give the same labels. Defaults to None (unseeded), which
      matches the previous behaviour.

    Returns:
    - DataFrame indexed by algorithm name with a single 'Silhouette Score'
      column, sorted from best to worst. An algorithm is left out when its
      labelling cannot be scored (see below).
    """
    # Adjust n_clusters / n_components based on domain knowledge.
    models = [
        ('KMeans', KMeans(n_clusters=2, random_state=random_state)),
        ('DBSCAN', DBSCAN()),
        ('Agglomerative', AgglomerativeClustering(n_clusters=2)),
        ('AffinityPropagation', AffinityPropagation()),
        ('MeanShift', MeanShift()),
        ('GMM', GaussianMixture(n_components=2, random_state=random_state)),
    ]

    n_samples = len(df)
    results = {}
    for name, model in models:
        labels = model.fit_predict(df)
        # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1.
        # Any of these algorithms can produce a degenerate labelling (a single
        # cluster, or one cluster per sample), so every one of them is guarded
        # and simply skipped when it cannot be scored.
        # Note: DBSCAN's noise label (-1) is counted as a label of its own.
        n_labels = len(np.unique(labels))
        if 2 <= n_labels <= n_samples - 1:
            results[name] = silhouette_score(df, labels)

    # Convert results to a DataFrame for better visualization
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Silhouette Score'])

    return results_df.sort_values(by='Silhouette Score', ascending=False)

# Testing the function
# df = df_unit_score[score_columns].copy()
# #df = pd.DataFrame(scaler.fit_transform(df), columns=score_columns)
# # df = pd.DataFrame(...)  # Your data here
# df = df.fillna(0)
# print(clustering_comparison(df))


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.ensemble import IsolationForest

def compare_clustering_algorithms(data, random_state=None):
    """Compare clustering / outlier-detection algorithms by silhouette score.

    The data is standardized with StandardScaler, then KMeans (2 clusters),
    agglomerative clustering (2 clusters), DBSCAN (eps=0.5, min_samples=5) and
    an Isolation Forest (contamination=0.2) are fitted on the scaled data.

    Parameters:
    - data: Numeric feature matrix (DataFrame or array-like) without missing values.
    - random_state: Optional seed forwarded to KMeans and IsolationForest for
      reproducible results. Defaults to None (unseeded), which matches the
      previous behaviour.

    Returns:
    - dict mapping algorithm name to its silhouette score (higher is better).
      The value is NaN when the algorithm's labelling cannot be scored.
    """
    # Scaling the data
    data_scaled = StandardScaler().fit_transform(data)
    n_samples = data_scaled.shape[0]

    # Defining the clustering algorithms
    algorithms = {
        # You might want to find the optimal number of clusters first
        'KMeans': KMeans(n_clusters=2, random_state=random_state),
        'Agglomerative': AgglomerativeClustering(n_clusters=2),
        'DBSCAN': DBSCAN(eps=0.5, min_samples=5),
        # Contamination is the proportion of outliers in the data set
        'Isolation Forest': IsolationForest(contamination=0.2, random_state=random_state),
    }

    # Applying the clustering algorithms and getting silhouette scores
    silhouette_scores = {}
    for name, algo in algorithms.items():
        labels = algo.fit_predict(data_scaled)
        if name == "Isolation Forest":
            # -1 for outliers, 1 for inliers -> transforming to 0 for inliers, 1 for outliers
            labels = (labels == -1).astype(int)

        # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1
        # (e.g. DBSCAN marking every point as noise yields a single label).
        # Report NaN for that algorithm instead of aborting the whole comparison.
        n_labels = len(set(labels.tolist()))
        if 2 <= n_labels <= n_samples - 1:
            silhouette_scores[name] = silhouette_score(data_scaled, labels)
        else:
            silhouette_scores[name] = float('nan')

    return silhouette_scores






In [None]:
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import KMeans
import numpy as np

def clustering_stability(data, model, perturbation_factor=0.01, random_seed=None):
    """
    Check the stability of a clustering model by applying perturbation to the data.

    The model is fitted once on the original data and once on a copy with
    Gaussian noise added; the two labelings are compared with the Adjusted
    Rand Index.

    Parameters:
    - data: The dataset to be clustered (must expose ``.shape`` and support
      element-wise addition with a NumPy array, e.g. ndarray or DataFrame).
    - model: A clustering model that has a ``fit_predict`` method, e.g. KMeans from sklearn.
    - perturbation_factor: A small multiplier for the standard-normal noise added to the data.
    - random_seed: Optional seed for reproducibility. When given, the global
      NumPy RNG is seeded with it; when None, the global RNG is left untouched.

    Returns:
    - ARI value: A value between -1 and 1. Values close to 1 indicate high stability.
    """
    # Seed only on request. np.random.seed(None) would re-seed the global RNG
    # from OS entropy and silently discard any seed set earlier in the notebook.
    if random_seed is not None:
        np.random.seed(random_seed)

    # Fit the model on the original data.
    # NOTE(review): a model created without its own random_state presumably
    # draws from the same global RNG, so the fit/noise order is kept as-is.
    original_labels = model.fit_predict(data)

    # Add small noise to the data
    noise = np.random.randn(*data.shape)
    perturbed_data = data + perturbation_factor * noise

    # Fit the model on the perturbed data
    perturbed_labels = model.fit_predict(perturbed_data)

    # Compute Adjusted Rand Index to check stability
    return adjusted_rand_score(original_labels, perturbed_labels)

In [None]:
# Score columns (all prefixed `s__`) used as clustering features.
columns = ['s__answer_changed',
       's__answer_duration_lower_outliers',
       's__answer_duration__upper_outliers', 's__answer_position',
       's__answer_removed', 's__answer_selected', 's__answer_time_set',
       's__first_decimal', 's__first_digit', 's__proximity_counts',
       's__spatial_outlier', 's__gps', 's__multi_option_question',
       's__number_answered', 's__number_unanswered', 's__pause_count',
       's__pause_duration', 's__sequence_jump', 's__single_question',
       's__time_changed', 's__total_duration', 's__total_elapse' ]


# Reduced alternative feature subset, kept for reference (uncomment to use).
# columns = ['s__answer_changed',
#        's__answer_duration_lower_outliers',
#        's__answer_duration__upper_outliers', 
#        's__answer_removed', 's__answer_selected', 's__answer_time_set',
#        's__first_decimal', 's__first_digit', 's__proximity_counts',
#        's__spatial_outlier', 's__gps'
#        , 's__number_unanswered', 's__pause_count',
#        's__pause_duration', 's__sequence_jump', 
#        's__time_changed', 's__total_elapse' ]

# Standardize the selected score columns. Missing scores are replaced by the
# sentinel -1 *before* scaling, so the sentinel takes part in the mean/variance.
scaler = StandardScaler()
df = self.df_unit_score[columns].copy()
df = df.fillna(-1)
# Keep the original row index: fit_transform returns a bare ndarray, and without
# `index=` the frame would get a fresh RangeIndex, so cluster labels could no
# longer be aligned with the rows of `self.df_unit_score`.
df = pd.DataFrame(scaler.fit_transform(df), columns=columns, index=df.index)
# `X` is the feature matrix used for modelling; `df` later receives a label column.
X = df.copy()

In [None]:
# Print the silhouette score per algorithm for the prepared feature frame.
# `df` was already standardized when it was built; compare_clustering_algorithms
# applies StandardScaler again internally.
#print(clustering_comparison(df))
print(compare_clustering_algorithms(df))

In [None]:
# One histogram per (unscaled) score column, each shown as its own figure
# and titled with the column name.
for col in columns:
    self.df_unit_score[col].hist().set_title(col)
    plt.show()

In [None]:
# Seed both the estimator and the noise. With a fixed `random_state` the two
# KMeans fits start from the same seed, so the ARI reflects sensitivity to the
# perturbation rather than run-to-run initialization noise, and the printed
# value is reproducible.
kmeans = KMeans(n_clusters=2, random_state=0)
stability_score = clustering_stability(X, kmeans, perturbation_factor=0.1, random_seed=0)

print(f"Stability Score (ARI): {stability_score:.4f}")

In [None]:
# Final 2-cluster KMeans on the standardized features (seeded for reproducibility).
# You might want to change the number of clusters based on domain knowledge.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans_labels = kmeans.fit_predict(X)
df['score'] = kmeans_labels
y = kmeans_labels
# Cluster sizes, cluster shares, and the silhouette score.
# The silhouette is computed on `X` (features only): `df` now also holds the
# 'score' label column, and scoring on it would leak the labels into the
# distance computation and inflate the result.
print(df['score'].value_counts(), df['score'].value_counts()/df['score'].count(), silhouette_score(X, kmeans_labels))

In [None]:
import shap

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
# 70/30 split of features and cluster labels; seeded so the split (and the
# feature importances derived from it) is reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Fit a small random forest that predicts the cluster label from the features,
# then report which features drive the cluster assignment.
rf = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=0)
rf.fit(X_train, Y_train)

importances = rf.feature_importances_
print(importances)

# Ascending order, so the most important feature is drawn last (top bar).
indices = np.argsort(importances)
features = X_train.columns
bar_positions = range(len(indices))

plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, [features[i] for i in indices])
plt.title('Feature Importance')
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# TODO: write documentation on detecting faked interviews.
# Idea: create a platform that stores the data in compressed, anonymised form together with the feedback on the "fraud", to improve classification in the future.

# Open question: take the case of a household on one very long street — how would the GPS anomaly behave in that case?
