In [None]:
import os
import re
import shap
import time
import pickle
import openai
import textstat
import numpy as np
import pandas as pd
import missingno as msno
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from plotly.colors import n_colors
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from xgboost import XGBRegressor, XGBClassifier
from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy.stats import kruskal, shapiro, spearmanr, kendalltau

In [None]:
# Significance level for the statistical tests: the probability of rejecting
# the null hypothesis when it is actually true (Type I error rate).
alpha = 0.05

# Seed shared by every stochastic step so results are reproducible.
random_state = 42

# Load the JavaScript that SHAP's interactive plots need inside the notebook.
shap.initjs()

# Show complete frames (all rows, all columns, untruncated cell text).
pd.set_option("display.max_rows", None, "display.max_columns",
              None, 'display.max_colwidth', None)

os.environ["TOKENIZERS_PARALLELISM"] = "true"

# SECURITY: credentials must never be committed to the notebook. The key is
# read from the environment only; if OPENAI_API_KEY is unset this is None and
# OpenAI calls will fail with an authentication error instead of silently
# using a leaked secret. Any key that was previously hardcoded in this cell
# must be treated as compromised and revoked.
openai.api_key = os.getenv('OPENAI_API_KEY')


In [None]:
def _ensure_dir(*parts):
    """Join ``parts`` into a path, create that directory if needed, return the path.

    ``exist_ok=True`` makes the call idempotent (safe on every re-run of the
    cell) and avoids the race between a separate existence check and creation.
    """
    path = os.path.join(*parts)
    os.makedirs(path, exist_ok=True)
    return path


# Everything lives next to the notebook's working directory, i.e. under the
# parent of the current working directory.
_project_root = os.path.dirname(os.getcwd())

# Input data location (only referenced here, not created).
path_dataset = os.path.join(_project_root, 'Dataset')

# Output tree:
#   Result/
#     General/
#     Challenge/
#       Prevalence/  Difficulty/  Evolution/
#       Card Sorting/
#         Anomaly/  Root Cause/  Solution/  Inquiry/
path_result = _ensure_dir(_project_root, 'Result')

path_general = _ensure_dir(path_result, 'General')

path_challenge = _ensure_dir(path_result, 'Challenge')

path_prevalence = _ensure_dir(path_challenge, 'Prevalence')

path_difficulty = _ensure_dir(path_challenge, 'Difficulty')

path_evolution = _ensure_dir(path_challenge, 'Evolution')

path_cardsorting = _ensure_dir(path_challenge, 'Card Sorting')

path_anomaly = _ensure_dir(path_cardsorting, 'Anomaly')

path_root_cause = _ensure_dir(path_cardsorting, 'Root Cause')

path_solution = _ensure_dir(path_cardsorting, 'Solution')

path_inquiry = _ensure_dir(path_cardsorting, 'Inquiry')

In [None]:
def nan_helper(y):
    """Locate the NaNs in ``y`` and provide a mask-to-index converter.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, boolean mask that is True wherever ``y`` is NaN
        - index, a callable ``index(logical_indices)`` returning the integer
          positions at which a boolean mask is True
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x = nan_helper(y)
        >>> y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    """
    def index(logical_indices):
        # np.nonzero returns one index array per dimension; keep the first.
        return np.nonzero(logical_indices)[0]

    nan_mask = np.isnan(y)
    return nan_mask, index


def extrapolate_nans_1d(y):
    """Fill the NaNs of a 1d array in place and return the same array.

    Interior NaNs are replaced by linear interpolation between the nearest
    valid neighbours, using the element positions as the x-axis. Leading and
    trailing NaNs take the value of the first / last valid element, because
    ``np.interp`` holds the edge values constant outside the sample range.

    Input:
        - y, 1d numpy float array with possible NaNs (modified in place)
    Output:
        - y, the same array object with its NaNs filled

    An empty array, an array without NaNs, or an array consisting only of
    NaNs is returned unchanged: in the first and last case there are no valid
    samples to interpolate from, and ``np.interp`` would raise ``ValueError``.
    """
    nans = np.isnan(y)
    # Nothing to fill, or nothing to fill from.
    if not nans.any() or nans.all():
        return y
    missing_positions = np.flatnonzero(nans)
    valid_positions = np.flatnonzero(~nans)
    y[nans] = np.interp(missing_positions, valid_positions, y[~nans])
    return y