In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# NOTE(review): this call opens a new, empty 50x40 inch figure at 300 dpi as a
# side effect of the setup cell; confirm it is really wanted.
figure(figsize=(50,40), dpi=300)
# Default size (in inches) for figures created later without an explicit figsize.
plt.rcParams["figure.figsize"] = (50,40)


from lifelines import KaplanMeierFitter, NelsonAalenFitter
from sklearn.cluster import KMeans
import datetime as dt
import re, os, random
import joblib, glob


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import graphviz 
from sklearn.tree import export_graphviz


from sksurv.tree import SurvivalTree
from sksurv.util import Surv
from tree_exporter import plot_tree
from sklearn import tree
from sklearn.tree import _tree


from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
from scipy.stats import skew, kurtosis, entropy


# Seed NumPy's global generator and the stdlib `random` module with the same value.
np.random.seed(122)
random.seed(122)

# NOTE(review): this silences every warning category for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [None]:
def survive(data, duration, event, fitter='kaplan'):
    """Fit a lifelines estimator on one duration/event column pair.

    Parameters
    ----------
    data : mapping or DataFrame
        Indexed with ``data[duration]`` and ``data[event]``.
    duration : column key holding the observed durations.
    event : column key holding the event-observed indicator.
    fitter : str
        ``'kaplan'`` selects ``KaplanMeierFitter``; any other value selects
        ``NelsonAalenFitter``.

    Returns
    -------
    tuple
        ``(fitted_estimator, fitted_estimator.durations)``.

    The estimator's ``event_table`` is printed as a side effect.
    """
    if fitter == 'kaplan':
        estimator = KaplanMeierFitter()
    else:
        estimator = NelsonAalenFitter()

    estimator.fit(durations=data[duration], event_observed=data[event])
    print(estimator.event_table)
    return estimator, estimator.durations

In [None]:
def tree_to_code(tree, feature_names):
    """Print a fitted tree as nested, Python-looking ``if``/``else`` rules.

    Parameters
    ----------
    tree : fitted estimator exposing ``tree_`` (with ``feature``,
        ``threshold``, ``children_left``, ``children_right`` and
        ``n_node_samples``).
    feature_names : sequence of str
        Names indexed by the feature ids stored in ``tree_.feature``.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    * The header line lists the feature names with spaces replaced by ``_``
      and their last five characters removed; the rules themselves use the
      untouched names.
    * Each leaf line shows the leaf's sample count, then ``->`` and the number
      of nodes visited before it in this left-first, pre-order walk.
    """
    fitted = tree.tree_

    # One label per node: the split feature's name, or a placeholder on leaves.
    node_labels = []
    for feature_idx in fitted.feature:
        if feature_idx != _tree.TREE_UNDEFINED:
            node_labels.append(feature_names[feature_idx])
        else:
            node_labels.append("undefined!")

    signature_args = [raw.replace(" ", "_")[:-5] for raw in feature_names]
    print("def predict({}):".format(", ".join(signature_args)))

    visited = 0

    def recurse(node, depth):
        nonlocal visited
        visited += 1
        indent = "|    " * depth

        if fitted.feature[node] == _tree.TREE_UNDEFINED:
            # Leaf: sample count, then the visit counter (zero-based).
            print("{}return {} ".format(indent, fitted.n_node_samples[node]), '->', visited - 1)
            return

        name = node_labels[node]
        cut = np.round(fitted.threshold[node], 2)
        print("{}if {} <= {}:".format(indent, name, cut))
        recurse(fitted.children_left[node], depth + 1)
        print("{}else:  # if {} > {}".format(indent, name, cut))
        recurse(fitted.children_right[node], depth + 1)

    recurse(0, 1)

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical-clustering model.

    Parameters
    ----------
    model : fitted estimator exposing ``children_``, ``distances_`` and
        ``labels_``.
    **kwargs : forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Returns
    -------
    None

    The linkage matrix handed to SciPy has one row per merge:
    ``[child_a, child_b, distance, number_of_samples_under_the_merge]``.
    """
    n_samples = len(model.labels_)
    merges = model.children_

    # Size of every merge: ids below n_samples are original points (size 1),
    # larger ids refer to an earlier merge whose size is already stored.
    counts = np.zeros(merges.shape[0])
    for merge_idx, children in enumerate(merges):
        counts[merge_idx] = sum(
            1 if child < n_samples else counts[child - n_samples]
            for child in children
        )

    linkage_matrix = np.column_stack([merges, model.distances_, counts]).astype(float)
    shc.dendrogram(linkage_matrix, **kwargs)

In [None]:
def predect_node_for_missing_value(model, data):
    """Route a record with missing values down a fitted tree.

    At every split whose feature is missing in ``data`` the record follows the
    child that holds more training samples (ties go left); otherwise the usual
    ``value <= threshold`` rule applies.

    Parameters
    ----------
    model : fitted tree estimator exposing ``tree_`` and ``feature_names_in_``.
    data : sequence
        One record, indexed by feature position (``data[feature_id]``).
        ``None`` and NaN both count as missing.

    Returns
    -------
    tuple
        ``(0, "This person doesn't have any missing values")`` when nothing is
        missing; otherwise ``(leaf_id, np.argmax(model.tree_.value[leaf_id]))``.

    Notes
    -----
    The path is walked directly from the root instead of scanning every node
    id, so the result does not depend on how the nodes are numbered. Missing
    values are detected with ``pd.isna``; the earlier ``is np.nan`` identity
    test only recognised the ``np.nan`` singleton itself.
    """
    def _is_missing(value):
        return value is None or bool(pd.isna(value))

    if not any(_is_missing(elem) for elem in data):
        return 0, "This person doesn't have any missing values"

    tree_ = model.tree_
    children_left = np.asarray(tree_.children_left)
    children_right = np.asarray(tree_.children_right)
    feature = tree_.feature
    name = model.feature_names_in_
    threshold = tree_.threshold
    samples = tree_.n_node_samples

    # A node whose two child ids coincide has no split, i.e. it is a leaf.
    is_leaves = children_left == children_right

    print( "The binary tree structure has {n} nodes, {l} leaves and has the following tree structure:\n".format(n=tree_.node_count, l=np.sum(is_leaves)))

    current_node = 0
    depth = 0
    while not is_leaves[current_node]:
        i = current_node
        left, right = children_left[i], children_right[i]
        indent = depth * '    '
        if _is_missing(data[feature[i]]):
            print(f"{indent}node={i} is a split node: go to node {left} if node {left} counts {samples[left]} >= node{right} counts{samples[right]} else to node {right}.")
            current_node = left if samples[left] >= samples[right] else right
        else:
            print(f"{indent}node={i} is a split node: go to node {left} if X[:, {feature[i]} {name[feature[i]]}] <= {threshold[i]} else to node {right}.")
            current_node = left if data[feature[i]] <= threshold[i] else right
        depth += 1

    print(f'Node {current_node} is a leaf node. Its the node where this person will land')
    return current_node, np.argmax(tree_.value[current_node])

In [None]:
# Names of the modelling approaches; used as keys of the METHODS registry.
METHOD1 = 'METHOD1'
METHOD2 = 'METHOD2'
METHOD3 = 'METHOD3'
# Gender values the training loop filters on.
MALE = 'male'
FEMALE = 'female'
# Field names inside each METHODS entry.
CONTRIBUTORS = 'CONTRIBUTORS'
TARGET = 'TARGET'
MODEL = 'model'

In [None]:
# Snake_case column names, i.e. the names the frame carries once
# unify_column_names has been applied to it.
ID = 'national_id_number'
LAST_JOB = 'last_job_c'
FIRST_JOB = 'first_job'
EXPERIENCE = 'experience'
AGE = 'age'
GOVERNORATE = 'governorate'
DISABILITY = 'disability'
GENDER = 'gender'
EDUCATION = 'education'
# NOTE(review): "YAER" is a typo for "YEAR"; the identifier is left as is
# because other cells reference it under this spelling.
UNEMPLOYMENT_YAER = 'unemployment_year'
SAME_JOB = 'same_job'
WAGE = 'wage_adj_c'
POVERTY = 'poverty'
INDUSTRY = 'industry'
SPELL = 'unemployment_spell'
# EMPLOYMENT = 'employment'


# Names of the columns the training loop adds to its working frames.
NODES = 'nodes'
CLUSTER = 'clusters'
DURATIONS = 'durations'
NODES_DESTRO = 'nodes_destros'

In [None]:
# Column names as they exist in df1 before normalisation.
ORIGINAL_COLUMNS = ['NationalID_Number', 'LastJobC' ,'FirstJob', 'experience', 'age', 'Governorate','Disability', 'Gender', 'education','UnemploymentYear','SameJob', 'wage_adj_c','Poverty', 'Industry','UnemploymentSpell']
# The same columns, position by position, under their snake_case constants.
PROCESSED_COLUMNS = [ID, LAST_JOB, FIRST_JOB, EXPERIENCE, AGE, GOVERNORATE, DISABILITY, GENDER, EDUCATION, UNEMPLOYMENT_YAER, SAME_JOB, WAGE, POVERTY, INDUSTRY, SPELL]

# Columns passed to advanced_processing when the encoders are fitted and saved.
ATTRIBUTES = [EXPERIENCE, AGE, GOVERNORATE, DISABILITY, EDUCATION, GENDER]

# Text columns build_encoders expands with OneHotEncoder (EXPERIENCE and AGE are not encoded).
ONE_HOT_ENCODED_FEATURES = [GOVERNORATE, EDUCATION,]
# Text columns build_encoders converts with LabelEncoder.
LABEL_ENCODED_FEATURES = [GENDER, DISABILITY]

# Filled by build_encoders(..., save=True): column name -> fitted encoder.
ENCODERS = {}
API_ENCODERS = {}

# Only referenced by a commented-out max_leaf_nodes variant of the METHOD3 model.
MAX_NODES_COUNT = None
# NOTE(review): an older comment here recorded "LAST MODIFY 600 --> 50" and a
# previous value of 150; neither matches the value currently in use.
MINIMUM_LEAF_COUNT = 200
# Rows are kept only when MINIMUM_DURATION_CUT < unempl_spell < MAXIMUM_DURATION_CUT
# (presumably days, since the spell is later divided by 30 to get months - confirm).
MINIMUM_DURATION_CUT = 60
MAXIMUM_DURATION_CUT = 1800

In [None]:
# Registry of modelling approaches. For every method:
#   CONTRIBUTORS - feature columns fed to the model;
#   TARGET       - either a plain column name, or an expression template that
#                  the training loop completes with `.format(<frame expr>)`
#                  and then evaluates with `eval`;
#   MODEL        - expression evaluated with `eval` to build the estimator.
METHODS = {
    METHOD1:{
        CONTRIBUTORS: [EXPERIENCE, AGE, GOVERNORATE, DISABILITY, EDUCATION, INDUSTRY,],
        TARGET:f'{SPELL}',
        # NOTE(review): DecisionTreeRegressor is not among the names imported in
        # the setup cell, so evaluating this expression needs an extra import.
        MODEL:'DecisionTreeRegressor()'
    },
    METHOD2:{
        CONTRIBUTORS: [EXPERIENCE, AGE, GOVERNORATE, DISABILITY, EDUCATION, INDUSTRY,],
        # The column names must be quoted inside the template (as in METHOD3);
        # without the quotes `eval` would look them up as bare identifiers.
        TARGET:f"Surv.from_dataframe('{LAST_JOB}', '{SPELL}'," + " {0})",
        MODEL:f'SurvivalTree(min_samples_leaf={MINIMUM_LEAF_COUNT})'
    },
    METHOD3:{
        CONTRIBUTORS: [EXPERIENCE, AGE, GOVERNORATE, DISABILITY, EDUCATION,],
        TARGET:f"Surv.from_dataframe('{LAST_JOB}', '{SPELL}'," + " {0})",
        # MODEL:f'SurvivalTree(min_samples_leaf={MINIMUM_LEAF_COUNT}, max_leaf_nodes={MAX_NODES_COUNT})'
        MODEL:f'SurvivalTree(min_samples_leaf={MINIMUM_LEAF_COUNT}, )'
    },
}

In [None]:
# Locations relative to the notebook's working directory.
# Input: the Stata file read with pd.read_stata.
DATA_INPUT_PATH = os.path.join('.','data','Unemployment Data.dta')
# Output: CSV the training loop appends one block per gender to.
DATA_OUTPUT_PATH = os.path.join('.','data','final_outputs.csv')
# Directory for the joblib artefacts (models, mappers, encoders) and clustering CSVs.
MODELS_OUTPUT_PATH = os.path.join('.','runs')

In [None]:
# Load the raw Stata file; every column is kept.
df = pd.read_stata(DATA_INPUT_PATH)
# Preview the first rows.
df.head()

In [None]:
# df.NationalID_Number = df.NationalID_Number.astype(str)
# Latest end date found in the data.
max_date = df.end_date.max()
# True for every row except the first of each NationalID_Number group
# (shift(1) within the group yields the same ID) whose suspension reason is
# a resignation or a lay-off.
con = np.logical_and(
    df.NationalID_Number == df.groupby(['NationalID_Number'])['NationalID_Number'].shift(1),
    df.reason_suspension_tr.isin(['Resignation', 'Laid off'])
)
df['RegisterAfterFired'] = np.where(con, 1, 0)
# NEEDS MORE ATTENTION
# NOTE(review): this compares the literal 'METHOD3' with the constant METHOD3,
# whose value is that same string, so the branch always runs.
if 'METHOD3' == METHOD3:
    con = np.logical_and(
        df.last_ind == 1,
        df.reason_suspension_tr.isin(['Resignation', 'Laid off'])
    )
    # LastJobC is 0 for last-job rows ended by resignation/lay-off, 1 otherwise.
    df['LastJobC'] = np.where(con, 0, 1)
    
    # data$LastJobC[data$`_merge_with_mol`==2]         = 0
    # data$experience[data$`_merge_with_mol`==2]       = 0
    # data$end_date[data$`_merge_with_mol`==2]         = data$RegistrationdateintoNEES[data$`_merge_with_mol`==2]
    # data$econ_activity_tr[data$`_merge_with_mol`==2] = ''
    
    # NOTE(review): df_temp is modified below but never used again in the
    # visible cells - the row-bind that would append it is commented out.
    df_temp = df[df.last_ind == 1]
    # data$LastJobC               = 1
    df_temp.unempl_spell = pd.to_datetime(max_date) - pd.to_datetime(df_temp.end_date)
    df_temp.RegisterAfterFired = 1
    
    # data= rbind(data,dataTemp)  # this will create duplicate
# A case is valid when it is a first job (first_ind == 1); otherwise the
# RegisterAfterFired flag decides.
df['validCase'] = np.where(df.first_ind == 1, 1, df.RegisterAfterFired)
# Keep valid cases outside the public-administration sector.
con = np.logical_and(
    df.validCase == 1,
    np.logical_not(df.econ_activity_tr.isin(['Public administration, defense, and social security']))
)
df1 = df[con]
df1 = df1.dropna(subset=['unempl_spell'])
# Keep spells strictly between the two duration cut-offs.
con = np.logical_and(
    df1.unempl_spell > MINIMUM_DURATION_CUT,
    df1.unempl_spell < MAXIMUM_DURATION_CUT,
)
df1 = df1[con]
df1.info() # summary of what is left after filtering

In [None]:
def experience_code(value):
    """Bucket a numeric experience value into one of 0, 1, 5, 10, 15, 20.

    The bucket is chosen by the first lower bound the value strictly exceeds:
    ``> 15 -> 20``, ``> 10 -> 15``, ``> 5 -> 10``, ``> 1 -> 5``, ``> 0 -> 1``;
    anything else (including NaN, for which every comparison is false) gives 0.
    """
    for lower_bound, bucket in ((15, 20), (10, 15), (5, 10), (1, 5), (0, 1)):
        if value > lower_bound:
            return bucket
    return 0
    
    

# NOTE: this cell rewrites columns of df1 in place (e.g. `experience`), so it
# must be run exactly once per freshly filtered df1.

# - Dummy for first job:
df1['FirstJob'] = np.where(df1.first_ind == 1, 1, 0)

# - Experience: divide by 365 and round, then bucket with experience_code.
df1.experience = np.round(df1.experience / 365)
df1.experience = df1.experience.apply(experience_code)

# - Age: seconds between job_search_start and date of birth, converted to
#   365-day years and rounded to the nearest multiple of ten.
df1['age'] = (df1.job_search_start - dt.datetime(1970, 1, 1)).dt.total_seconds().astype(int) - (df1.JobSeekers_DateOfBirth - dt.datetime(1970, 1, 1)).dt.total_seconds().astype(int)
df1['age'] = (np.round((df1['age'] / (60 * 60 * 24 * 365)) / 10) * 10).astype(int)

# - Governorate:
df1.Governorate = df1.Governorate.str.lower()

# - Disability:
df1['Disability'] = df1.Disabled_tr.str.lower()

# - Gender:
df1['Gender'] = df1.Name_tr.str.lower()


# - Education Level: collapse the higher degrees into a single level. The other
#   levels ("Vocational Training", "Middle Diploma", "Secondary or Below") keep
#   their labels. `.loc` is used because assigning through
#   `df1.col[mask] = ...` is chained assignment and may not write to df1.
higher_degrees = df1.EducationalAttainment.isin(["High Diploma", "Bachelor", "Masters", "PhD"])
df1.loc[higher_degrees, 'EducationalAttainment'] = 'bachelor_or_above'
df1 = df1.rename(columns = {'EducationalAttainment':'education'})

# - Year of unemployment:
df1['UnemploymentYear'] = 0


# - Same Job:
df1['SameJob'] = np.where(df1.rep_job == 1, 'Yes', 'No')

# - Poverty:
df1['Poverty'] =  df1.poverty_score.fillna(0)

# - Industry: blank / whitespace-only strings become NaN.
df1['Industry'] = df1.econ_activity_tr.str.lower()
# NOTE(review): literal compared with a constant of the same value - always true.
if('METHOD2' == METHOD2): 
    df1['Industry'] =  df1['Industry'].replace(r'^\s*$', np.nan, regex=True)

# - Unemployment Spell:
df1['UnemploymentSpell'] = df1.unempl_spell / 30 # Measured in months.


# - Wages: missing wages become -1000, the values are grouped into four
#   clusters with k-means, and each row receives the dense rank (starting at 0)
#   of its cluster's mean wage.
# `np.int` was only an alias of the builtin `int` and no longer exists in NumPy.
df1['wage_adj_c'] = df1[['wage_adj']].fillna(-1000).astype(int)
t = KMeans(n_clusters=4, init='k-means++', n_init=10).fit(df1[['wage_adj_c']])
df1['wage_adj_c'] = df1[['wage_adj_c']].groupby(t.labels_).transform('mean').sum(1).rank(method='dense').sub(1).astype(np.int64).to_frame()

print(df1.info())
df1.head()

In [None]:
# Modelling frame: the columns of interest, restricted to complete rows.
fdf = df1[ORIGINAL_COLUMNS].dropna()

# Disabled experiment, kept for reference: collapse every governorate other
# than zarqa / amman / irbid, and every education level other than
# 'Secondary or Below' / 'bachelor_or_above', into 'other'.
# con = np.logical_and(
#     fdf.Governorate != 'zarqa',
#     np.logical_and(
#         fdf.Governorate != 'amman',
#         fdf.Governorate != 'irbid',
#     )
# )
# fdf.loc[con, 'Governorate'] = 'other'
#
# con = np.logical_and(
#     fdf.education != 'Secondary or Below',
#     fdf.education != 'bachelor_or_above',
# )
# fdf.loc[con, 'education'] = 'other'

# Summary of the frame, then the distinct values of every column.
print(fdf.info())
print('-' * 100)
for column_name in fdf.columns:
    distinct_values = fdf[column_name].unique()
    print(column_name, '----------->', len(distinct_values))
    print(distinct_values, '\n\n')

In [None]:
def unify_column_names(df):
    """Rename the columns of ``df`` to lower snake_case, in place.

    For every column name: a space is inserted between a word character and a
    following capital letter, surrounding punctuation is stripped, underscores,
    spaces and commas all become ``_``, one pass collapses ``__`` to ``_``, and
    the result is lower-cased (``'NationalID_Number' -> 'national_id_number'``).

    Parameters
    ----------
    df : pandas.DataFrame with string column names.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``df.columns`` replaced.
    """
    # Characters removed from both ends of a name. The backslash is escaped
    # explicitly; the former literal contained the invalid escape "\#".
    strip_chars = ' .()[]{}/\\#@*^!?'

    cols = []
    for col in df.columns:
        spaced = re.sub(r"(\w)([A-Z])", r"\1 \2", col)
        cols.append(
            spaced.strip(strip_chars)
            .replace('_', ' ')
            .replace(' ', '_')
            .replace(',', '_')
            .replace('__', '_')
            .lower()
        )
    df.columns = cols
    return df



def format_string_data(df):
    """Normalise the text of every object-dtype column of ``df``, in place.

    Spaces, hyphens and commas are replaced by ``_``, one pass turns ``__``
    into ``_``, and the text is lower-cased. Other columns are left alone.

    Returns the same DataFrame object that was passed in.
    """
    substitutions = ((' ', '_'), ('-', '_'), (',', '_'), ('__', '_'))
    for col in df.select_dtypes(include=['object']):
        cleaned = df[col]
        for old, new in substitutions:
            cleaned = cleaned.str.replace(old, new)
        df[col] = cleaned.str.lower()
    return df

def build_encoders(df, encoder_dict, save=False, label_encoded=LABEL_ENCODED_FEATURES, one_hot_encoded=ONE_HOT_ENCODED_FEATURES):
    """Fit an encoder for every object-dtype column and return the encoded frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is copied first, so the caller's frame is untouched.
    encoder_dict : dict
        When ``save`` is true, receives ``column name -> fitted encoder``.
    save : bool
        Whether to store the fitted encoders in ``encoder_dict``.
    label_encoded : list of str
        Columns encoded with ``LabelEncoder`` (replaced by integer codes).
    one_hot_encoded : list of str
        Columns encoded with ``OneHotEncoder``; the column is dropped and one
        column per ``get_feature_names_out()`` entry is appended.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.

    Raises
    ------
    ValueError
        If an object-dtype column is in neither list.
    """
    df = df.copy()
    for col in df.select_dtypes(include=['object']):
        if col in one_hot_encoded:
            # `sparse` was renamed to `sparse_output` in newer scikit-learn
            # releases; fall back to the old keyword where the new one is unknown.
            try:
                encoder = OneHotEncoder(sparse_output=False)
            except TypeError:
                encoder = OneHotEncoder(sparse=False)
            temp = pd.DataFrame(
                encoder.fit_transform(df[[col]]),
                columns=list(encoder.get_feature_names_out()),
                # Share df's index so that concat lines the rows up even when
                # df does not carry a default 0..n-1 index.
                index=df.index,
            )
            df = pd.concat([df, temp], axis=1).drop(col, axis=1)

        elif col in label_encoded:
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col])

        else:
            raise ValueError(f"'{col}' can't be encoded.")

        if save:
            encoder_dict[col] = encoder
    return df
    
    
def advanced_processing(data, encode=False, save=False):
    """Clean a frame and optionally encode its text columns.

    Steps: blank / whitespace-only strings become NaN, rows with any NaN are
    dropped, the index is reset, column names go through
    ``unify_column_names`` and text values through ``format_string_data``.
    With ``encode=True`` the frame is then passed to ``build_encoders`` using
    the module-level ``ENCODERS`` dict (filled only when ``save`` is true).

    ``data.info()`` is printed after cleaning and again after encoding.

    Returns the processed DataFrame.
    """
    blanks_as_nan = data.replace(r'^\s*$', np.nan, regex=True)
    data = blanks_as_nan.dropna()  # drop any record with nulls
    data = data.reset_index(drop=True)

    data = unify_column_names(data)
    data = format_string_data(data)
    print(data.info())

    data = data.reset_index(drop=True)
    if not encode:
        return data

    data = build_encoders(data, ENCODERS, save=save)
    print(data.info())
    return data


# Rename fdf's columns to the snake_case names the constants (ID, SPELL, ...) hold.
fdf = unify_column_names(fdf)
# Remove, in place, every row whose unemployment spell is missing.
fdf.drop(fdf[fdf[SPELL].isnull()].index, inplace=True)
# Last expression of the cell: the notebook renders the frame.
fdf

# data = fdf.replace(r'^\s*$', np.nan, regex=True).dropna() # drop any record with nulls
# print(data.info())
# data = unify_column_names(data)
# print(data.info())
# data = format_string_data(data)
# print(data.info())
# data = data.reset_index(drop=True)
# print(data.info())
# build_encoders(data[METHODS[METHOD3][CONTRIBUTORS]])

In [None]:
def build_and_predict(X, y, method=METHOD3):
    """Split the data, fit the model configured for ``method`` and report scores.

    Parameters
    ----------
    X : feature frame.
    y : target aligned with ``X``.
    method : key of ``METHODS``; its ``MODEL`` expression is evaluated with
        ``eval`` to construct the estimator.

    Returns
    -------
    The fitted model.

    The shapes of the inputs and of the train/test parts are printed, followed
    by ``model.score`` on each part. The printed label says "Accuracy", but
    the number is whatever the estimator's ``score`` method returns.
    """
    print(X.shape, y.shape)

    parts = train_test_split(X,y)
    X_train, X_test, y_train, y_test = parts
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    model_expression = METHODS[method][MODEL]
    model = eval(model_expression)
    model.fit(X_train, y_train)

    training_score = model.score(X_train, y_train)
    print('Accuracy for training:', training_score)
    testing_score = model.score(X_test, y_test)
    print('Accuracy for testing:', testing_score)
    return model

In [None]:
# Fit the encoders on the attribute columns of all rows (both genders together)
# and store them in ENCODERS (save=True). The processed frame is the cell's last
# expression, so the notebook renders it.
advanced_processing(fdf[ATTRIBUTES], encode=True, save=True)

In [None]:
def scale_aggre(data):
    """Standardise every column of an aggregated frame.

    Works on a copy: the first level of the column index is dropped, the
    frame is shown with ``display``, and each column is replaced by
    ``(column - column.mean()) / column.std()``.

    Returns the scaled copy; the input frame is not modified.
    """
    scaled = data.copy()
    scaled.columns = scaled.columns.droplevel()
    display(scaled)
    for column_name in scaled.columns:
        column = scaled[column_name]
        scaled[column_name] = (column - column.mean()) / column.std()
    return scaled

# def swap(a, target):
#     return target

In [None]:
NODES_APPLY = 'nodes_apply'
genders = [MALE, FEMALE]
methods = [METHOD3] # METHOD1, METHOD2,

# Quartile aggregators for the per-node statistics. They are deliberately kept
# as lambdas: converting them to named functions could change the column
# labels pandas gives the aggregates in the CSV written below.
per25 = lambda x: pd.Series(x, name='p25').quantile(0.25)
per75 = lambda x: pd.Series(x, name='p75').quantile(0.75)


# Start from a clean output file: each gender's rows are appended to it below.
if os.path.exists(DATA_OUTPUT_PATH):
    os.remove(DATA_OUTPUT_PATH)

# The artefacts written below need the target directory to exist.
os.makedirs(MODELS_OUTPUT_PATH, exist_ok=True)

for gen in genders:
    for method in methods:
        contributors = METHODS[method][CONTRIBUTORS]
        # `temp` keeps the readable (un-encoded) features for the final CSV,
        # `data` is the encoded version the model is trained on.
        temp = advanced_processing(fdf[fdf[GENDER] == gen][contributors], encode=False).reset_index(drop=True)
        data = advanced_processing(fdf[fdf[GENDER] == gen][contributors], encode=True,).reset_index(drop=True)

        # A TARGET containing '(' is an expression template evaluated against
        # the gender subset; otherwise the configured value is used as is.
        target_spec = METHODS[method][TARGET]
        target = eval(target_spec.format('fdf[fdf[GENDER] == gen]')) if '(' in target_spec else target_spec

        print(len(target))

        # Forward the loop's method instead of silently relying on the default.
        model = build_and_predict(X=data, y=target, method=method)
        print("The total number of leafs is:",  model.tree_.n_leaves)
        print("Avg count per leaf:", np.mean(model.tree_.n_node_samples))

        print('-'*150 )
        print('Tree Rules...')
        # tree_to_code prints the rules itself and returns None.
        tree_to_code(model, model.feature_names_in_)
        print('-'*150 )

        graph = plot_tree(
            model,
            feature_names=model.feature_names_in_,
            impurity=False,
            label="all",
            fontsize=20,
            filled=True,
            # proportion=True,
            node_ids=True,
            class_names=['0', '1', '2'],
        )

        joblib.dump(model, os.path.join(MODELS_OUTPUT_PATH, f'{gen}_surv_tree_model.joblib'))

        # Id of the tree node each training row lands in (this replaced an
        # earlier approach that label-encoded the model's predictions).
        data[NODES] = model.tree_.apply(data.to_numpy().astype('float32'))

    # NOTE(review): everything below runs once per gender and uses `data`,
    # `temp` and `model` as left by the LAST method of the inner loop.
    data[SPELL] = fdf[fdf[GENDER] == gen][SPELL].reset_index(drop=True)
    data[LAST_JOB] = fdf[fdf[GENDER] == gen][LAST_JOB].reset_index(drop=True)

    data[DURATIONS] = survive(fdf[fdf[GENDER] == gen].drop(GENDER, axis=1), duration=SPELL, event=LAST_JOB)[-1]
    data.reset_index(drop = True, inplace=True)

    # Per-node summary statistics of the spell, standardised column by column.
    aggregated_df = data[[SPELL, NODES]].groupby(NODES).agg([np.min, np.median, np.std, np.max, np.mean, kurtosis, skew, entropy, per25, per75])
    aggregated_df = scale_aggre(aggregated_df) # did not give the hoped results

    # Ward linkage uses Euclidean distances by default, so the former
    # `affinity='euclidean'` keyword (no longer accepted by recent
    # scikit-learn releases) is simply omitted.
    cluster = AgglomerativeClustering(n_clusters=3, linkage='ward', compute_full_tree=True, compute_distances=True,)
    aggregated_df[CLUSTER] = cluster.fit_predict(aggregated_df)
    display(aggregated_df)

    # joblib.dump(cluster, os.path.join(MODELS_OUTPUT_PATH, f'{gen}_ward_clustering_model.joblib'))
    aggregated_df.to_csv(os.path.join(MODELS_OUTPUT_PATH, f'{gen}_ward_clustering_data.csv'))

    plt.figure(figsize=(15, 9))
    plot_dendrogram(cluster, truncate_mode='lastp', distance_sort=True)
    plt.show()

    # Tree node id -> raw cluster label (aggregated_df.index holds the node ids).
    dd = {node:cluster for node, cluster in zip(aggregated_df.index, aggregated_df[CLUSTER].to_numpy()) }
    data[CLUSTER] = data[NODES].map(dd)
    print(dd)
    joblib.dump(dd, os.path.join(MODELS_OUTPUT_PATH, f'{gen}_cluster_node_dict_mapper.joblib'))

    corector = data[[SPELL, CLUSTER]].groupby(CLUSTER).mean()
    print('Clusters mean durations before mapping', corector)

    # Raw cluster label -> rank of its mean spell (0 = shortest), so that the
    # final labels are ordered by duration. `.items()` replaces
    # `.iteritems()`, which newer pandas no longer provides.
    order = pd.Series(dict((v,k) for k,v in np.argsort(corector[SPELL]).sort_values().items())).to_dict()
    # NOTE(review): this file is written as '..._cmapper_x.joblib' while
    # surv_tree_predict_node loads '..._cmapper.joblib' - confirm which name is intended.
    joblib.dump(order, os.path.join(MODELS_OUTPUT_PATH, f'{gen}_cluster_node_dict_cmapper_x.joblib'))
    print(order)

    data[CLUSTER] = data[CLUSTER].map(order)
    print('Clusters mean durations after mapping:', data[[SPELL, CLUSTER]].groupby(CLUSTER).mean())
    print('Samples within each cluster:', data[CLUSTER].value_counts())

    temp[GENDER] = gen
    temp[CLUSTER] = data[CLUSTER]
    temp[NODES] = data[NODES]
    temp[DURATIONS] = data[DURATIONS]
    temp[SPELL] = data[SPELL]
    temp[LAST_JOB] = data[LAST_JOB]

    # Append to the shared CSV; the header is written only when the file is created.
    temp.to_csv(DATA_OUTPUT_PATH, index=False, mode='a', header=not os.path.exists(DATA_OUTPUT_PATH))

    plt.figure(figsize=(15, 9))
    data[SPELL].hist(by=data[CLUSTER], bins=20, density=True, alpha=0.7, grid=True)
    plt.show()

In [None]:
# Persist every fitted encoder as '<column>_encoders.joblib'; an entry stored
# under the key 'nodes' is skipped.
print(ENCODERS)

for encoder_name, fitted_encoder in ENCODERS.items():
    if encoder_name != 'nodes':
        joblib.dump(fitted_encoder, os.path.join(MODELS_OUTPUT_PATH, f'{encoder_name}_encoders.joblib') )

In [None]:
# A bare `raise` with no active exception fails with a RuntimeError, which stops
# "Run All" at this cell - presumably so that the inference helpers below are
# only executed deliberately (confirm).
raise

In [131]:
# Attributes and contributors
# (same values as the constants of the same names defined in the training section)
GOVERNORATE = 'governorate'
AGE = 'age'
EXPERIENCE = 'experience'
EDUCATION = 'education'
GENDER = 'gender'
DISABILITY = 'disability'

# Keys used inside each TEMPLATE_DATA_CATEGORIES entry
NA_FILL_VALUE = 'NA_FILL_VALUE'
CATEGORIES = 'CATEGORIES'
CODE = 'CODE'

def disability_code(value):
    """Map a raw disability value to ``'no_disability'`` or ``'with_disability'``.

    ``None`` is passed through unchanged. Any other value is converted to a
    lower-case string; only the known "no disability" spellings map to
    ``'no_disability'``, everything else maps to ``'with_disability'``.
    """
    if value is None:
        return None

    normalized = str(value).lower()
    no_disability_aliases = ('no_disability', 'nodisability', 'no', 'not_disabled')
    if normalized in no_disability_aliases:
        return 'no_disability'
    return 'with_disability'


def experience_code(value):
    """Bucket an experience value into one of 0, 1, 5, 10, 15, 20.

    ``None`` is passed through unchanged; any other value is converted with
    ``float`` (so numeric strings are accepted) and assigned the bucket of the
    first lower bound it strictly exceeds: ``> 15 -> 20``, ``> 10 -> 15``,
    ``> 5 -> 10``, ``> 1 -> 5``, ``> 0 -> 1``, otherwise 0.
    """
    if value is None:
        return None

    years = float(value)
    for lower_bound, bucket in ((15, 20), (10, 15), (5, 10), (1, 5), (0, 1)):
        if years > lower_bound:
            return bucket
    return 0


def age_code(value):
    """Round an age to the nearest multiple of ten and clamp it to 20..60.

    ``None`` is passed through unchanged; any other value is converted with
    ``float`` first. Rounding uses the builtin ``round`` on ``age / 10``.
    """
    if value is None:
        return None

    decade = int(round(float(value) / 10) * 10)
    return min(60, max(20, decade))



def education_code(value):
    """Map a raw education value to one of the four education categories.

    ``None`` is passed through unchanged. Any other value is lower-cased and
    looked up among the known spellings of each category; an unknown value
    yields the configured fill value for EDUCATION from
    ``TEMPLATE_DATA_CATEGORIES``.

    NOTE(review): the training cell recodes "High Diploma" to
    'bachelor_or_above', whereas 'high_diploma' is listed under
    'middle_diploma' here - confirm which grouping is intended.
    """
    if value is None:
        return None

    value = str(value).lower()
    education_subs = {
        'bachelor_or_above': [
            'bachelor_or_above', 'bachelor', 'bachelors', "bs", "b.s", "bas",
            'master', 'masters', 'm.s', 'ms', 'phd', 'doctor_of_philosophy', 'doctorate', 'doctorates'
        ],
        'vocational_training': [
            "vocational_training", 'vt', 'v.t'
        ],
        'middle_diploma': [
            # 'diploma' and "high_diploma" are two separate aliases; without the
            # comma between them Python joined them into the single, never
            # matching string 'diplomahigh_diploma'.
            "middle_diploma", 'diploma', "high_diploma", 'deploma', "high_deploma", "middle_deploma",
        ],
        'secondary_or_below': [
            "secondary_or_below", 'high_school', 'school', 'secondary', 'secondary_school'
        ],
    }
    for key, aliases in education_subs.items():
        if value in aliases:
            return key

    return TEMPLATE_DATA_CATEGORIES[EDUCATION][NA_FILL_VALUE]


def governorate_code(value):
    """Map a raw governorate value to its canonical category name.

    ``None`` is passed through unchanged. Any other value is lower-cased and
    looked up among the accepted spellings; an unknown value yields the
    configured fill value for GOVERNORATE from ``TEMPLATE_DATA_CATEGORIES``.
    """
    if value is None:
        return None

    # accepted spelling -> canonical governorate name
    alias_to_governorate = {
        'al_kark': 'al_kirk', 'al_kirk': 'al_kirk', 'kark': 'al_kirk', 'kirk': 'al_kirk',
        'balqa': 'balqa', 'al_balqa': 'balqa', 'balqaa': 'balqa', 'al_balqaa': 'balqa',
        'tafileh': 'tafileh', 'al_tafileh': 'tafileh',
        'jarash': 'jarash', 'jerash': 'jarash',
        'zarqa': 'zarqa', 'al_zarqa': 'zarqa',
        'amman': 'amman', 'aman': 'amman',
        'al_mafraq': 'al_mafraq', 'mafraq': 'al_mafraq',
        'maan': 'maan',
        'irbid': 'irbid', 'irbed': 'irbid',
        'al_aqaba': 'al_aqaba', 'aqaba': 'al_aqaba',
        'maadaba': 'maadaba', 'madaba': 'maadaba',
        'ajloun': 'ajloun',
    }

    lowered = str(value).lower()
    if lowered in alias_to_governorate:
        return alias_to_governorate[lowered]
    return TEMPLATE_DATA_CATEGORIES[GOVERNORATE][NA_FILL_VALUE]


def gender_code(value):
    """Map a raw gender value to ``'male'`` or ``'female'``.

    ``None`` is passed through unchanged. Any other value is lower-cased and
    looked up among the accepted spellings; an unknown value yields the
    configured fill value for GENDER from ``TEMPLATE_DATA_CATEGORIES``.
    """
    if value is None:
        return None

    alias_to_gender = {
        'male': 'male', 'm': 'male', 'man': 'male',
        'female': 'female', 'f': 'female', 'woman': 'female',
    }

    lowered = str(value).lower()
    if lowered in alias_to_gender:
        return alias_to_gender[lowered]
    return TEMPLATE_DATA_CATEGORIES[GENDER][NA_FILL_VALUE]


# Per-feature description used when a new record is preprocessed:
#   CATEGORIES    - the values the feature may take after coding;
#   CODE          - expression template; preprocess_recs fills `{0}` with the
#                   name of the variable holding the raw value and evaluates it;
#   NA_FILL_VALUE - value the *_code helpers fall back to for an unknown input.
TEMPLATE_DATA_CATEGORIES = {
    DISABILITY: {
        CATEGORIES: ['with_disability', 'no_disability'],
        # CODE: "utils.disability_code({0})",
        CODE: "disability_code({0})",
        NA_FILL_VALUE: 'no_disability'
    },
    EXPERIENCE: {
        CATEGORIES: [0, 1, 5, 10, 15, 20],
        # CODE: "utils.experience_code({0})",
        CODE: "experience_code({0})",
        NA_FILL_VALUE: 0
    },
    AGE: {
        # NOTE(review): 10 is listed here although age_code never returns less than 20.
        CATEGORIES: [10, 20, 30, 40, 50, 60],
        # CODE: "utils.age_code({0})",
        CODE: "age_code({0})",
        NA_FILL_VALUE: 30
    },

    EDUCATION: {
        CATEGORIES: ['bachelor_or_above', 'vocational_training', 'middle_diploma', 'secondary_or_below'],
        # CODE: "utils.education_code({0})",
        CODE: "education_code({0})",
        NA_FILL_VALUE: 'secondary_or_below'
    },
    GOVERNORATE: {
        CATEGORIES: [
            'ajloun', 'al_aqaba', 'al_kirk',
            'al_mafraq', 'amman', 'balqa',
            'irbid', 'jarash', 'maadaba',
            'maan', 'tafileh', 'zarqa'
        ],
        # CODE: "utils.governorate_code({0})",
        CODE: "governorate_code({0})",
        NA_FILL_VALUE: 'amman'
    },
    GENDER: {
        CATEGORIES: ['male', 'female'],
        # CODE: "utils.gender_code({0})",
        CODE: "gender_code({0})",
        NA_FILL_VALUE: 'male'
    },

}


def preprocess_recs(rec, cols_lst):
    """Run every requested field of a record through its coding function.

    Parameters
    ----------
    rec : mapping with a ``get`` method (missing keys are treated as ``None``).
    cols_lst : iterable of field names; each must be a key of
        ``TEMPLATE_DATA_CATEGORIES``.

    Returns
    -------
    dict
        ``field name -> coded value``; it is also printed.

    The coding call is built from the field's ``CODE`` template and executed
    with ``eval``; the template comes from the notebook's own configuration.
    """
    processed = {}
    for column in cols_lst:
        # `value` must stay a local of this function: the evaluated expression
        # refers to it by name.
        value = rec.get(column, None)
        coder_call = TEMPLATE_DATA_CATEGORIES[column][CODE].format('value')
        processed[column] = eval(coder_call)

    print('After preprocessing:', processed)
    return processed


def load_encoders(into = None, path=None):
    """Load saved encoders into a dict keyed by feature name.

    Parameters
    ----------
    into : dict or None
        Existing mapping to extend; it is copied, never modified.
    path : str or None
        Glob pattern of the files to load. Defaults to every
        ``*_encoders.joblib`` file inside ``MODELS_OUTPUT_PATH``.

    Returns
    -------
    dict
        ``feature name -> loaded encoder``, where the feature name is the part
        of the file name before its first underscore. The mapping is printed.

    NOTE(review): ``joblib.load`` unpickles the files; only point ``path`` at
    artefacts produced by this project.
    """
    into = into.copy() if into is not None else {}

    if path is None:
        path = os.path.join(MODELS_OUTPUT_PATH, f'*_encoders.joblib')

    for enc in glob.iglob(path):
        # Take the file name whichever separator the platform uses; splitting on
        # a backslash alone only isolates the file name on Windows.
        file_name = re.split(r'[\\/]', enc)[-1]
        name = file_name.split('_')[0]
        into[name] = joblib.load(enc)

    print("Loaded encoders", into)
    return into
        
        

def encode_new_data(data, cols_lst, encoders, onehot_encoded, label_encoded):
    """Turn one preprocessed record into the flat mapping the model expects.

    Parameters
    ----------
    data : mapping with a ``get`` method (missing keys are treated as ``None``).
    cols_lst : iterable of feature names, processed in order.
    encoders : mapping ``feature name -> fitted encoder``.
    onehot_encoded : features expanded into one entry per
        ``get_feature_names_out()`` name.
    label_encoded : features replaced by the integer returned by the encoder's
        ``transform``.

    Returns
    -------
    dict
        Encoded values. Features in neither list are converted with ``int``.
        A ``None`` input stays ``None`` (for a one-hot feature, every expanded
        entry is ``None``).
    """
    model_input = {}
    for feature in cols_lst:
        raw = data.get(feature, None)

        if feature in label_encoded:
            model_input[feature] = raw if raw is None else int(encoders[feature].transform([raw])[0])
            continue

        if feature in onehot_encoded:
            encoder = encoders[feature]
            if raw is None:
                expanded = {column: None for column in encoder.get_feature_names_out()}
            else:
                row = encoder.transform([[raw]])[-1]
                expanded = {
                    column: int(flag)
                    for column, flag in zip(encoder.get_feature_names_out(), row)
                }
            model_input.update(expanded)
            continue

        model_input[feature] = raw if raw is None else int(raw)
    return model_input
    
    
def reorder_cols(data_dct, order):
    """Build the single-row model input with its columns in ``order``.

    When the record's gender is ``0`` or ``'female'`` the
    ``'governorate_outside_jordan'`` column is dropped from the result.

    Returns ``(frame, data_dct[GENDER])``.
    """
    gender_value = data_dct[GENDER]
    frame = pd.DataFrame(data_dct, index=[0])[order]

    is_female = gender_value == 0 or gender_value == 'female'
    if is_female:
        frame = frame.drop('governorate_outside_jordan', axis=1)
    return frame, gender_value


def follow_the_crowd_for_missing_value(model, data):
    """Route a single record down a fitted tree, tolerating missing features.

    At a split whose feature is missing (``None`` or NaN) the record follows
    the child holding more training samples (ties go left); otherwise the
    usual ``value <= threshold`` rule applies.

    Parameters
    ----------
    model : fitted tree estimator exposing ``tree_`` and ``feature_names_in_``.
    data : single-row DataFrame (or mapping) indexed by feature name. Only the
        first row of each column is read.

    Returns
    -------
    tuple
        ``(leaf_id, number_of_training_samples_in_that_leaf)``.

    Notes
    -----
    The path is walked directly from the root, so only the nodes on it are
    inspected. Missing values are detected with ``pd.isna``; the previous
    ``in [np.nan, None]`` test never matched a NaN stored in an array.
    """
    tree_ = model.tree_
    children_left = tree_.children_left
    children_right = tree_.children_right
    feature = tree_.feature
    name = model.feature_names_in_
    threshold = tree_.threshold
    samples = tree_.n_node_samples

    current_node = 0
    # A path cannot be longer than the number of nodes; the bound only guards
    # against a malformed tree structure.
    for _ in range(tree_.node_count):
        left, right = children_left[current_node], children_right[current_node]
        if left == right:
            # Equal child ids mean there is no split here: a leaf.
            break

        value = np.asarray(data[name[feature[current_node]]]).ravel()[0]
        if pd.isna(value):
            current_node = left if samples[left] >= samples[right] else right
        elif value <= threshold[current_node]:
            current_node = left
        else:
            current_node = right

    return current_node, samples[current_node]

def surv_tree_predict_node(gender, data=None):
    """Assign an encoded record to a cluster using the stored survival tree.

    Three joblib artifacts are loaded from MODELS_OUTPUT_PATH for the given
    gender: the survival tree, the node -> cluster mapper and the cluster
    "corrector" mapper.

    Parameters
    ----------
    gender : str
        Prefix of the artifact file names (the caller in this notebook passes
        'male' or 'female').
    data : pandas.DataFrame
        The encoded record. Required in practice: the ``None`` default is not
        handled and fails on the first attribute access.

    Returns
    -------
    tuple
        ``(cluster, node)``. ``node`` is whatever the chosen branch produced:
        the node returned by ``follow_the_crowd_for_missing_value`` when the
        record has missing values, otherwise the array returned by
        ``surv_tree.tree_.apply``.
    """
    # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
    model_path = os.path.join(MODELS_OUTPUT_PATH, f'{gender}_surv_tree_model.joblib')
    print('Loading Model:', model_path)
    surv_tree = joblib.load(model_path)

    mapper_path = os.path.join(MODELS_OUTPUT_PATH, f'{gender}_cluster_node_dict_mapper.joblib')
    print('Loading Mapper:', mapper_path)
    node_cluster_mapper = joblib.load(mapper_path)

    corrector_path = os.path.join(MODELS_OUTPUT_PATH, f'{gender}_cluster_node_dict_cmapper.joblib')
    print('Loading Mapper Corrector:', corrector_path)
    cluster_corrector_mapper = joblib.load(corrector_path)

    # `np.nan in array` never matches a float NaN because NaN != NaN, so the
    # membership test only caught None. isnull() detects both None and NaN.
    has_missing = bool(data.isnull().to_numpy().any())

    if has_missing:
        print('This rec has missing.')
        node, samples_c = follow_the_crowd_for_missing_value(surv_tree, data)
        print(f"Follow the crowd lead to node -> {node}, with samples of {samples_c}")
        cluster = node_cluster_mapper[node]
        print(f"Node to Cluster Mapping lead to -> {cluster}")
    else:
        print('This rec is complete.')
        node = surv_tree.tree_.apply(data.to_numpy().astype('float32'))
        print(f"Using the SurvTree prediction lead to node -> {node}")
        # apply() returns one leaf id per row; the last one is used for the lookup.
        cluster = node_cluster_mapper[node[-1]]
        print(f"Node to Cluster Mapping lead to -> {cluster}")

    cluster = cluster_corrector_mapper[cluster]
    print(f"Correcting the Cluster lead to -> {cluster}")

    return cluster, node


def predict(data_dct, contributers, encoders, onehot, labeled, encoders_outputs_feature,):
    """Run one raw record through preprocessing, encoding and the survival tree.

    The input dict is copied first, so the caller's object is not rebound.
    Returns whatever ``surv_tree_predict_node`` returns for the record.
    """
    record = data_dct.copy()
    print("Data Recived:", record)

    cleaned = preprocess_recs(record, contributers)
    encoded = encode_new_data(cleaned, contributers, encoders, onehot, labeled)
    ordered, gen = reorder_cols(encoded, encoders_outputs_feature)

    # The gender code picks which gender-specific artifacts are loaded.
    gender_name = 'male' if gen == 1 else 'female'
    return surv_tree_predict_node(gender=gender_name, data=ordered)
    

In [132]:
# Raw input fields handed to predict() as its `contributers` argument.
API_CONTRIBUTORS = [GOVERNORATE, AGE, EXPERIENCE, EDUCATION, GENDER, DISABILITY]
# Fields handed to predict() as its `onehot` and `labeled` arguments respectively.
API_ONEHOT_ENCODED = [GOVERNORATE, EDUCATION,]
API_LABEL_ENCODED = [GENDER, DISABILITY]




# Column names handed to predict() as `encoders_outputs_feature`, which forwards
# them to reorder_cols(). There is no gender column here: reorder_cols() returns
# the gender value separately.
# NOTE(review): presumably this must match the column order the survival trees
# were trained on -- confirm against the training cell.
API_ENCODERS_OUTPUT_ITEMS = [
    'experience', 'age', 'disability', 
    'governorate_ajloun', 'governorate_al_aqaba', 'governorate_al_kirk', 'governorate_al_mafraq', 
    'governorate_amman', 'governorate_balqa', 'governorate_irbid', 'governorate_jarash', 'governorate_maadaba', 
    'governorate_maan', 'governorate_outside_jordan', 'governorate_tafileh', 'governorate_zarqa', 'education_bachelor_or_above', 
    'education_middle_diploma', 'education_secondary_or_below', 'education_vocational_training'
]



# Start from an empty dict and let load_encoders() populate/return it.
API_ENCODERS = {}
API_ENCODERS = load_encoders(API_ENCODERS, path=None)

# x = {
#     GOVERNORATE:'irbed',
#     AGE:25,
#     EXPERIENCE: 2.586301,
#     EDUCATION:'bachelor',
#     GENDER:'male',
#     # GENDER:'female',
#     DISABILITY:'no_disability',
# }

# Sample record for a manual smoke test. GOVERNORATE is None on purpose so the
# missing-value branch of the prediction is exercised.
x = {
    AGE:20,
    GENDER:'male',
    DISABILITY:'no_disability',
    GOVERNORATE:None,
    EDUCATION:'secondary_or_below',
    EXPERIENCE: 5,
    # GENDER:'female',
    
}


# Log line kept from an earlier run with a different record (not this one):
# After preprocessing: {'governorate': 'amman', 'age': 30, 'experience': 5, 'education': 'secondary_or_below', 'gender': 'male', 'disability': 'no_disability'}

# Last expression of the cell, so the returned tuple is displayed as the cell output.
predict(x, API_CONTRIBUTORS, API_ENCODERS, API_ONEHOT_ENCODED, API_LABEL_ENCODED, API_ENCODERS_OUTPUT_ITEMS, )
# x = preprocess_recs(
#     x, 
#     API_CONTRIBUTORS
# )

# x = encode_new_data(x, API_CONTRIBUTORS, API_ENCODERS, API_ONEHOT_ENCODED, API_LABEL_ENCODED)

# x, gen = reorder_cols(x, API_ENCODERS_OUTPUT_ITEMS)
# display(x)

# surv_tree_predict_node(gender='male' if gen else 'female', data=x)

Loaded encoders {'disability': LabelEncoder(), 'education': OneHotEncoder(sparse=False), 'gender': LabelEncoder(), 'governorate': OneHotEncoder(sparse=False)}
Data Recived: {'age': 20, 'gender': 'male', 'disability': 'no_disability', 'governorate': None, 'education': 'secondary_or_below', 'experience': 5}
After preprocessing: {'governorate': None, 'age': 20, 'experience': 5, 'education': 'secondary_or_below', 'gender': 'male', 'disability': 'no_disability'}
Loading Model: .\runs\male_surv_tree_model.joblib
Loading Mapper: .\runs\male_cluster_node_dict_mapper.joblib
Loading Mapper Corrector: .\runs\male_cluster_node_dict_cmapper.joblib
This rec has missing.
Follow the crowd lead to node -> 66, with samples of 523
Node to Cluster Mapping lead to -> 0
Correcting the Cluster lead to -> 1


(1, 66)

In [None]:
# Deliberate stop for "Run All": a bare `raise` with no active exception also
# ends in RuntimeError, but with an unhelpful message.
raise RuntimeError(
    "Stopped on purpose: the section below is for testing; several changes "
    "were made after it was written, so it is not valid now."
)

In [None]:
# df = pd.read_csv('./data/testing.csv', sep=',')

# df.columns = [c.lower().strip(' ') for c in df.columns]
# df = df[['experience', 'age', 'governorate', 'disability', 'education', 'gender',  'cluster', 'node']]

# df.education = df.education.str.replace(' ', '_')
# df.governorate = df.governorate.str.replace(' ', '_')
# df.disability = df.disability.str.replace(' ', '_')


# 0, 1, 2 --> 50
# 1, 0, 2 --> 21
# 2, 1, 0 --> 25
# 0, 2, 1 --> 52
# 1, 2, 0 --> 34
# 2, 0, 1 --> 14


# con = np.logical_and(
#     df.gender == 'male',
#     df.cluster == 'A',
# )
# df.loc[con, 'cluster'] = 0

# con = np.logical_and(
#     df.gender == 'male',
#     df.cluster == 'B',
# )
# df.loc[con, 'cluster'] = 2

# con = np.logical_and(
#     df.gender == 'male',
#     df.cluster == 'C',
# )
# df.loc[con, 'cluster'] = 1




# 0, 1, 2 --> 06
# 0, 2, 1 --> 08
# 1, 0, 2 --> 71
# 1, 2, 0 --> 13
# 2, 1, 0 --> 19
# 2, 0, 1 --> 80


# con = np.logical_and(
#     df.gender == 'female',
#     df.cluster == 'D',
# )
# df.loc[con, 'cluster'] = 2

# con = np.logical_and(
#     df.gender == 'female',
#     df.cluster == 'E',
# )
# df.loc[con, 'cluster'] = 1

# con = np.logical_and(
#     df.gender == 'female',
#     df.cluster == 'F',
# )
# df.loc[con, 'cluster'] = 0
# df


# df.columns = ['experience', 'age', 'governorate', 'disability', 'education', 'gender',  'cluster.1']

# df.loc[df.education == 'high_school', 'education'] = 'secondary_or_below'
# df.loc[df.education == 'bachelor', 'education'] = 'secondary_or_below'
# df.loc[df.education == 'deploma', 'education'] = 'middle_diploma'
# df.loc[df.education == 'high_diploma', 'education'] = 'middle_diploma'
# df.loc[df.education == 'master', 'education'] = 'bachelor_or_above'
# df.loc[df.education == 'phd', 'education'] = 'bachelor_or_above'
# df.loc[df.education == 'bachelor', 'education'] = 'bachelor_or_above'
# df.educations.unique()

# df.loc[df.governorate == 'kark', 'governorate'] = 'al_kirk'
# df.loc[df.governorate == 'mafraq', 'governorate'] = 'al_mafraq'
# df.loc[df.governorate == 'irbed', 'governorate'] = 'irbid'
# df.loc[df.governorate == 'jerash', 'governorate'] = 'jarash'
# df.loc[df.governorate == 'aqaba', 'governorate'] = 'al_aqaba'
# df.loc[df.governorate == 'madaba', 'governorate'] = 'maadaba'
# df.loc[df.governorate == 'ajloun', 'governorate'] = 'amman'
# df.Governorate.unique()

In [None]:
# Evaluate predict() on the complete female records of the testing file.
df = pd.read_csv('./data/testing.csv', sep=',')

# Normalise the header: lower-case and strip surrounding spaces.
df.columns = [c.lower().strip(' ') for c in df.columns]
df = df[['experience', 'age', 'governorate', 'disability', 'education', 'gender',  'cluster', 'node']]

# Category values use underscores instead of spaces (literal replacement).
df.education = df.education.str.replace(' ', '_', regex=False)
df.governorate = df.governorate.str.replace(' ', '_', regex=False)
df.disability = df.disability.str.replace(' ', '_', regex=False)


# Rows with no missing value at all, restricted to female records.
con = np.logical_and(
    ~df.isnull().any(axis=1),
    df.gender == 'female',
)

# .copy() so the 'pred' / 'pred_node' columns added below are written to an
# independent frame instead of a slice of `df` (chained assignment).
full = df[con].copy()
# full = df
display(full)

res_c = []
res_n = []
counter = 0
for row_label, record in full.iterrows():
    correct = record['cluster']
    pred, node = predict(record.to_dict(), API_CONTRIBUTORS, API_ENCODERS, API_ONEHOT_ENCODED, API_LABEL_ENCODED, API_ENCODERS_OUTPUT_ITEMS, )
    is_match = correct == pred
    print(row_label, pred, correct, 'Match' if is_match else 'Not Match')
    if is_match:
        counter += 1
    res_c.append(pred)
    res_n.append(node)

full['pred'] = res_c
full['pred_node'] = res_n

In [22]:
# Share of evaluated rows whose predicted cluster matched the expected one.
accuracy = counter / len(full)
print('Accuracy Score:', accuracy)

# Row counts per (expected cluster, node), then per (predicted cluster, node).
for cluster_column in ('cluster', 'pred'):
    display(full.groupby([cluster_column, 'node']).count())

In [None]:
# con = np.logical_and(
#     full.node == 31 ,
#     full.pred == 2
# )

# full[con]

In [None]:
# final_df = pd.read_csv('./data/final_outputs.csv')
# print(final_df.shape)
# res_c = []
# res_n = []
# counter = 0
# for row in final_df.iterrows():
#     # print(row[-1].to_dict())
#     correct = row[-1]['clusters']
#     pred, node = predict(row[-1].to_dict(), API_CONTRIBUTORS, API_ENCODERS, API_ONEHOT_ENCODED, API_LABEL_ENCODED, API_ENCODERS_OUTPUT_ITEMS, )
#     print(row[0], pred, correct, 'Match' if correct == pred else 'Not Match')
#     counter += 1  if correct == pred else 0
#     res_c.append(pred)
#     res_n.append(node)
#     # print('-'*50, 'NEXT ROUND', )
    
# final_df['pred'] = res_c
# final_df['pred_node'] = res_n

In [None]:
# print('Accuracy Score:', counter / len(final_df))

In [None]:
# final_df.loc[final_df.pred != final_df.clusters]

In [None]:
# print('Loading Encoder:', os.path.join(MODELS_OUTPUT_PATH, 'male_surv_tree_nodes_encoder.joblib'))
# surv_tree_encoder = joblib.load(os.path.join(MODELS_OUTPUT_PATH, 'male_surv_tree_nodes_encoder.joblib'))

In [None]:
# surv_tree_encoder.classes_

In [None]:
# dir(surv_tree_encoder)

In [None]:
# surv_tree_encoder

Below, I am building a DecisionTreeClassifier over the results of the statistical analysis above. The benefits of this are:

1- Less implementation time in the API

2- Faster prediction time

3- Fewer components to build in the API part

4- Fewer preprocessing steps in the API part

5- A single, unified reference for modifying the code

Just Have Fun

In [None]:
# Deliberate stop for "Run All": a bare `raise` with no active exception also
# ends in RuntimeError, but with an unhelpful message.
raise RuntimeError("Stopped on purpose: run the cells below manually.")

In [None]:
# final_df.groupby(list(final_df.columns[2:])).size().reset_index().rename(columns={0:'count'}).sort_values('count').iloc[1200:1250]#.drop('count', axis=1)
# final_df['age_cat'] = final_df.age.apply(age_code)
# final_df['experience_cat'] = final_df.experience.apply(experience_code)
# final_df

In [None]:
# x = final_df.groupby(list(final_df.columns)[2:]).size().reset_index().rename(columns={0:'count'}).drop('count', axis=1).set_index(API_ATTRIBUTES[2:] + ['age_cat', 'experience_cat']).T.to_dict('records')[0]
# x

In [None]:
# ff = pd.DataFrame()
# count = 0
# for (governorate, disability, education, gender, age, experience,), cluster in x.items():
#     print(experience, age, governorate, disability, education, gender, cluster)
#     con = np.logical_and(
#             final_df.experience_cat == experience,
#             np.logical_and(
#                 final_df.age_cat == age,
#                 np.logical_and(
#                     final_df.governorate == governorate,
#                     np.logical_and(
#                         final_df.disability == disability,
#                         np.logical_and(
#                             final_df.education == education,
#                             final_df.gender == gender
#                         )
#                     )
#                 )
#             )
#         )
#     print(final_df[con].shape[0])
#     if final_df[con].shape[0] < 800:
#         counts = 800 - final_df[con].shape[0]
#         print(counts)
#         tmp = generate_fake_dataframe(
#             counts, 
#             'ffcccc', 
#             col_names = ['experience', 'age', 'disability', 'governorate', 'education', 'gender',], 
#             intervals = [(0, 25), (15, 59.9), ('disability', 1), ('governorate', 1),  ('education', 1), ('gender', 1)], 
#             seed = 42,
#             cats = [[governorate], [disability], [education], [gender]]
#         )
#         tmp['clusters'] = cluster
#         tmp['age_cat'] = age
#         tmp['experience_cat'] = experience
        
#         tmp = pd.concat([tmp, final_df[con]], ignore_index=True)
#     elif final_df[con].shape[0] > 8000:
#         tmp = final_df[con].sample(n = 800)
#     count += 1
#     ff = pd.concat([ff, tmp], ignore_index=True)
#     print(f'--------- Round {count} done --------')

# ff.to_csv('./data/final_generated_stratified_examples.csv', index=False)

In [None]:
# 1- skewed -- cases appears only 1 time -- data engineering
# 2- drop in accuracy -- fine tuning the model and increasing the data
# 3- statistical issue for use cases # 1_000_000

In [None]:
# final_df = pd.DataFrame()
# final_df = pd.read_csv('./data/dummy_final_outputs.csv')
# Load the examples file used by the classifier cells below.
# NOTE(review): the only writer of this path visible in the notebook is a
# commented-out cell, so the file presumably has to exist already -- confirm.
final_df = pd.read_csv('./data/final_generated_stratified_examples.csv')
final_df

In [None]:
# Feature columns and target column selected from DATA_OUTPUT_PATH further below
# (read as API_ATTRIBUTES + API_TARGET).
API_ATTRIBUTES = [EXPERIENCE, AGE, GOVERNORATE, DISABILITY, EDUCATION, GENDER, ] # NODES
API_TARGET = [CLUSTER]

In [None]:
# Rebinds API_CONTRIBUTORS (first defined in the prediction section above) with a
# different element order; any predict() call made after this cell sees this order.
API_CONTRIBUTORS = [DISABILITY, GENDER, GOVERNORATE, EDUCATION, EXPERIENCE, AGE, ]

# API_CONTRIBUTORS = [DISABILITY, GENDER, ] # nodes are to drop, clusters are the target
# API_CONTRIBUTORS += [f'{GOVERNORATE}_{item}' for item in ['al_kirk', 'zarqa', 'amman', 'irbid', 'al_aqaba', 'jarash', 'balqa', 'maadaba', 'tafileh', 'ajloun', 'maan', 'al_mafraq', 'outside_jordan', ]]
# API_CONTRIBUTORS += [f'{EDUCATION}_{item}' for item in ['secondary_or_below', 'vocational_training', 'bachelor_or_above', 'middle_diploma', ]]
# API_CONTRIBUTORS += [f'{EXPERIENCE}_{item}' for item in ['0', '1', '5', '10', '15', '20',]]
# API_CONTRIBUTORS += [f'{AGE}_{item}' for item in ['20', '30', '40', '50', '60']]

In [None]:
# df = pd.read_csv(DATA_OUTPUT_PATH)[API_ATTRIBUTES+API_TARGET].convert_dtypes(convert_string=False)
# if NODES in df.columns:
#     df.drop(NODES, axis=1, inplace=True)
# df.head()

In [None]:
# final_df = pd.concat([final_df, df.groupby(list(df.columns)).size().reset_index().rename(columns={0:'count'}).drop('count', axis=1)])
# final_df

In [None]:
# CONVERSION_DICT = final_df.set_index(API_ATTRIBUTES).T.to_dict('records')[0]
# CONVERSION_DICT

In [None]:
# import pandas as pd
# import numpy as np
# from itertools import cycle
# def generate_fake_dataframe(size, cols, col_names = None, intervals = None, seed = None, cats=None):
    
#     categories_dict = {
#         'governorate' : ['ajloun', 'al_aqaba', 'al_kirk', 'al_mafraq', 'amman', 'balqa', 'irbid', 'jarash', 'maadaba', 'maan', 'tafileh', 'zarqa', 'outside_jordan'] if cats is None else cats[0],
#         'disability': ['with_disability', 'no_disability',] if cats is None else cats[1],
#         'education' : ['bachelor_or_above', 'middle_diploma', 'secondary_or_below', 'vocational_training', ] if cats is None else cats[2],
#         'gender' : ['male', 'female', ] if cats is None else cats[3],
#     }
#     # default_intervals = {
#     #     "c" : ("disability", len(sample.disability.unique()) if sample is not None else  2), 
#     #     "c" : ("gender", len(sample.gender.unique()) if sample is not None else  2), 
#     #     "c" : ("education", len(sample.education.unique()) if sample is not None else  4), 
#     #     "c" : ("governorate", len(sample.governorate.unique()) if sample is not None else  12), 
#     # }
#     default_intervals = {"i" : (0,10), "f" : (0,100), "c" : ("governorate", 13), "d" : ("2020-01-01","2020-12-31")}
#     rng = np.random.default_rng(seed)

#     first_c = default_intervals["c"][0]
#     categories_names = cycle([first_c] + [c for c in categories_dict.keys() if c != first_c])
#     default_intervals["c"] = (categories_names, default_intervals["c"][1])
    
#     if isinstance(col_names, list):
#         assert len(col_names) == len(cols), f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements"
#     elif col_names is None:
#         suffix = {"c" : "cat", "i" : "int", "f" : "float", "d" : "date"}
#         col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]

#     if isinstance(intervals,list):
#         assert len(intervals) == len(cols), f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements"
#     else:
#         if isinstance(intervals,dict):
#             assert len(set(intervals.keys()) - set(default_intervals.keys())) == 0, f"The intervals parameter has invalid keys"
#             default_intervals.update(intervals)
#         intervals = [default_intervals[col] for col in cols]
#     df = pd.DataFrame()
#     for col, col_name, interval in zip(cols, col_names, intervals):
#         if interval is None:
#             interval = default_intervals[col]
#         assert (len(interval) == 2 and isinstance(interval, tuple)) or isinstance(interval, list), f"This interval {interval} is neither a tuple of two elements nor a list of strings."
#         if col in ("i","f","d"):
#             start, end = interval
#         if col == "i":
#             df[col_name] = rng.integers(start, end, size)
#         elif col == "f":
#             df[col_name] = np.round(rng.uniform(start, end, size), 3)
#         elif col == "c":
#             if isinstance(interval, list):
#                 categories = np.array(interval)
#             else:
#                 cat_family, length = interval
#                 if isinstance(cat_family, cycle):
#                     cat_family = next(cat_family)
#                 print(cat_family)
#                 assert cat_family in categories_dict.keys(), f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}"
#                 print(categories_dict[cat_family], length)
#                 categories = rng.choice(categories_dict[cat_family], length, replace = False, shuffle = True)
#             df[col_name] = rng.choice(categories, size, shuffle = True)
#         elif col == "d":
#             df[col_name] = rng.choice(pd.date_range(start, end), size)
#     return df

In [None]:
# fake_df = generate_fake_dataframe(
#     counts, 
#     'ffcccc', 
#     col_names = ['experience', 'age', 'disability', 'governorate', 'education', 'gender',], 
#     intervals = [(0, 25), (15, 59.9), ('disability', 2), ('governorate', 13),  ('education', 4), ('gender', 2)], 
#     seed = 42,
# )
# fake_df

In [None]:
# def experience_code(value):
#     if value > 15:
#         return 20
#     elif value > 10:
#         return 15
#     elif value > 5:
#         return 10
#     elif value > 1:
#         return 5
#     elif value > 0.5:
#         return 1
#     else:
#         return 0
    
# # - Age:
# def age_code(value):
#     return (np.round(value / 10) * 10).astype(int)
    

    
# fake_df['categorized_age'] = (np.round(fake_df.age / 10) * 10).astype(int)
# fake_df['categorized_experience'] = fake_df.experience.apply(experience_code)
# fake_df[CLUSTER] = -1
# fake_df

In [None]:
# def fill_clusters_based_examples(dct, data):
#     count = 0
#     for (experience, age, governorate, disability, education, gender), cluster in dct.items():
#         print(experience, age, governorate, disability, education, gender, cluster)
#         con = np.logical_and(
#                 data.categorized_experience == experience,
#                 np.logical_and(
#                     data.categorized_age == age,
#                     np.logical_and(
#                         data.governorate == governorate,
#                         np.logical_and(
#                             data.disability == disability,
#                             np.logical_and(
#                                 data.education == education,
#                                 data.gender == gender
#                             )
#                         )
#                     )
#                 )
#             )
#         data.loc[con, CLUSTER] = cluster
#         # display(data[con])
#         print(data[con].shape)
#         count += 1
#         print(f'--------- Round {count} done --------')
#     return data
# tmp = fill_clusters_based_examples(CONVERSION_DICT, fake_df)
# tmp 

In [None]:
# x = tmp.drop(tmp[tmp.clusters == -1].index)
# x.drop(['categorized_age', 'categorized_experience'], axis=1, inplace=True)
# x.reset_index(drop=True, inplace=True)
# x

In [None]:
# final_df = pd.concat([final_df, x], ignore_index=True)
# final_df.to_csv('./data/dummy_final_outputs.csv', index=False, encoding='UTF-8')
# del fake_df, x, tmp, CONVERSION_DICT, df
# final_df

In [None]:
# print(final_df.info())
# final_df.groupby('clusters').hist()


# # from scipy import stats
# # plt.hist(stats.boxcox(final_df.experience + 1)[0])

In [None]:
# Encoding plan for the classifier experiment: nothing is one-hot encoded, the
# four categorical fields are label encoded.
ONE_HOT_ENCODED_FEATURES = []
LABEL_ENCODED_FEATURES = [GENDER, DISABILITY, GOVERNORATE, EDUCATION,] #  AGE, EXPERIENCE, 

# Every column that encode() is asked to process (one-hot first, then label).
COLUMNS_TO_ENCODE = ONE_HOT_ENCODED_FEATURES + LABEL_ENCODED_FEATURES
print(ONE_HOT_ENCODED_FEATURES, LABEL_ENCODED_FEATURES)

def encode(data, columns, onehot, labeled, ref=None, store=False):
    """Fit one encoder per column and replace the column with its encoding.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. Label-encoded columns are overwritten in place on the
        object passed in; a one-hot column makes the function continue with a
        new frame, so always use the returned frame.
    columns : iterable of str
        Columns to encode, processed in order.
    onehot, labeled : container of str
        Columns to one-hot encode / label encode. ``labeled`` is checked first.
    ref : dict, optional
        Mapping ``column -> fitted encoder`` to extend. A fresh dict is created
        when omitted, so encoders no longer leak between calls.
    store : bool
        When true, each fitted encoder is dumped to
        ``MODELS_OUTPUT_PATH/<column>_encoder.joblib``.

    Returns
    -------
    tuple
        ``(encoded_frame, ref)``.

    Raises
    ------
    ValueError
        If a column is in neither ``labeled`` nor ``onehot``.
    """
    # A mutable default (`ref={}`) is shared by every call that omits it.
    if ref is None:
        ref = {}

    for col in columns:
        if col in labeled:
            data[col] = data[col].astype(object)
            encoder = LabelEncoder()
            data[col] = encoder.fit_transform(data[col])

        elif col in onehot:
            data[col] = data[col].astype(object)
            # NOTE(review): `sparse=` was renamed `sparse_output=` in newer
            # scikit-learn; kept because the encoders loaded earlier in this
            # notebook print as OneHotEncoder(sparse=False).
            encoder = OneHotEncoder(sparse=False)
            temp = pd.DataFrame(
                encoder.fit_transform(data[[col]]),
                columns=list(encoder.get_feature_names_out()),
                # Reuse the source index: concat(axis=1) aligns on index, and a
                # fresh RangeIndex misaligns any frame that was filtered first.
                index=data.index,
            )
            data = pd.concat([data, temp], axis=1).drop(col, axis=1)
            del temp

        else:
            raise ValueError(f"'{col}' can't be encoded.")

        ref[col] = encoder
        if store:
            # Name the file after the column. The encoder's repr was used before,
            # so every encoder of the same class overwrote the same file.
            joblib.dump(encoder, os.path.join(MODELS_OUTPUT_PATH, f'{col}_encoder.joblib'))

    return data, ref

In [None]:
# final_df.age = final_df.age.apply(age_code)
# final_df[EXPERIENCE] = final_df[EXPERIENCE].apply(experience_code)
# final_df

In [None]:
# encode() overwrites label-encoded columns on the frame it receives. Passing a
# copy keeps `final_df` raw, so re-running this cell does not re-encode columns
# that were already encoded.
final_dfe, enc = encode(
    data=final_df.copy(),
    columns=COLUMNS_TO_ENCODE,
    onehot=ONE_HOT_ENCODED_FEATURES,
    labeled=LABEL_ENCODED_FEATURES,
)
print(enc)
final_dfe

In [None]:
# final_dfe.groupby(CLUSTER).count()

In [None]:
# Keep only the rows whose encoded gender is 1.
# NOTE(review): presumably 1 means 'male' under the fitted LabelEncoder -- verify via enc.
temp = final_dfe.loc[final_dfe.gender == 1]

subset_features = temp.drop(CLUSTER, axis=1)
subset_target = temp[CLUSTER]

# 85/15 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    subset_features,
    subset_target,
    test_size=0.15,
    random_state=42,
)
print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV


# from sklearn.svm import LinearSVC
# from sklearn.kernel_approximation import Nystroem
# from sklearn.linear_model import SGDClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import CategoricalNB
# from sklearn.neighbors import KNeighborsClassifier

# from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier()

In [None]:
# Single-step pipeline around a random forest. The step keeps the historical
# name 'dec_tree', so every grid key below is prefixed with 'dec_tree__'.
# std_slc = StandardScaler()
# dec_tree = tree.DecisionTreeClassifier()
dec_tree = RandomForestClassifier(n_jobs=-1)
# print(dec_tree.get_params().keys())
pipe = Pipeline(steps=[
    # ('std_slc', std_slc),
    ('dec_tree', dec_tree)
])


# Candidate values for the search.
# random_state = [None, 0, 1, 42]
criterion = ['gini', 'entropy', ]
# splitter = ['best', 'random']
max_depth = [6, 7, 8, 9, ]
min_samples_split = [50, 500, 1500, ]
min_samples_leaf = [800, 1300, 1800, ]
# 'auto' was an alias of 'sqrt' for classifiers and is rejected by newer
# scikit-learn releases; 'sqrt' is the same setting and works on all versions.
max_features = ['sqrt', None]
class_weight = [None, 'balanced',]
n_estimators =  [25, 100, 150, ]
oob_score = [True, False]
bootstrap = [True]

parameters = dict(
    # dec_tree__random_state=random_state,
    dec_tree__criterion=criterion,
    # dec_tree__splitter=splitter,
    dec_tree__max_depth=max_depth,
    dec_tree__min_samples_split=min_samples_split,
    dec_tree__min_samples_leaf=min_samples_leaf,
    dec_tree__max_features=max_features,
    dec_tree__class_weight=class_weight,
    dec_tree__n_estimators=n_estimators,
    dec_tree__oob_score=oob_score,
    dec_tree__bootstrap=bootstrap,
    # dec_tree__n_jobs=n_jobs
)

In [None]:
# Successive-halving grid search over the pipeline defined above.
search_options = dict(
    cv=5,
    factor=2,
    min_resources='exhaust',
    aggressive_elimination=True,
)
clf_GS = HalvingGridSearchCV(pipe, parameters, **search_options)
clf_GS.fit(x_train, y_train)

# Report the winning value of each tuned hyper-parameter, one per line.
best_settings = clf_GS.best_estimator_.get_params()
for setting in (
    'criterion',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'max_features',
    'class_weight',
):
    print(f'Best {setting}:', best_settings[f'dec_tree__{setting}'])
print()
print(best_settings['dec_tree'])

# Last 20 candidate rows of the search log.
pd.DataFrame(clf_GS.cv_results_).tail(20)

In [None]:
# model = DecisionTreeClassifier(
#     criterion='entropy', 
#     max_depth=5, 
#     min_samples_split= 800,
#     min_samples_leaf= 750,
#     splitter='best',
#     max_features=None,
#     class_weight=None,
# )


# Final model with hand-picked hyper-parameters.
model = RandomForestClassifier(
    max_depth=9, 
    min_samples_leaf=700, 
    min_samples_split=50,
    n_jobs=-1,
    # 'auto' was an alias of 'sqrt' for classifiers and is rejected by newer
    # scikit-learn releases; 'sqrt' is the same setting and works on all versions.
    max_features='sqrt'
)
model.fit(x_train, y_train)

# Accuracy and per-class report on the training split, then on the held-out split.
y_pred = model.predict(x_train)
print("Model Accuracy on TRAINING data:", accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))
print('-' * 60)
y_pred = model.predict(x_test)
print("Model Accuracy on TESTING data:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Tree rendering only applies to a single decision tree; the cell does nothing
# when `model` is any other estimator type.
if isinstance(model, DecisionTreeClassifier):
    text_representation = tree.export_text(model)
    # print(text_representation)

    fig = plt.figure(figsize=(25,20))
    plot_options = dict(
        feature_names=final_dfe.drop(CLUSTER, axis=1).columns,
        class_names=['0', '1', '2', ],
        filled=True,
        # proportion=True,
        fontsize=7,
        impurity=False,
        node_ids=True,
    )
    _ = tree.plot_tree(model, **plot_options)

In [None]:
# Read the stored output file and keep the feature columns followed by the target.
selected_columns = [*API_ATTRIBUTES, *API_TARGET]
df = pd.read_csv(DATA_OUTPUT_PATH)[selected_columns]
# df.experience = df.experience.astype(object)
# df.age = df.age.astype(object)
df.info()

In [None]:
# Apply the already-fitted encoders to every column that has one.
encodable_columns = [name for name in df.columns if name in enc.keys()]
for name in encodable_columns:
    df[name] = enc[name].transform(df[name])
df.head()

In [None]:
# Accuracy of the fitted model over the whole encoded output file.
expected_clusters = df[CLUSTER]
predicted_clusters = model.predict(df.drop([CLUSTER], axis=1))
print(accuracy_score(expected_clusters, predicted_clusters))

In [None]:
# predect_node_for_missing_value(api_model, [0, 20, 0, 1, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])

In [None]:
# for encoder in API_ENCODERS:
#     joblib.dump(API_ENCODERS[encoder], os.path.join(MODELS_OUTPUT_PATH, f'{encoder}_encoder.joblib'))
# joblib.dump(api_model, os.path.join(MODELS_OUTPUT_PATH, 'model.joblib'))

In [None]:
# Load the testing file and keep its original (capitalised) column names.
testing_columns = ['Experience', 'Age', 'Governorate', 'Disability', 'Educations', 'Gender', 'Cluster.1']
df = pd.read_csv('./data/testing.csv').loc[:, testing_columns]
df

In [None]:
# Map the raw education labels of the testing file onto the categories used in
# training. Previously 'bachelor' was first rewritten to 'secondary_or_below',
# so the later 'bachelor' -> 'bachelor_or_above' rule could never match.
# 'bachelor' now maps to 'bachelor_or_above' only.
# NOTE(review): confirm this is the intended bucket for 'bachelor'.
education_fixes = {
    'high_school': 'secondary_or_below',
    'deploma': 'middle_diploma',  # spelling as it appears in the data
    'high_diploma': 'middle_diploma',
    'bachelor': 'bachelor_or_above',
    'master': 'bachelor_or_above',
    'phd': 'bachelor_or_above',
}
for raw_label, canonical_label in education_fixes.items():
    df.loc[df.Educations == raw_label, 'Educations'] = canonical_label
df.Educations.unique()

In [None]:
# Rewrite the governorate spellings of the testing file to the training spellings.
# NOTE(review): 'ajloun' is folded into 'amman' although 'governorate_ajloun' is
# a category of its own elsewhere in this notebook -- confirm this is intended.
governorate_fixes = {
    'kark': 'al_kirk',
    'mafraq': 'al_mafraq',
    'irbed': 'irbid',
    'jerash': 'jarash',
    'aqaba': 'al_aqaba',
    'madaba': 'maadaba',
    'ajloun': 'amman',
}
for raw_name, canonical_name in governorate_fixes.items():
    df.loc[df.Governorate == raw_name, 'Governorate'] = canonical_name
df.Governorate.unique()

In [None]:
# Keep only the rows in which every column has a value.
complete_rows = df.notnull().all(axis=1)
full = df[complete_rows]
full

In [None]:
# full.Age = full.Age.apply(age_code).astype(object)
# full.Experience = full.Experience.apply(experience_code).astype(object)
# Work on an independent frame so the assignments below do not write into a
# slice of `df`.
full = full.copy()

# Testing-file column -> key of the fitted encoder in `enc`. 'Educations' needs
# an explicit entry because its lower-cased name is not an encoder key.
column_to_encoder = {col: col.lower() for col in full.columns if col.lower() in enc.keys()}
column_to_encoder['Educations'] = 'education'

for col, encoder_key in column_to_encoder.items():
    # transform(), not fit_transform(): re-fitting on the test data overwrites
    # the fitted encoder and can assign different codes than those the model
    # was trained on. transform() raises on labels unseen during fitting.
    full[col] = enc[encoder_key].transform(full[col])
full.head()

In [None]:
# Accuracy on the testing rows whose encoded gender is 1.
gender_one_rows = full[full.Gender == 1]
gender_one_features = gender_one_rows.drop(['Cluster.1'], axis=1)
print(accuracy_score(gender_one_rows['Cluster.1'], model.predict(gender_one_features)))

In [None]:
# Bin age and experience, then score the model again on the gender == 1 rows.
# NOTE(review): in this part of the notebook age_code / experience_code appear
# only inside a commented-out cell -- confirm they are defined earlier.
full['Age'] = full['Age'].apply(age_code)
full['Experience'] = full['Experience'].apply(experience_code)


binned_rows = full[full.Gender == 1]
binned_features = binned_rows.drop(['Cluster.1'], axis=1)
print(accuracy_score(binned_rows['Cluster.1'], model.predict(binned_features)))

In [None]:
# full['categorized_experience']  = full.Experience.apply(experience_code)
# full['categorized_age'] = full.Age // 10 * 10
# full

# def fill_clusters_based_examples(dct, data):
#     count = 0
#     for (experience, age, governorate, disability, education, gender), cluster in dct.items():
#         print(experience, age, governorate, disability, education, gender, cluster)
#         con = np.logical_and(
#                 data.categorized_experience == experience,
#                 np.logical_and(
#                     data.categorized_age == age,
#                     np.logical_and(
#                         data.Governorate == governorate,
#                         np.logical_and(
#                             data.Disability == disability,
#                             np.logical_and(
#                                 data.Educations == education,
#                                 data.Gender == gender
#                             )
#                         )
#                     )
#                 )
#             )
#         data.loc[con, CLUSTER] = cluster
#         # display(data[con])
#         print(data[con].shape)
#         count += 1
#         print(f'--------- Round {count} done --------')
#     return data
# fill_clusters_based_examples(CONVERSION_DICT, full)

In [None]:
# 1278
# 12M ~ 1M
# 2 * 2 * 4 * 13 * 6 * 6