In [None]:
import os
import glob
import logging
import time

import matplotlib.pyplot as plt
import plotly.express as px

import numpy as np
import pandas as pd
from clustergram import Clustergram as CGram

import scipy
from scipy import signal
from scipy import stats

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression as LR

# Dimensionality reduction and clustering

## Preprocessing

#### Raw/norm data frames

In [None]:
# Load the source table once; every working frame below is derived from it.
pf_df = pd.read_csv('???')

# abs_df: independent copy of the first 12 columns (selected by position).
abs_df = pf_df.iloc[:, :12].copy()

# norm_df: the source table without the columns at positions 9, 10 and 11.
norm_df = pf_df.drop(columns=list(pf_df.columns[9:12]))

# opti_df: untouched duplicate of the whole source table.
opti_df = pf_df.copy()

# Frame currently selected as the working one (same object as abs_df).
work_df = abs_df

#### Feature importance estimation

In [None]:
# https://python-bloggers.com/2021/01/3-essential-ways-to-calculate-feature-importance-in-python/

# Feature matrix: every abs_df column from position 6 onwards.
abs_features_vals = abs_df.iloc[:, 6:].values

# PCA with no component limit; scikit-learn's fit() returns the estimator
# itself, so abs_features_pca refers to the fitted abs_pca object.
abs_pca = PCA()
abs_features_pca = abs_pca.fit(abs_features_vals)

print(abs_features_pca)

# Running total of the explained-variance ratio across components.
cumulative_variance = abs_features_pca.explained_variance_ratio_.cumsum()
plt.plot(cumulative_variance, lw=3, color='#087E8B')
plt.show()


## PCA

#### PCA calc

In [None]:
def PCA_calc(features_vals, factor_df, n=2):
    """Project features onto ``n`` principal components and attach the factors.

    Args:
        features_vals: 2-D array-like of numeric features, one row per sample.
        factor_df: pandas DataFrame of factor columns, one row per sample, in
            the same row order as ``features_vals``.
        n: Number of principal components to keep. Only 2 and 3 are supported.

    Returns:
        DataFrame with the columns of ``factor_df`` followed by the component
        scores in columns ``x``, ``y`` (plus ``z`` when ``n == 3``), carrying
        the index of ``factor_df``.

    Raises:
        ValueError: If ``n`` is neither 2 nor 3. pandas also raises ValueError
            when ``factor_df`` and ``features_vals`` have different row counts.
    """
    axes_by_dim = {2: ['x', 'y'], 3: ['x', 'y', 'z']}
    if n not in axes_by_dim:
        # The old code only logged here and then failed later on an unbound
        # ``ax_list``; fail immediately with an explicit, catchable error.
        logging.fatal('Incorrect dimension number!')
        raise ValueError(f'n must be 2 or 3, got {n!r}')
    ax_list = axes_by_dim[n]

    tic = time.perf_counter()

    pca = PCA(n_components=n)
    principal_res = pca.fit_transform(features_vals)
    # pd.concat(axis=1) aligns on index labels, so the scores must carry the
    # same index as factor_df; a fresh RangeIndex would misalign the rows (and
    # pad with NaN) whenever factor_df was filtered or re-indexed.
    df_principal = pd.DataFrame(data=principal_res,
                                columns=ax_list,
                                index=factor_df.index)
    df_principal = pd.concat([factor_df, df_principal], axis=1)

    toc = time.perf_counter()
    logging.info(f'PCA calc in {toc - tic:0.4f} seconds')
    return df_principal

# Split abs_df by position: the first six columns are passed as factors,
# everything from column 6 onwards as the feature matrix.
n_components = 3
factor_df = abs_df.iloc[:, :6]
features_vals = abs_df.iloc[:, 6:].values

pca_df = PCA_calc(features_vals, factor_df, n_components)

#### PCA plot

In [None]:
# Column of pca_df used for both the colour and the marker symbol.
group_factor = 'app_group'

# Choose the scatter type and marker size from the number of components.
if n_components == 2:
    fig = px.scatter(pca_df,
                     x='x', y='y',
                     color=group_factor,
                     symbol=group_factor)
    dot_size = 6
elif n_components == 3:
    fig = px.scatter_3d(pca_df,
                        x='x', y='y', z='z',
                        color=group_factor,
                        symbol=group_factor)
    dot_size = 2
else:
    # Previously this only logged and the cell then died with a NameError on
    # ``fig``; stop here with an explicit error instead.
    logging.fatal('Incorrect n')
    raise ValueError(f'n_components must be 2 or 3, got {n_components!r}')

# Apply the per-dimension marker size chosen above (it used to be computed and
# then ignored in favour of a hard-coded 5).
fig.update_traces(marker=dict(size=dot_size))
# The ``scene`` settings style the 3-D axes: labels, ticks, background and
# titles are all switched off.
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
                  legend={'itemsizing': 'constant'},
                  scene=dict(xaxis=dict(showaxeslabels=False, showticklabels=False, showbackground=False, title=''),
                             yaxis=dict(showaxeslabels=False, showticklabels=False, showbackground=False, title=''),
                             zaxis=dict(showaxeslabels=False, showticklabels=False, showbackground=False, title='')))
fig.show()

## LDA

#### LDA calc

In [None]:
def LDA_calc(features_vals, factor_df, group_column=None):
    """Fit an LDA on the features and return the first discriminant per sample.

    Args:
        features_vals: 2-D array-like of numeric features, one row per sample.
        factor_df: pandas DataFrame of factor columns, one row per sample, in
            the same row order as ``features_vals``.
        group_column: Name of the ``factor_df`` column holding the class labels
            the LDA is fitted against. Required despite the ``None`` default.

    Returns:
        DataFrame with the columns of ``factor_df`` plus an ``lda`` column
        containing the first discriminant component, carrying the index of
        ``factor_df``.

    Raises:
        ValueError: If ``group_column`` is not provided.
    """
    if group_column is None:
        # The default used to fall through to an opaque lookup failure.
        raise ValueError('group_column is required: pass the name of the '
                         'factor_df column that holds the class labels')

    tic = time.perf_counter()

    group_vals = factor_df.loc[:, group_column].values

    lda = LDA()
    lda_fit = lda.fit_transform(X=features_vals, y=group_vals)
    # Keep only the first discriminant. Give it factor_df's index so that
    # pd.concat(axis=1), which aligns on index labels, pairs each score with
    # its own row even when factor_df does not have a default RangeIndex.
    df_lda = pd.DataFrame({'lda': lda_fit[:, 0]}, index=factor_df.index)
    df_lda = pd.concat([factor_df, df_lda], axis=1)

    toc = time.perf_counter()
    # Message fixed: it used to say "PCA calc" (copy-paste from PCA_calc).
    logging.info(f'LDA calc in {toc - tic:0.4f} seconds')
    return df_lda

# Split norm_df by position: first six columns as factors, the rest as features.
lda_factors = norm_df.iloc[:, :6]
features_vals = norm_df.iloc[:, 6:].values
lda_df = LDA_calc(features_vals, lda_factors, 'app_group')

# Box plot of the 'lda' column per app_group, with every sample drawn as a point.
fig = px.box(lda_df, x='app_group', y='lda', color='app_group', points='all')
fig.show()

## Clustergram

In [None]:
# Candidate cluster counts 1 through 15; n_init=1000 is passed on as given.
cluster_counts = range(1, 16)
cgram = CGram(cluster_counts, n_init=1000)

# Fit on the norm_df columns from position 6 onwards.
cgram.fit(norm_df.iloc[:, 6:])

In [None]:
# Draw the fitted clustergram, then switch off the y-axis grid lines.
ax = cgram.plot(figsize=(10, 8))
ax.grid(False, axis='y')