## Import Libraries

In [1]:
import sys
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd

_root = Path.cwd()

if not _root.joinpath("prometeus").exists():
    sys.path.insert(0, "../")
    
from prometeus.pca.analyzer import PCAnalyzer

## Normalize Dataset

In [16]:
def preprocess_data_cluster(df: pd.DataFrame, clusters: List[int] = None) -> Tuple[pd.DataFrame, pd.Series]:
    df.dropna(how='any', inplace=True)
    if clusters is not None:
        df = df[df['Cluster'].isin(clusters)]
    clusters = df['Cluster'].astype(int).astype(str)
    if 'Unnamed: 0' not in df.columns:
        df.drop(columns=['SERVICE', 'MONTH_SQN', 'CUST_SQN', 'Cluster'], inplace=True)
    else:
        df.drop(columns=['Unnamed: 0', 'SERVICE', 'MONTH_SQN', 'CUST_SQN', 'Cluster'], inplace=True)
    return df, clusters

In [12]:
from sklearn.preprocessing import StandardScaler

normalize_col = ['SH_CUST_RATING', 'CUST_LVL_MOB_ARPU_6M', 'TENURE_MOBILE', 'CUST_LVL_MOB_CNT', 'CONTRACT_DURATION', 'SUBS_TENURE',
                'LIFETIME_CONTRACTS_CNT', 'AUSAGE_SOCIALNET', 'AUSAGE_VIDEO', 'AUSAGE_COMMUNICATIONS', 'AUSAGE_NETFLIX',
                'AUSAGE_ECOMMERCE', 'AUSAGE_GAMES', 'AUSAGE_MUSIC', 'AUSAGE_MAIL','AUSAGE_LIFESTYLE', 'AUSAGE_NEWS', 'AUSAGE_TRANSPORTATION',
                'AUSAGE_NEWS', 'AUSAGE_TRANSPORTATION', 'USAGE_SOCIALNET', 'USAGE_STREAMINGVIDEO', 'USAGE_GAMES', 'USAGE_ENTERTAINMENT',
                'USAGE_MUSIC', 'USAGE_TRANSPORTATION', 'USAGE_SHOPPING', 'USAGE_ECOMMERCE', 'USAGE_SPORTS', 'USAGE_NEWS', 'USAGE_TRAVEL']

def normalize(df: pd.DataFrame) -> pd.DataFrame:
    scaler = StandardScaler()
    scaler.fit(df[normalize_col])
    scaled_target = df[normalize_col].copy()

    df[normalize_col]= scaler.transform(scaled_target)
    return df.fillna(0)

## Get Clustered DataFrame

In [4]:
df = pd.read_csv('Data/clustered_data_20210414.csv')
df, clusters = preprocess_data_cluster(df)

clustered_df = PCAnalyzer(normalize(df)).get_clustering(by='cum_var', n_components=22, cluster_size=5)

In [5]:
# Show the column labels of the clustered frame; the displayed index ends with 'Cluster'.
clustered_df.columns

Index(['SH_CUST_RATING', 'CUST_LVL_MOB_ARPU_6M', 'TENURE_MOBILE',
       'CUST_LVL_MOB_CNT', 'CONTRACT_DURATION', 'LIFETIME_CONTRACTS_CNT',
       'AUSAGE_SOCIALNET', 'AUSAGE_VIDEO', 'AUSAGE_COMMUNICATIONS',
       'AUSAGE_NETFLIX',
       ...
       'MALAY', 'EURASIAN', '>=60', '25-34', '35-44', '45-54', '55-59',
       '18-24', '<18', 'Cluster'],
      dtype='object', length=143)

In [6]:
## each re-run will result in different output

clustered_df.Cluster.value_counts()

0    201040
1     77836
2     28387
3     11348
4      5901
Name: Cluster, dtype: int64

## Get Loadings DataFrame

In [13]:
loadings = PCAnalyzer(normalize(df)).get_loadings(by='cum_var', n_components=22, is_filter=True)

In [15]:
# Transposed view: one row per principal component (PC1, PC2, ...), one column per feature.
loadings.T

Unnamed: 0,USAGE_SPORTS,USAGE_SOCIALNET,USAGE_ECOMMERCE,AUSAGE_MAIL,USAGE_TRAVEL,AUSAGE_NETFLIX,AUSAGE_SOCIALNET,AUSAGE_TRANSPORTATION,USAGE_TRANSPORTATION,AUSAGE_GAMES,USAGE_GAMES,AUSAGE_NEWS,USAGE_NEWS
PC1,0.0193662,0.0209408,0.0136642,0.00596696,0.035536,0.0765754,0.136988,-0.00762105,-0.0157605,0.0576768,0.0471392,-0.0263986,0.003886
PC2,0.0558135,0.123467,0.15933,0.229387,0.241435,0.348012,0.613419,0.340055,0.289108,0.299056,0.236908,0.111265,0.063178
PC3,-0.0256349,-0.0268135,-0.00168475,0.000346493,0.027487,-0.045519,-0.0772229,0.870334,0.878439,-0.106221,-0.0867783,0.0426329,0.00542843
PC4,0.00866566,0.0331045,0.13132,0.162719,0.13534,0.0581129,0.100407,-0.254815,-0.28136,0.0194145,0.0216751,0.18406,0.104693
PC5,0.057458,0.0345617,-0.13738,-0.114916,-0.14724,-0.00833579,-0.0344975,0.0631431,0.0864034,0.358611,0.332182,0.00202263,0.0454071
PC6,-0.0779718,-0.0254106,-0.0114554,-0.0149273,-0.0173742,0.0554419,-0.0622779,-0.00151838,-0.00361312,0.636274,0.670649,-0.190228,-0.216152
PC7,0.0169056,-0.0190633,0.011944,0.0336337,0.0399355,-0.0877645,-0.180779,0.0111182,0.0134053,-0.0892354,-0.088026,0.396165,0.361069
PC8,0.132947,0.00294619,-0.0866071,0.158602,-0.0367729,0.0866407,0.0825364,-0.0181433,-0.0188333,0.147283,0.147622,0.578844,0.577158
PC9,-0.22512,-0.121844,0.0025134,0.00498895,0.140823,-0.0741069,-0.14711,0.00188969,0.00173978,0.136269,0.174568,0.216654,0.223969
PC10,-0.0310468,0.0267962,-0.295983,0.233174,0.228572,0.44153,0.229467,-0.0331635,-0.0403975,-0.0932979,-0.115932,-0.0847703,-0.162158
