### Level 5 Variables Creation

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [5]:
# Load your data
pd.set_option('display.max_columns', None)
dna = pd.read_parquet('dna_pw_20250225.parquet')
dna = dna.drop(columns=['CL_ID2', 'CL_ID4', 'HMO_MEMBER']) # Remove HMO_MEMBER due to insufficient data
dna.head()

Unnamed: 0,MASKED_ID_NUM,IDV_OCP_TYP_ID,NBR_DPND,GENERATION,GENDER,MARITAL_STATUS,DIGITAL_FLAG,RISK_APPETITE,TRAVELLER,GEODIVERSITY,REGION,HOSPITAL_PAYOR,ENVIRONMENTAL_AFF,HUMANITARIAN_AFF,RELIGIOUS_AFF,FILCHI_CLUB,OF_CLUB,RETIREES_CLUB,MILLENNIAL_CLUB,EXECUTIVES_CLUB,PROFESSIONAL_CLUB,NEW_MERCH_NAME,TXN_AMT_TOT,TXN_AMT_AVE,TXN_CNT,MOST_COMMON_INDUSTRY
0,4016083633,STUDENT,3.0,BOOMERS,FEMALE,MARRIED,TRADITIONAL,AGGRESSIVE,HIGH,DIVERSE,NATIONAL CAPITAL REGION,NO_DATA,N,N,N,N,Y,N,Y,N,N,IN ROOM DINING,80099.0,14531.0,1.0,Record Stores
1,4016083633,SELFEMPLOYED,3.0,GEN_X,MALE,MARRIED,DIGITAL,AGGRESSIVE,NO_DATA,SINGLE,NATIONAL CAPITAL REGION,LOW,N,N,N,N,Y,Y,N,N,N,VANS 756,41075.0,2753.0,2.0,Drug Stores And Pharmacies
2,4016083633,SELFEMPLOYED,3.0,BOOMERS,FEMALE,MARRIED,DIGITAL,NO_DATA,NO_DATA,DIVERSE,NATIONAL CAPITAL REGION,MID,N,N,N,N,Y,Y,N,N,N,ZHIGUANKEJI,0.0,14504.0,2.0,Grocery Stores And Supermarkets
3,4016083633,EMPLOYED,3.0,BOOMERS,FEMALE,MARRIED,DIGITAL,NO_DATA,NO_DATA,MIGRATORY,NATIONAL CAPITAL REGION,NO_DATA,N,N,N,N,Y,Y,N,Y,N,ST. LUKE'S MEDICAL CENTER,0.0,5338.0,2.0,Education
4,4016083633,SELFEMPLOYED,3.0,GEN_X,FEMALE,MARRIED,DIGITAL,NO_DATA,HIGH,DIVERSE,NATIONAL CAPITAL REGION,NO_DATA,N,N,N,N,N,N,N,N,N,NETFLIX,12695.0,0.0,2.0,No Industry Label


In [7]:
print(dna['RISK_APPETITE'].unique())
print(dna['GEODIVERSITY'].unique())
print(dna['DIGITAL_FLAG'].unique())
print(dna['HOSPITAL_PAYOR'].unique())
print(dna['ENVIRONMENTAL_AFF'].unique())

['AGGRESSIVE' 'NO_DATA' 'MODERATELY_CONSERVATIVE' 'MODERATELY_AGGRESSIVE'
 'CONSERVATIVE']
['DIVERSE' 'SINGLE' 'MIGRATORY' 'NO_DATA']
['TRADITIONAL' 'DIGITAL' 'NO_DATA']
['NO_DATA' 'LOW' 'MID' 'HIGH']
['N' 'Y']


#### Data Pre-Processing

In [10]:
risk_mapping = {
    'NO_DATA': -1,  # Assign -1 to indicate missing data
    'CONSERVATIVE': 1,
    'MODERATELY_CONSERVATIVE': 2,
    'MODERATELY_AGGRESSIVE': 3,
    'AGGRESSIVE': 4
}

dna['RISK_APPETITE'] = dna['RISK_APPETITE'].map(risk_mapping)

In [12]:
hospital_payor_mapping = {
    'NO_DATA': -1,  # Assign -1 to indicate missing data
    'LOW': 1,
    'MID': 2,
    'HIGH': 3
}

dna['HOSPITAL_PAYOR'] = dna['HOSPITAL_PAYOR'].map(hospital_payor_mapping)

In [14]:
geodiversity_mapping = {
    'NO_DATA': -1,
    'SINGLE': 1,
    'MIGRATORY': 2,
    'DIVERSE': 3
}

dna['GEODIVERSITY'] = dna['GEODIVERSITY'].map(geodiversity_mapping)

In [16]:
digital_flag_mapping = {
    'NO_DATA': -1,
    'TRADITIONAL': 0,
    'DIGITAL': 1
}

dna['DIGITAL_FLAG'] = dna['DIGITAL_FLAG'].map(digital_flag_mapping)

In [18]:
binary_cols = ['ENVIRONMENTAL_AFF', 'HUMANITARIAN_AFF', 'RELIGIOUS_AFF', 'FILCHI_CLUB', 'OF_CLUB', 
               'RETIREES_CLUB', 'MILLENNIAL_CLUB', 'EXECUTIVES_CLUB', 'PROFESSIONAL_CLUB']

for col in binary_cols:
    dna[col] = dna[col].map({'Y': 1, 'N': 0})

In [20]:
dna = pd.get_dummies(dna, columns=['GENERATION', 'GENDER'], drop_first=True)

In [22]:
dna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85142 entries, 0 to 85141
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   MASKED_ID_NUM         85142 non-null  int64  
 1   IDV_OCP_TYP_ID        85142 non-null  object 
 2   NBR_DPND              85142 non-null  float64
 3   MARITAL_STATUS        85142 non-null  object 
 4   DIGITAL_FLAG          85142 non-null  int64  
 5   RISK_APPETITE         85142 non-null  int64  
 6   TRAVELLER             85142 non-null  object 
 7   GEODIVERSITY          85142 non-null  int64  
 8   REGION                85142 non-null  object 
 9   HOSPITAL_PAYOR        85142 non-null  int64  
 10  ENVIRONMENTAL_AFF     85142 non-null  int64  
 11  HUMANITARIAN_AFF      85142 non-null  int64  
 12  RELIGIOUS_AFF         85142 non-null  int64  
 13  FILCHI_CLUB           85142 non-null  int64  
 14  OF_CLUB               85142 non-null  int64  
 15  RETIREES_CLUB      

In [24]:
# Selecting relevant features (excluding categorical object-type features)
numerical_cols = ['NBR_DPND', 'DIGITAL_FLAG', 'RISK_APPETITE', 'GEODIVERSITY', 'HOSPITAL_PAYOR',
                  'ENVIRONMENTAL_AFF', 'HUMANITARIAN_AFF', 'RELIGIOUS_AFF', 'FILCHI_CLUB', 'OF_CLUB', 'RETIREES_CLUB', 
                  'MILLENNIAL_CLUB', 'EXECUTIVES_CLUB', 'PROFESSIONAL_CLUB',  'TXN_AMT_TOT', 'TXN_AMT_AVE', 'TXN_CNT', 
                  'GENERATION_GEN_X', 'GENERATION_GEN_Y',  'GENERATION_GEN_Z', 'GENERATION_NO_DATA', 'GENDER_MALE', 'GENDER_UNVERIFIED']

In [26]:
# Standardizing numerical columns
scaler = StandardScaler()
dna_scaled = scaler.fit_transform(dna[numerical_cols])

#### Merchant Similarity Score

In [32]:
# Create a bipartite graph
G = nx.Graph()

# Add nodes
customers = dna['MASKED_ID_NUM'].unique()
merchants = dna['NEW_MERCH_NAME'].unique()
G.add_nodes_from(customers, bipartite=0)  # Customer nodes
G.add_nodes_from(merchants, bipartite=1)  # Merchant nodes

# Add edges based on transactions
for _, row in dna.iterrows():
    G.add_edge(row['MASKED_ID_NUM'], row['NEW_MERCH_NAME'], weight=row['TXN_AMT_TOT'])

In [40]:
print('\n'.join(dna['NEW_MERCH_NAME'].unique()))

IN ROOM DINING
VANS 756
ZHIGUANKEJI
ST. LUKE'S MEDICAL CENTER
NETFLIX
CEBU PACIFIC
SOUTH SUPERMARKET
G CENTER AZUELA D S DV
PETRON
MAKATI SUPT
STN WDF MAIN
SPORTINO CLERIGOS
DEF/22274034/
PCK EL MOLITO
TRUE VALUE
TUMI
M BAKERY
BDO/70096685/
STROKESBYMOMOI SUPE VE
GRAB
SPOTIFY
VIATOR EU PHP
SKYRENTAL
BDO/00664921/
LYMA BRISKET CIDENG TIMUR
DEF/30048995/D
GCASH
SHELL
ROBINSONS DEPARTMENT STORE
BROWNS CRAFTHOUSE SEYM
KAWAGUCHIKO STATION
YO SUSHI
MERALCO
BDO/00091049/
BLUEWATER PANGLAO
MYEG
AIA PHILS
TGP INTERNATIONAL DMCC
ALL ABOUT BAKING
MAYNILAD WATER SERVICES INC
ANIMAL KINGDOM
BDO/00446977/
VANITY SALON R MAGNOL
BDO/00008508/
DEF/73330948/T
DEF/30024468/D
BPI EXPRESS CARD CORP
MOS BURGER
UCC
REAL CANADIAN SUPERSTORE
PICK A ROO
WCD EDUCATION & DEVELOPME
NCCC SUPERMARKET
DEF/60854248/C
SUMO SALAD GREEN ON GR
STRADIVARIUS
GPP/51139779/-
MARKETPLACE|MP
ETSY COM
EMERALD SEAFOOD PALACE
MO COOKIES POWERPLANT
NWR VICTORIA
MAZENDO
ST. PAUL LIBRERIA
SIGNET
SKYROAM INC
SINGAPORE AIRLINES
ACE HAR

In [50]:
# Compute Personalized PageRank for a sample merchant

#Use case: Cebu Pacific
cebu_pac = merchants[5] # merchants[5] refers to Cebu Pacific
pagerank_scores = nx.pagerank(G, personalization={cebu_pac: 1})

# Extract top similar merchants
similar_merchants = sorted([(m, score) for m, score in pagerank_scores.items() if m in merchants], key=lambda x: -x[1])[:10]

# Output recommendations
print("Top 10 similar merchants to:", cebu_pac)
print(similar_merchants)

Top 10 similar merchants to: CEBU PACIFIC
[('CEBU PACIFIC', 0.17442494940308148), ('DEF/22285424/B', 0.010818064097717912), ("NONO'S", 0.007939132395623313), ('NO_DATA', 0.006706942937657524), ('TIM HO WAN', 0.005326920383295884), ('AMAZON', 0.004200931934152568), ('TUDOR PODIUM', 0.004092846951795025), ('BPI EXPRESS CARD CORP', 0.003170618035708571), ('THREE SALCEDO PLACE CONDO CORP', 0.0030031628456018418), ('GLOBE TELECOM', 0.0028975642578114285)]


In [62]:
#Use case: Netflix
netflix = merchants[4] # merchants[4] refer to Netflix
pagerank_scores = nx.pagerank(G, personalization={netflix: 1})

# Extract top similar merchants
similar_merchants = sorted([(m, score) for m, score in pagerank_scores.items() if m in merchants], key=lambda x: -x[1])[:10]

# Output recommendations
print("Top 10 similar merchants to:", netflix)
print(similar_merchants)

Top 10 similar merchants to: NETFLIX
[('NETFLIX', 0.18272931227094513), ('AXILLEION', 0.01184556312603202), ('MANILA WATER COMPANY INC', 0.005932240316571402), ('NO_DATA', 0.0038282804449895607), ('GCASH', 0.0035769247746589438), ("CONTI'S", 0.0030437236619705937), ('HILTON GIFT SHOP', 0.002923803012333296), ('AMK US OPEN MERCHANDISE', 0.002847691205461593), ('MERCURY DRUG', 0.00277419994459831), ('GRAB', 0.0027430731998549845)]
