### Level 5 Variables Creation

In [69]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [71]:
# Load your data
pd.set_option('display.max_columns', None)
dna = pd.read_parquet('dna_pw_20250225.parquet')
dna = dna.drop(columns=['CL_ID2', 'CL_ID4', 'HMO_MEMBER']) # Remove HMO_MEMBER due to insufficient data
dna.head()

Unnamed: 0,MASKED_ID_NUM,IDV_OCP_TYP_ID,NBR_DPND,GENERATION,GENDER,MARITAL_STATUS,DIGITAL_FLAG,RISK_APPETITE,TRAVELLER,GEODIVERSITY,REGION,HOSPITAL_PAYOR,ENVIRONMENTAL_AFF,HUMANITARIAN_AFF,RELIGIOUS_AFF,FILCHI_CLUB,OF_CLUB,RETIREES_CLUB,MILLENNIAL_CLUB,EXECUTIVES_CLUB,PROFESSIONAL_CLUB,NEW_MERCH_NAME,TXN_AMT_TOT,TXN_AMT_AVE,TXN_CNT,MOST_COMMON_INDUSTRY
0,4016083633,STUDENT,3.0,BOOMERS,FEMALE,MARRIED,TRADITIONAL,AGGRESSIVE,HIGH,DIVERSE,NATIONAL CAPITAL REGION,NO_DATA,N,N,N,N,Y,N,Y,N,N,IN ROOM DINING,80099.0,14531.0,1.0,Record Stores
1,4016083633,SELFEMPLOYED,3.0,GEN_X,MALE,MARRIED,DIGITAL,AGGRESSIVE,NO_DATA,SINGLE,NATIONAL CAPITAL REGION,LOW,N,N,N,N,Y,Y,N,N,N,VANS 756,41075.0,2753.0,2.0,Drug Stores And Pharmacies
2,4016083633,SELFEMPLOYED,3.0,BOOMERS,FEMALE,MARRIED,DIGITAL,NO_DATA,NO_DATA,DIVERSE,NATIONAL CAPITAL REGION,MID,N,N,N,N,Y,Y,N,N,N,ZHIGUANKEJI,0.0,14504.0,2.0,Grocery Stores And Supermarkets
3,4016083633,EMPLOYED,3.0,BOOMERS,FEMALE,MARRIED,DIGITAL,NO_DATA,NO_DATA,MIGRATORY,NATIONAL CAPITAL REGION,NO_DATA,N,N,N,N,Y,Y,N,Y,N,ST. LUKE'S MEDICAL CENTER,0.0,5338.0,2.0,Education
4,4016083633,SELFEMPLOYED,3.0,GEN_X,FEMALE,MARRIED,DIGITAL,NO_DATA,HIGH,DIVERSE,NATIONAL CAPITAL REGION,NO_DATA,N,N,N,N,N,N,N,N,N,NETFLIX,12695.0,0.0,2.0,No Industry Label


In [73]:
print(dna['RISK_APPETITE'].unique())
print(dna['GEODIVERSITY'].unique())
print(dna['DIGITAL_FLAG'].unique())
print(dna['HOSPITAL_PAYOR'].unique())
print(dna['ENVIRONMENTAL_AFF'].unique())

['AGGRESSIVE' 'NO_DATA' 'MODERATELY_CONSERVATIVE' 'MODERATELY_AGGRESSIVE'
 'CONSERVATIVE']
['DIVERSE' 'SINGLE' 'MIGRATORY' 'NO_DATA']
['TRADITIONAL' 'DIGITAL' 'NO_DATA']
['NO_DATA' 'LOW' 'MID' 'HIGH']
['N' 'Y']


#### Data Pre-Processing

In [76]:
risk_mapping = {
    'NO_DATA': -1,  # Assign -1 to indicate missing data
    'CONSERVATIVE': 1,
    'MODERATELY_CONSERVATIVE': 2,
    'MODERATELY_AGGRESSIVE': 3,
    'AGGRESSIVE': 4
}

dna['RISK_APPETITE'] = dna['RISK_APPETITE'].map(risk_mapping)

In [78]:
hospital_payor_mapping = {
    'NO_DATA': -1,  # Assign -1 to indicate missing data
    'LOW': 1,
    'MID': 2,
    'HIGH': 3
}

dna['HOSPITAL_PAYOR'] = dna['HOSPITAL_PAYOR'].map(hospital_payor_mapping)

In [80]:
geodiversity_mapping = {
    'NO_DATA': -1,
    'SINGLE': 1,
    'MIGRATORY': 2,
    'DIVERSE': 3
}

dna['GEODIVERSITY'] = dna['GEODIVERSITY'].map(geodiversity_mapping)

In [82]:
digital_flag_mapping = {
    'NO_DATA': -1,
    'TRADITIONAL': 0,
    'DIGITAL': 1
}

dna['DIGITAL_FLAG'] = dna['DIGITAL_FLAG'].map(digital_flag_mapping)

In [84]:
binary_cols = ['ENVIRONMENTAL_AFF', 'HUMANITARIAN_AFF', 'RELIGIOUS_AFF', 'FILCHI_CLUB', 'OF_CLUB', 
               'RETIREES_CLUB', 'MILLENNIAL_CLUB', 'EXECUTIVES_CLUB', 'PROFESSIONAL_CLUB']

for col in binary_cols:
    dna[col] = dna[col].map({'Y': 1, 'N': 0})

In [86]:
dna = pd.get_dummies(dna, columns=['GENERATION', 'GENDER'], drop_first=True)

In [88]:
dna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85142 entries, 0 to 85141
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   MASKED_ID_NUM         85142 non-null  int64  
 1   IDV_OCP_TYP_ID        85142 non-null  object 
 2   NBR_DPND              85142 non-null  float64
 3   MARITAL_STATUS        85142 non-null  object 
 4   DIGITAL_FLAG          85142 non-null  int64  
 5   RISK_APPETITE         85142 non-null  int64  
 6   TRAVELLER             85142 non-null  object 
 7   GEODIVERSITY          85142 non-null  int64  
 8   REGION                85142 non-null  object 
 9   HOSPITAL_PAYOR        85142 non-null  int64  
 10  ENVIRONMENTAL_AFF     85142 non-null  int64  
 11  HUMANITARIAN_AFF      85142 non-null  int64  
 12  RELIGIOUS_AFF         85142 non-null  int64  
 13  FILCHI_CLUB           85142 non-null  int64  
 14  OF_CLUB               85142 non-null  int64  
 15  RETIREES_CLUB      

#### Future High Spending Score

In [91]:
# Feature Engineering
dna['SPEND_PER_TXN'] = dna['TXN_AMT_TOT'] / (dna['TXN_CNT'] + 1)  # Avoid division by zero
dna['RECENT_SPEND_GROWTH'] = (
    (dna['TXN_AMT_TOT'] - dna['TXN_AMT_TOT'].rolling(window=3, min_periods=1).mean())
    / dna['TXN_AMT_TOT'].rolling(window=3, min_periods=1).mean()
).fillna(0)
dna['HIGH_SPENDER'] = (dna['TXN_AMT_TOT'] > dna['TXN_AMT_TOT'].quantile(0.75)).astype(int)  # Top 25% spenders

In [93]:
# Define Features & Target
features = ['TXN_AMT_TOT', 'TXN_CNT', 'SPEND_PER_TXN', 'RECENT_SPEND_GROWTH', 'RISK_APPETITE']
X = dna[features]
y = dna['HIGH_SPENDER']  # Target variable

# Define Pipeline
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Cross-Validation Setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-Validation Scores
scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1_weighted')  
print(f"Cross-Validation F1 Scores: {scores}")
print(f"Mean F1 Score: {scores.mean():.4f}")

# Final Model Training & Evaluation
pipeline.fit(X, y)
y_pred = pipeline.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Cross-Validation F1 Scores: [1.         1.         1.         0.99994128 1.        ]
Mean F1 Score: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12716
           1       1.00      1.00      1.00      4313

    accuracy                           1.00     17029
   macro avg       1.00      1.00      1.00     17029
weighted avg       1.00      1.00      1.00     17029



In [95]:
# Logistic Regression Pipeline
logreg_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, random_state=42)
)

# XGBoost Pipeline
xgb_pipeline = make_pipeline(
    StandardScaler(),
    XGBClassifier(n_estimators=100, max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss')
)

# Cross-Validation Scores
logreg_scores = cross_val_score(logreg_pipeline, X, y, cv=cv, scoring='f1_weighted')
xgb_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring='f1_weighted')

print(f"Logistic Regression Mean F1 Score: {logreg_scores.mean():.4f}")
print(f"XGBoost Mean F1 Score: {xgb_scores.mean():.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Logistic Regression Mean F1 Score: 0.9845
XGBoost Mean F1 Score: 0.9997


In [97]:
# Train & Evaluate Final Models
logreg_pipeline.fit(X, y)
xgb_pipeline.fit(X, y)

y_pred_logreg = logreg_pipeline.predict(X)
y_pred_xgb = xgb_pipeline.predict(X)

print("\nLogistic Regression Classification Report:")
print(classification_report(y, y_pred_logreg))

print("\nXGBoost Classification Report:")
print(classification_report(y, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.




Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     63856
           1       0.99      0.96      0.97     21286

    accuracy                           0.99     85142
   macro avg       0.99      0.98      0.98     85142
weighted avg       0.99      0.99      0.99     85142


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     63856
           1       1.00      1.00      1.00     21286

    accuracy                           1.00     85142
   macro avg       1.00      1.00      1.00     85142
weighted avg       1.00      1.00      1.00     85142



In [99]:
# Assign Future High Spending Score to dna
dna['FUTURE_HIGH_SPENDING_SCORE'] = model.predict(scaler.transform(X))  # Predict for all customers

# Save results
dna.to_csv("future_high_spending_predictions.csv", index=False)
print("Saved results to future_high_spending_predictions.csv")

Saved results to future_high_spending_predictions.csv
