In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
import sklearn

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder
from sklearn import metrics
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [1]:
class ModelCLV_Base:
  """Baseline CLV model: a weighted sum of three revenue inputs times a lifetime.

  For each row the prediction is
  (r_balance * alpha + r_loan * beta + r_credit * gamma) * life_time.
  """

  def __init__(self, alpha, beta, gamma):
    """Store the weights applied to the balance, loan and credit inputs."""
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma

  def _revenue_value(self, r_balance, r_loan, r_credit):
    """Return the weighted revenue; works on scalars and on NumPy arrays alike."""
    return r_balance * self.alpha + r_loan * self.beta + r_credit * self.gamma

  def predict(self, data):
    """Compute revenue * life_time for every row of `data`.

    data: 2-D array-like (list of rows, ndarray, DataFrame). Columns are read
      by POSITION: 0 = balance, 1 = loan, 2 = credit, 3 = life_time. Any
      further columns are ignored.

    Returns a list with one value per row (empty list for empty input).
    Raises IndexError if the input is not 2-D or has fewer than four columns.
    """
    rows = np.asarray(data)
    if len(rows) == 0:
      return []

    # Slice before converting so unrelated (possibly non-numeric) extra
    # columns never take part in the float conversion.
    features = rows[:, :4].astype(float)
    revenue = self._revenue_value(features[:, 0], features[:, 1], features[:, 2])
    return list(revenue * features[:, 3])


In [None]:
### segment data
# Load the training table into `data`; the later cells read from and add columns to this frame.
# NOTE(review): the absolute path looks like a Colab Google Drive mount — confirm the drive is
# mounted (or point this at a local copy) before running; otherwise read_csv raises FileNotFoundError.
data = pd.read_csv('/content/drive/MyDrive/workspace/VPBank_Hackathon/src/data_preprocessing/data_train_ver2.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/workspace/VPBank_Hackathon/src/data_preprocessing/data_train_ver2.csv'

In [None]:
# Mean of 'diff_date' over all rows; used below as the floor for 'diff_date_preprocess'.
avg_diff_date_account = data['diff_date'].mean()

In [None]:
# Floor 'diff_date' at the overall mean: values below the mean are raised to it,
# everything else (including missing values) is kept as is. Vectorised with clip
# instead of a per-element lambda.
data['diff_date_preprocess'] = data['diff_date'].clip(lower=avg_diff_date_account)

In [None]:
# Display the frame to check the newly added 'diff_date_preprocess' column.
data

In [None]:
# Replace missing values in the cash-in / cash-out amount and count columns with 0.
for cash_column in ('amount_cash_out', 'amount_cash_in',
                    'trans_cash_out_count', 'trans_cash_in_count'):
  data[cash_column] = data[cash_column].fillna(0)


In [None]:
# List the available column names.
data.columns

In [None]:
# Display the frame again after the cash columns were zero-filled.
data

In [None]:
# Inspect the 'payments' column on its own.
data['payments']

# 2.1 Phân cụm dữ liệu

In [None]:
def plot_histogram(data, title):
  """Draw a seaborn histogram of `data` and show it under `title`.

  data: values passed to `sns.histplot` as its `data` argument.
  title: text placed above the plot.
  """
  axes = sns.histplot(data=data)
  axes.set_title(title)
  plt.show()


In [None]:
# Distribution of the 'age' column.
plot_histogram(data['age'], 'age')

In [None]:
# ver1.0
# columns_cluster = ['age', 'state', 'city', 'district_id', 'sex', 'balance_last', 'diff_date', 'is_has_loan', 'is_has_card']
# ver2.0
#columns_cluster = ['age', 'state', 'city', 'district_id', 'sex', 'is_has_loan', 'is_has_card']

#ver 3.0
# columns_cluster = ['age', 'state', 'city', 'district_id', 'sex', 'balance_last', 'trans_count','amount_credit','is_has_loan', 'is_has_card']

#ver 4.0
# Features fed to the clustering step (ver 4.0).
columns_cluster = [
    'age',
    'balance_last',
    'trans_count',
    'amount_credit',
    'amount_cash_out',
    'amount_cash_in',
    'trans_cash_out_count',
    'trans_cash_in_count',
]

# category_columns = ['city',]

In [None]:
# One histogram per clustering feature, to inspect the raw distributions before scaling.
for column in columns_cluster:
  plot_histogram(data[column], column)

In [None]:
# Columns scaled with StandardScaler (z-score).
z_score_feature = ['balance_last', 'age']

# Columns scaled with MinMaxScaler.
min_max_feature = [
    'trans_count',
    'amount_credit',
    'amount_cash_out',
    'amount_cash_in',
    'trans_cash_out_count',
    'trans_cash_in_count',
]

In [None]:
# Take an explicit copy of the clustering features: the scaled values below are then
# written to an independent frame (no SettingWithCopy ambiguity) and the raw columns
# in `data` stay untouched.
data_cluster = data[columns_cluster].copy()
enc = OrdinalEncoder()
# data_cluster[category_columns] = enc.fit_transform(data_cluster[category_columns])

z_scaler = StandardScaler()
min_max_scaler  = MinMaxScaler()

# Standardise the z-score features; rescale the remaining features with min-max scaling.
data_cluster[z_score_feature] = z_scaler.fit_transform(data_cluster[z_score_feature])
data_cluster[min_max_feature] = min_max_scaler.fit_transform(data_cluster[min_max_feature])

In [None]:
# Pairwise correlation of the scaled clustering features, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 10))
dataplot = sns.heatmap(data_cluster.corr(), cmap="YlGnBu", annot=True, ax=ax)
plt.show()

In [None]:
# Display the scaled clustering features.
data_cluster

In [None]:
# data_cluster['diff_date'].plot(kind='hist', bins=100)

In [None]:
# Elbow search: fit KMeans for k = 1..9 and record two quality measures per k.
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}
K = range(1, 10)

for k in K:
    # Fixed seed so the elbow curve is reproducible from run to run.
    kmeanModel = KMeans(n_clusters=k, algorithm='elkan', random_state=42)
    kmeanModel.fit(data_cluster)

    # Distortion: mean Euclidean distance from each row to its nearest centroid.
    # Computed once and stored in both the list and the per-k mapping.
    nearest_centroid_distance = np.min(
        cdist(data_cluster, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortion = nearest_centroid_distance.sum() / data_cluster.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:

# Distortion against k; the bend in the curve suggests the number of clusters.
fig, ax = plt.subplots()
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method using Distortion')
plt.show()

Chọn k = 3

In [None]:
# Final model with k = 3. The fixed seed keeps the cluster labels stable between runs,
# and selecting `columns_cluster` explicitly keeps this cell re-runnable after later
# cells append extra columns (e.g. 'cluster') to `data_cluster`.
kmeanModel_final = KMeans(n_clusters=3, random_state=42)
kmeanModel_final.fit(data_cluster[columns_cluster])

In [None]:
# Display the fitted estimator.
kmeanModel_final

In [None]:
# Predict on the feature columns only, so the cell still works when re-run after
# 'cluster' / 'diff_date' have been appended to `data_cluster`.
y_kmeans = kmeanModel_final.predict(data_cluster[columns_cluster])

In [None]:
# Cluster label predicted for each row, in row order.
y_kmeans

In [None]:
# Store each row's KMeans label next to its features.
data_cluster['cluster'] = y_kmeans

## Kiểm thử mô hình

In [None]:
# Attach the preprocessed 'diff_date' (floored at the overall mean) under the name 'diff_date'.
# It is added after the model was fitted, so it is not one of the clustering features.
data_cluster['diff_date'] = data['diff_date_preprocess']

In [None]:
from sklearn.decomposition import PCA
# One sub-frame per KMeans label (0, 1, 2).
k_means_label_0, k_means_label_1, k_means_label_2 = (
    data_cluster[data_cluster['cluster'] == cluster_id] for cluster_id in range(3)
)
#k_means_label_3 = data_cluster[data_cluster['cluster'] == 3]


In [None]:
# Correlation heatmap again, now that `data_cluster` also holds 'cluster' and 'diff_date'.
fig, ax = plt.subplots(figsize=(10, 10))
dataplot = sns.heatmap(data_cluster.corr(), cmap="YlGnBu", annot=True, ax=ax)
plt.show()

In [None]:
def plot_histogram_cluster(column):
  """Overlay the per-cluster histograms (with KDE) of `column` in one figure.

  Reads the module-level frames k_means_label_0 / _1 / _2; the legend names
  them "Cluster 1", "Cluster 2" and "Cluster 3" respectively.
  """
  plt.figure(figsize=(10, 5))
  cluster_styles = (
      (k_means_label_0, "skyblue", "Cluster 1"),
      (k_means_label_1, "red", "Cluster 2"),
      (k_means_label_2, "yellow", "Cluster 3"),
  )
  for cluster_frame, color, label in cluster_styles:
    sns.histplot(data=cluster_frame[column], color=color, label=label, kde=True, element="bars")
  plt.legend()
  plt.show()

In [None]:
# Compare the (scaled) 'balance_last' distribution across the three clusters.
plot_histogram_cluster('balance_last')

In [None]:
#plot_histogram_cluster('balance_last')

In [None]:
# Compare the 'diff_date' distribution across the three clusters.
plot_histogram_cluster('diff_date')

In [None]:
#plot_histogram_cluster('state')

In [None]:
# View only the clustering features, without the columns added after clustering.
data_cluster[columns_cluster]

In [None]:
from sklearn.decomposition import PCA
# Project the clustering features onto their first two principal components.
cluster_features = data_cluster[columns_cluster]
pca = PCA(n_components=2).fit(cluster_features)
data_pca = pca.transform(cluster_features)

In [None]:
# Two principal-component coordinates per input row.
data_pca

In [None]:
# Scatter of the two principal components, coloured by cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(data_pca[:, 0], data_pca[:, 1], c=data_cluster['cluster'], cmap='plasma')
ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# Mean 'diff_date' within each cluster.
mean_diff_date_label_0, mean_diff_date_label_1, mean_diff_date_label_2 = (
    cluster_frame['diff_date'].mean()
    for cluster_frame in (k_means_label_0, k_means_label_1, k_means_label_2)
)
#mean_diff_date_label_3 = k_means_label_3['diff_date'].mean()

In [None]:
#mean_diff_date_label_3

In [None]:
def mean_diff_date_label(x):
  """Return the mean 'diff_date' of the cluster that row `x` belongs to.

  Looks up x['cluster'] among the labels 0, 1 and 2; any other value yields None.
  """
  cluster_means = {
      0: mean_diff_date_label_0,
      1: mean_diff_date_label_1,
      2: mean_diff_date_label_2,
  }
  return cluster_means.get(x['cluster'])
#  elif x['cluster'] == 3:
#    return mean_diff_date_label_3

In [None]:
# Give every row the mean 'diff_date' of its cluster. A vectorised map on the label
# column replaces the row-by-row apply; labels outside 0..2 become NaN.
data_cluster['lifetime_value'] = data_cluster['cluster'].map({
    0: mean_diff_date_label_0,
    1: mean_diff_date_label_1,
    2: mean_diff_date_label_2,
})

In [None]:
# Display the frame with the new 'lifetime_value' column.
data_cluster

In [None]:
# Per-cluster 'diff_date' distributions, for comparison with the per-cluster means used above.
plot_histogram_cluster('diff_date')

In [None]:
# First rows of the clustered frame.
data_cluster.head()

In [None]:
# Copy the per-cluster lifetime estimate back onto the main frame as 'ltv' (pandas aligns it on the index).
data['ltv'] = data_cluster['lifetime_value']

In [None]:
# Confirm that 'ltv' is now among the columns.
data.columns

In [None]:
# Model inputs. Column ORDER matters: ModelCLV_Base.predict reads them by position
# (0 = balance, 1 = loan, 2 = credit, 3 = life_time).
data_train = data[['balance_last', 'amount_loan', 'amount_credit', 'ltv']]

In [None]:
# Replace any missing values in the four model inputs with 0.
data_train = data_train.fillna(0)

In [None]:
# Weights for balance (alpha), loan (beta) and credit (gamma). The constructor has no
# `data_train` parameter; the first weight must be passed as `alpha`.
clv_base_model = ModelCLV_Base(alpha=0.2, beta=0.4, gamma=0.2)

In [None]:
# predict() reads columns 0..3 by position, so it must receive the four-column
# `data_train` frame (balance, loan, credit, ltv) rather than the full `data` frame.
y_pred = clv_base_model.predict(data_train)

In [None]:
# Show the predicted values, one per row of the input.
y_pred