In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
sns.set_theme()

### Load Data

In [26]:
# Read the clustered customer table and key it by MSISDN
df = pd.read_parquet('../data/clustered_MSISDN.parquet').set_index('MSISDN')

In [2]:
# Preview the first rows of the clustered table (indexed by MSISDN)
df.head()

In [28]:
# Split out the two clusters used for cross-selling:
#   df_1 -> label 0, the pool that is searched for similar customers
#   df_2 -> label 4, the cluster the target customer is drawn from
df_1 = df.loc[df['label'].eq(0)]
df_2 = df.loc[df['label'].eq(4)]
# The full table is not referenced again below, so remove the name
del df

### Recommend packages based on similar customers

Cross-selling from cluster 0 (recurring customers) to cluster 4 (one-time, short-package, repeat purchasers with a lot of transactions)

Select a customer from the target cluster (cluster 4) to generate recommendations for

In [38]:
# Pick the customer to generate recommendations for, drawn at random from df_2.
# A fixed seed makes the draw repeatable after a kernel restart; change SEED to
# sample a different customer.
SEED = 42
rng = np.random.default_rng(SEED)
MSISDN = rng.choice(df_2.index)
print(MSISDN)

9928335864


Find the most similar numbers in cluster 0

In [39]:
# Find similarity of selected number against every customer in df_1.
# 'label' is the cluster id used to build df_1/df_2, not a customer feature, so it
# is left out of the vectors; including it changes the norm of the selected
# customer's vector and therefore the similarity values.
feature_cols = df_1.columns.drop('label')

# cosine_similarity expects 2-D inputs: one row for the selected number ...
x = df_2.loc[MSISDN, feature_cols].to_numpy().reshape(1, -1)
# ... and one row per df_1 customer
y = df_1[feature_cols].to_numpy()

# Result has shape (1, len(df_1)): one score per df_1 customer, in df_1 row order
MSISDN_sim = cosine_similarity(x, y)

In [40]:
# Keep the n most similar numbers
n = 20

# Row 0 of the similarity matrix holds one score per df_1 customer; label each
# score with its MSISDN, then rank from most to least similar
MSISDN_sim = (
    pd.Series(MSISDN_sim[0], index=df_1.index, name='similarity')
    .to_frame()
    .sort_values('similarity', ascending=False)
    .head(n)
)

Calculate package scores from the top `n` most similar numbers

In [41]:
# Load the unique packages per MSISDN (input for the package weight/score) and
# inner-join the similarity table on MSISDN, so only rows belonging to the
# similar numbers remain, each carrying its similarity
unique_pack_per_MSISDN = (
    pd.read_parquet('../data/trx_unique_pack_per_MSISDN.parquet')
    .merge(MSISDN_sim, on='MSISDN')
)

In [42]:
# Calculate package score
#
# The loaded/merged frame `unique_pack_per_MSISDN` is only read here, never
# overwritten, so this cell can be re-run without reloading the data.

# Package weight = number of rows per (MSISDN, PACKAGE_CODE) pair, laid out as an
# MSISDN x PACKAGE_CODE matrix with 0 where a customer has no row for a package.
# Grouping on both keys keeps every package of a similar customer; grouping on
# MSISDN alone would keep only each customer's first package and credit all of
# that customer's rows to it.
pack_counts = (
    unique_pack_per_MSISDN
    .groupby(['MSISDN', 'PACKAGE_CODE'])
    .size()
    .unstack(fill_value=0)
)

# One similarity per customer (expected to be constant within an MSISDN after the
# merge), aligned to the rows of the count matrix
sim_col = (
    unique_pack_per_MSISDN
    .groupby('MSISDN')['similarity']
    .first()
    .reindex(pack_counts.index)
)

# multiply count with similarity (row-wise: each customer's counts scaled by
# that customer's similarity)
unique_pack_per_MSISDN_pivot = pack_counts.multiply(sim_col, axis=0)

# Package score: similarity-weighted average count per package
package_score = unique_pack_per_MSISDN_pivot.sum() / sim_col.sum()

Get recommended packages

In [3]:
# Recommended packages for the selected number, highest score first
recommended_packages = package_score.sort_values(ascending=False)
recommended_packages