<a href="https://colab.research.google.com/github/victorwung/twn/blob/master/twn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data
- 第一個欄位SUBSCR_ID的正負號代表兩種不同的用戶(用戶編碼)，其他的欄位你可以望文生義猜測他的意思，或許你可以對這份資料做：
- 1.EDA 看看資料間的關係，找出一些insight
- 2.因為這裡面每筆資料代表的是每一個不同的人，你可以做個cluster 分群，並說明為什麼這樣分群，以及每群以白話文描述其特質嗎？

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# read data
raw_data = pd.read_csv('twm_ds_test.csv')
# raw_data = pd.read_csv('twn/twm_ds_test.csv')

In [4]:
raw_data.shape

(25000, 42)

## Process

In [5]:
# process
raw_data.columns = raw_data.columns.str.lower()
# data = data.fillna(0)

In [6]:
%%time
for col in raw_data.columns[1:]:
    raw_data[col] = raw_data[col].fillna(raw_data[col].median())
#     print('col {} done'.format(col))
print('Fill N/A done.')

Fill N/A done.
CPU times: user 36.5 ms, sys: 0 ns, total: 36.5 ms
Wall time: 47 ms


In [7]:
# data.info()

In [8]:
%%time
for col in raw_data.columns[1:]:
    q1_val = np.percentile(raw_data[col], 25)
    q4_val = np.percentile(raw_data[col], 75)
#     print('col:{}, q1:{}, q4:{}'.format(col, q1_val, q4_val))
    col_type = col.replace('_score', '')  
#     col_type = col.replace('score', 'type')
    raw_data[col_type] = raw_data.apply(lambda x: 1 if x[col] <= q1_val else (3 if x[col] >= q4_val else 2), axis=1)
#     print('col {} done'.format(col_type))
print('Assign type done.')

Assign type done.
CPU times: user 12.8 s, sys: 62.2 ms, total: 12.8 s
Wall time: 12.9 s


In [9]:
raw_data.head(1)

Unnamed: 0,subscr_id,taxi_driver_score,buddhism_score,pregnant_score,dating_score,wealth_score,married2020_score,gamble_score,kid_above6_score,high_school2021_score,...,kindness,medicalcare,christian,investor,deliverman,health_life,dpp,sex_demand,camper,married2021
0,6279363,0.113409,0.848155,0.281564,0.257194,0.393156,0.290093,0.468732,0.730413,0.062494,...,1,3,3,1,1,2,3,1,3,3


## Extract feature types

In [10]:
keep_cols = ['subscr_id'] + raw_data.columns[42:].tolist()

In [11]:
data = raw_data[keep_cols].copy()

In [12]:
data.shape

(25000, 42)

## Clustering

In [13]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans

In [14]:
# X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# plt.scatter(X[:,0], X[:,1])

In [15]:
# features
X = data.iloc[:, 1:]

In [16]:
# %%time
# # kmeans
# wcss = []
# max_num_cluster = 20
# for i in range(1, max_num_cluster+1, 1):
#     kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
#     kmeans.fit(X)
#     wcss.append(kmeans.inertia_)
# plt.plot(range(1, max_num_cluster+1, 1), wcss)
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('WCSS')
# plt.show()

In [17]:
%%time
# run kmeans
num_final_cluster = 7
kmeans = KMeans(n_clusters=num_final_cluster, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(X)

CPU times: user 4.96 s, sys: 791 ms, total: 5.75 s
Wall time: 2.97 s


In [18]:
data.insert(0, 'cluster_id', '')
data['cluster_id'] = pred_y

In [20]:
data.head()

Unnamed: 0,cluster_id,subscr_id,taxi_driver,buddhism,pregnant,dating,wealth,married2020,gamble,kid_above6,...,kindness,medicalcare,christian,investor,deliverman,health_life,dpp,sex_demand,camper,married2021
0,0,6279363,1,3,1,1,2,1,2,3,...,1,3,3,1,1,2,3,1,3,3
1,1,58206281,3,1,2,3,1,3,1,1,...,1,1,1,2,3,2,1,1,1,3
2,4,7046864,2,3,3,3,3,3,2,3,...,3,2,3,3,1,3,2,3,3,3
3,0,11002637,2,1,1,1,1,1,2,2,...,3,2,1,1,1,1,3,3,2,2
4,0,60820330,1,2,1,2,1,1,1,2,...,1,3,2,1,1,1,2,3,1,1


In [21]:
data.iloc[:, 2:] = data.iloc[:, 2:].astype(str).apply(lambda x : x.name + '_' + x)

In [22]:
data.head()

Unnamed: 0,cluster_id,subscr_id,taxi_driver,buddhism,pregnant,dating,wealth,married2020,gamble,kid_above6,...,kindness,medicalcare,christian,investor,deliverman,health_life,dpp,sex_demand,camper,married2021
0,0,6279363,taxi_driver_1,buddhism_3,pregnant_1,dating_1,wealth_2,married2020_1,gamble_2,kid_above6_3,...,kindness_1,medicalcare_3,christian_3,investor_1,deliverman_1,health_life_2,dpp_3,sex_demand_1,camper_3,married2021_3
1,1,58206281,taxi_driver_3,buddhism_1,pregnant_2,dating_3,wealth_1,married2020_3,gamble_1,kid_above6_1,...,kindness_1,medicalcare_1,christian_1,investor_2,deliverman_3,health_life_2,dpp_1,sex_demand_1,camper_1,married2021_3
2,4,7046864,taxi_driver_2,buddhism_3,pregnant_3,dating_3,wealth_3,married2020_3,gamble_2,kid_above6_3,...,kindness_3,medicalcare_2,christian_3,investor_3,deliverman_1,health_life_3,dpp_2,sex_demand_3,camper_3,married2021_3
3,0,11002637,taxi_driver_2,buddhism_1,pregnant_1,dating_1,wealth_1,married2020_1,gamble_2,kid_above6_2,...,kindness_3,medicalcare_2,christian_1,investor_1,deliverman_1,health_life_1,dpp_3,sex_demand_3,camper_2,married2021_2
4,0,60820330,taxi_driver_1,buddhism_2,pregnant_1,dating_2,wealth_1,married2020_1,gamble_1,kid_above6_2,...,kindness_1,medicalcare_3,christian_2,investor_1,deliverman_1,health_life_1,dpp_2,sex_demand_3,camper_1,married2021_1


## Cluster Distribution

In [90]:
cluster_distribution = data[['cluster_id', 'subscr_id']].groupby('cluster_id')['subscr_id'].count().reset_index().rename({'subscr_id': 'cnt'})

In [36]:
cluster_distribution

Unnamed: 0,cluster_id,subscr_id
0,0,3748
1,1,2764
2,2,2945
3,3,5408
4,4,4007
5,5,3063
6,6,3065


## Apriori

In [38]:
# apriori
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

### select a clutser

In [61]:
cluster_no = 0
one_clutser_data = data[data['cluster_id']==cluster_no].copy().reset_index(drop=True)

In [62]:
# skip cluster_id, subscr_id
one_clutser_data_features = one_clutser_data.iloc[:, 2:].copy()


In [63]:
one_clutser_data_features.shape

(3748, 41)

In [64]:
# list of list
dataset = one_clutser_data_features.values.tolist() 

### run Apriori

In [70]:
### encode and fit
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df_tx = pd.DataFrame(te_ary, columns=te.columns_)

In [72]:
df_tx.shape

(3748, 123)

In [87]:
### setting threshold
threshold_support = 0.7
threshold_confidence = 0.7 
threshold_lift = 1

In [88]:
### frequent_itemsets
frequent_itemsets = apriori(df_tx, min_support=threshold_support, use_colnames=True)

In [89]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.725454,(age_3)
1,0.78842,(college_student2020_1)
2,0.705176,(jolin_1)
3,0.709445,(ktv_1)
4,0.825507,(lgbt_1)
5,0.733991,(married2021_1)
6,0.725454,(newbaby_1)


In [79]:
### association_rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

In [84]:
### lift>1 的結果
# target_rules = rules[(rules['lift'] > threshold_lift)].reset_index(drop=True)
target_rules = rules[(rules['lift'] >= threshold_lift)&(rules['confidence'] >= threshold_confidence)].reset_index(drop=True)


## Function get frequent itemsets

In [97]:
def get_clutser_frequent_itemset(data, cluster_no, threshold_support):
  ### preprocess
  # select one cluster
  one_clutser_data = data[data['cluster_id']==cluster_no].copy().reset_index(drop=True)
  # skip cluster_id, subscr_id
  one_clutser_data_features = one_clutser_data.iloc[:, 2:].copy()
  print('cluster {} shape: {}'.format(cluster_no, one_clutser_data_features.shape))
  # list of list
  dataset = one_clutser_data_features.values.tolist()

  ### encode and fit
  te = TransactionEncoder()
  te_ary = te.fit(dataset).transform(dataset)
  df_tx = pd.DataFrame(te_ary, columns=te.columns_)

  ### frequent_itemsets
  frequent_itemsets = apriori(df_tx, min_support=threshold_support, use_colnames=True)
  print('cluster {} done'.format(cluster_no))

  return frequent_itemsets

In [98]:
### setting threshold
threshold_support = 0.7
# threshold_confidence = 0.7 
# threshold_lift = 1

In [100]:
%%time
# run every cluster
cluster_0_frequent_itemsets = get_clutser_frequent_itemset(data, 0, threshold_support)
cluster_1_frequent_itemsets = get_clutser_frequent_itemset(data, 1, threshold_support)
cluster_2_frequent_itemsets = get_clutser_frequent_itemset(data, 2, threshold_support)
cluster_3_frequent_itemsets = get_clutser_frequent_itemset(data, 3, threshold_support)
cluster_4_frequent_itemsets = get_clutser_frequent_itemset(data, 4, threshold_support)
cluster_5_frequent_itemsets = get_clutser_frequent_itemset(data, 5, threshold_support)
cluster_6_frequent_itemsets = get_clutser_frequent_itemset(data, 6, threshold_support)

cluster 0 shape: (3748, 41)
cluster 0 done
cluster 1 shape: (2764, 41)
cluster 1 done
cluster 2 shape: (2945, 41)
cluster 2 done
cluster 3 shape: (5408, 41)
cluster 3 done
cluster 4 shape: (4007, 41)
cluster 4 done
cluster 5 shape: (3063, 41)
cluster 5 done
cluster 6 shape: (3065, 41)
cluster 6 done
CPU times: user 20.6 s, sys: 281 ms, total: 20.8 s
Wall time: 20.9 s


## Observe Results

In [106]:
cluster_0_frequent_itemsets

Unnamed: 0,support,itemsets
0,0.725454,(age_3)
1,0.78842,(college_student2020_1)
2,0.705176,(jolin_1)
3,0.709445,(ktv_1)
4,0.825507,(lgbt_1)
5,0.733991,(married2021_1)
6,0.725454,(newbaby_1)


In [107]:
cluster_1_frequent_itemsets

Unnamed: 0,support,itemsets
0,0.845514,(age_1)
1,0.722142,(buddhism_1)
2,0.763386,(college_student2020_3)
3,0.788712,(hanfan_1)
4,0.789074,(jolin_3)
5,0.800289,(lgbt_3)
6,0.713459,(married2021_3)


In [109]:
cluster_4_frequent_itemsets

Unnamed: 0,support,itemsets
0,0.746943,(beauty_clinic_3)
1,0.800349,(cat_lover_3)
2,0.757674,(cosmetic_3)
3,0.790616,(dog_lover_3)
4,0.798852,(estate_mortgage_3)
5,0.757674,(health_life_3)
6,0.707512,(investor_3)
7,0.790866,(jolin_3)
8,0.747692,(kid_under6_3)
9,0.716746,(ktv_3)


In [108]:
cluster_5_frequent_itemsets

Unnamed: 0,support,itemsets
0,0.791707,(buddhism_3)
1,0.706497,(christian_3)
2,0.708782,(kid_above6_3)
