In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import warnings; warnings.filterwarnings("always"); warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt

# Set up a font that can render Korean text in matplotlib figures.
# The font file below lives in the Windows font directory; on machines where it
# does not exist, keep matplotlib's default font instead of failing on import.
import os
from matplotlib import font_manager, rc
font_path = "C:/Windows/Fonts/malgun.TTF"
if os.path.exists(font_path):
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
else:
    # No Korean font configured: Hangul labels may render as empty boxes.
    font = None

# 데이터 전처리

In [6]:
import pandas as pd
import numpy as np

"""
The functions below take the `merged` data as input
    - about 310,000 clustering rows + cluster label + spending amount ('이용금액')
"""

def preprocess_counts(data):

    """
    Share of rows that fall into each cluster, computed per month.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns '년' (year), '월' (month) and 'cluster'.
        The frame is only read; it is never modified.

    Returns
    -------
    DataFrame
        Index: 'year-month' strings '2019-1' ... '2020-4' in chronological
        order. Columns: cluster labels 1..10 (any other label found in the
        data is kept as an extra column). Each row holds the month's row
        counts divided by the month's total, so it sums to 1.
        A cluster with no rows in a month gets 0; a month with no rows at
        all yields a row of NaN (0/0).
    """

    n_clusters = 10
    months = (['2019-{}'.format(m) for m in range(1, 13)]
              + ['2020-{}'.format(m) for m in range(1, 5)])

    # 'year-month' key, built without adding a column to the caller's frame.
    date_key = data['년'].astype(str) + '-' + data['월'].astype(str)

    # month x cluster count table; label-based, so a cluster that is absent
    # in some month cannot shift the other clusters' columns.
    counts = pd.crosstab(date_key, data['cluster'])

    # Force the fixed month window and the full set of cluster columns,
    # filling combinations that never occur with 0.
    labels = sorted(set(range(1, n_clusters + 1)) | set(counts.columns))
    counts = counts.reindex(index=months, columns=labels, fill_value=0)
    counts.index.name = None
    counts.columns.name = None

    # Row-wise normalisation: counts -> shares.
    return counts.div(counts.sum(axis=1), axis=0)


def preprocess_money_old(data_in):

    """
    Share of the spending amount per cluster, computed per month.

    Older variant of `preprocess_money` with the number of clusters fixed
    at 10.

    Parameters
    ----------
    data_in : DataFrame
        Must contain the columns '년' (year), '월' (month), 'cluster' and
        '이용금액' (spending amount). The frame is only read.

    Returns
    -------
    DataFrame
        Index: 'year-month' strings '2019-1' ... '2020-4' in chronological
        order. Columns: cluster labels 1..10 (any other label found in the
        data is kept as an extra column). Each row holds the month's summed
        amounts divided by the month's total.
        A cluster with no rows in a month gets 0; a month whose total is 0
        (including a month with no rows) yields NaN.
    """

    n_clusters = 10
    months = (['2019-{}'.format(m) for m in range(1, 13)]
              + ['2020-{}'.format(m) for m in range(1, 5)])

    # 'year-month' key, built without adding a column to the caller's frame.
    date_key = data_in['년'].astype(str) + '-' + data_in['월'].astype(str)

    # Sum the amount per (month, cluster) and spread clusters over columns.
    amounts = (
        data_in['이용금액']
        .groupby([date_key, data_in['cluster']])
        .sum()
        .unstack(fill_value=0)
    )

    # Force the fixed month window and the full set of cluster columns,
    # filling combinations that never occur with 0.
    labels = sorted(set(range(1, n_clusters + 1)) | set(amounts.columns))
    amounts = amounts.reindex(index=months, columns=labels, fill_value=0)
    amounts.index.name = None
    amounts.columns.name = None

    # Row-wise normalisation: amounts -> shares.
    return amounts.div(amounts.sum(axis=1), axis=0)

def preprocess_money(data_in, k=10):

    """
    Share of the spending amount per cluster, computed per month.

    Parameters
    ----------
    data_in : DataFrame
        Must contain the columns '년' (year), '월' (month), 'cluster' and
        '이용금액' (spending amount). The frame is only read.
    k : int, default 10
        Number of clusters. The result always has a column for every label
        1..k, even for clusters that never occur in the data.

    Returns
    -------
    DataFrame
        Index: 'year-month' strings '2019-1' ... '2020-4' in chronological
        order. Columns: cluster labels 1..k in ascending order (a label
        outside 1..k that occurs in the data is kept as an extra column).
        Each row holds the month's summed amounts divided by the month's
        total.
        A cluster with no rows in a month gets 0; a month whose total is 0
        (including a month with no rows) yields NaN.
    """

    months = (['2019-{}'.format(m) for m in range(1, 13)]
              + ['2020-{}'.format(m) for m in range(1, 5)])

    # 'year-month' key, built without adding a column to the caller's frame.
    date_key = data_in['년'].astype(str) + '-' + data_in['월'].astype(str)

    # Sum the amount per (month, cluster) and spread clusters over columns.
    # fill_value=0 covers clusters missing from an individual month, which
    # replaces the former row-by-row padding with DataFrame.append.
    amounts = (
        data_in['이용금액']
        .groupby([date_key, data_in['cluster']])
        .sum()
        .unstack(fill_value=0)
    )

    # Add columns for clusters 1..k that never occur at all, and force the
    # fixed month window.
    labels = sorted(set(range(1, k + 1)) | set(amounts.columns))
    amounts = amounts.reindex(index=months, columns=labels, fill_value=0)
    amounts.index.name = None
    amounts.columns.name = None

    # Row-wise normalisation: amounts -> shares.
    return amounts.div(amounts.sum(axis=1), axis=0)


def preprocess_money_abs(data):

    """
    Absolute spending amount per cluster, summed per month (no normalisation).

    Parameters
    ----------
    data : DataFrame
        Must contain the columns '년' (year), '월' (month), 'cluster' and
        '이용금액' (spending amount). The frame is only read.

    Returns
    -------
    DataFrame
        Index: 'year-month' strings '2019-1' ... '2020-4' in chronological
        order. Columns: cluster labels 1..10 (any other label found in the
        data is kept as an extra column). Each cell is the sum of '이용금액'
        for that month and cluster; combinations with no rows are 0.
    """

    n_clusters = 10
    months = (['2019-{}'.format(m) for m in range(1, 13)]
              + ['2020-{}'.format(m) for m in range(1, 5)])

    # 'year-month' key, built without adding a column to the caller's frame.
    date_key = data['년'].astype(str) + '-' + data['월'].astype(str)

    # Sum the amount per (month, cluster) and spread clusters over columns.
    amounts = (
        data['이용금액']
        .groupby([date_key, data['cluster']])
        .sum()
        .unstack(fill_value=0)
    )

    # Force the fixed month window and the full set of cluster columns,
    # filling combinations that never occur with 0.
    labels = sorted(set(range(1, n_clusters + 1)) | set(amounts.columns))
    amounts = amounts.reindex(index=months, columns=labels, fill_value=0)
    amounts.index.name = None
    amounts.columns.name = None

    return amounts


# 월별 비중변화 그래프

In [14]:
from matplotlib import pyplot as plt

def plot_stackedbar_h(data, colors, title, subtitle):

    """
    Draw a horizontal stacked bar chart with one bar per row of `data`.

    Takes the frame returned by one of the preprocess_* functions.

    Example:
        df = preprocess_money(merged, 10)
        plot_stackedbar_h(df, colors, title='All customers \n', subtitle='Share of spending')

    Parameters
    ----------
    data : DataFrame
        One row per bar, one column per stacked segment. The x axis is
        labelled 0%..100%, so rows are expected to hold shares in [0, 1].
    colors : sequence
        Segment colours, indexed by column position; needs at least as many
        entries as `data` has columns.
    title : str
        Figure title, left-aligned.
    subtitle : str
        Text drawn just above the top bar.
    """

    # Reverse the row order before plotting.
    df = data.iloc[::-1]

    fields = df.columns.tolist() # [1, 2, ..., 10]

    # figure and axis
    fig, ax = plt.subplots(1, figsize=(12, 10))

    # plot bars: each column starts where the previous columns ended
    left = np.zeros(len(df))
    for idx, name in enumerate(fields):
        ax.barh(df.index, df[name], left=left, color=colors[idx])
        left = left + df[name].to_numpy()

    # title and subtitle
    ax.set_title(title, loc='left', fontsize=20)
    ax.text(0, ax.get_yticks()[-1] + 0.75, subtitle)

    # legend: one entry per plotted column, named after the column label
    labels = ['cluster_' + str(name) for name in fields]
    ax.legend(labels, bbox_to_anchor=([0.58, 1, 0, 0]), ncol=4, frameon=False)

    # remove spines
    for side in ('right', 'left', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

    # format x ticks as percentages; linspace gives exactly 11 positions,
    # matching the 11 labels
    ax.set_xticks(np.linspace(0, 1, 11))
    ax.set_xticklabels(['{}%'.format(pct) for pct in range(0, 101, 10)])

    # adjust limits and draw grid lines
    ax.set_ylim(-0.5, ax.get_yticks()[-1] + 0.5)
    ax.xaxis.grid(color='gray', linestyle='dashed')

    plt.show()
    

# Ten hex colour codes. plot_stackedbar_h picks colors[idx] for the idx-th
# column of its input, so this list presumably supplies one colour per cluster
# when passed as that function's `colors` argument.
colors = ['#1D2F6F', '#8390FA', '#6EAF46', '#FAC748',
          '#7D0033', '#FFC1CB', '#B66DFF', '#4B4C4E', 
          '#AF916D', '#46ABB0']


In [None]:
def plot_stackedbar_v(data, colors, title):

    """
    Draw a vertical stacked bar chart with one bar per row of `data` and
    save it to './data/clustering_results/ver1/barplot/<title>'.

    Takes the frame returned by one of the preprocess_* functions.

    Parameters
    ----------
    data : DataFrame
        One row per bar (any number of rows), one column per stacked segment.
    colors : sequence
        Segment colours, indexed by column position; needs at least as many
        entries as `data` has columns.
    title : str
        Figure title; also used as the output file name, so it must be a
        valid file name and the target directory must already exist.
    """

    df = data.copy()

    fig, ax = plt.subplots(1, figsize=(16, 8))

    # Running total of the segments already drawn, so each column is stacked
    # on top of the previous ones without re-summing them every iteration.
    bottom = np.zeros(len(df))
    for pos, name in enumerate(df.columns):
        heights = df[name].to_numpy()
        ax.bar(df.index, heights, bottom=bottom, color=colors[pos], width=0.5)
        bottom = bottom + heights

    # remove spines
    for side in ('right', 'left', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

    # grid, drawn behind the bars
    ax.set_axisbelow(True)
    ax.yaxis.grid(color='gray', linestyle='dashed', alpha=0.7)

    # x ticks: one per row, labelled with the row index
    ax.set_xticks(df.index)
    ax.set_xticklabels(df.index)

    # title and legend: one entry per plotted column, named after the column label
    legend_label = ['cluster_' + str(name) for name in df.columns]
    ax.legend(legend_label, ncol=1, bbox_to_anchor=([1, 1.00, 0, 0]), frameon=False)
    ax.set_title(title, loc='center', fontsize=20)

    # The figure is written to disk rather than shown explicitly.
    fig.savefig('./data/clustering_results/ver1/barplot/{0}'.format(title))
        

# Ten hex colour codes, one per cluster column in column order.
# Same values as the `colors` list defined after plot_stackedbar_h; repeated
# here so this cell can be run on its own.
colors = ['#1D2F6F', '#8390FA', '#6EAF46', '#FAC748',
          '#7D0033', '#FFC1CB', '#B66DFF', '#4B4C4E', 
          '#AF916D', '#46ABB0']


# 데이터 필터링 (성별, 연령, 소득, 지역)

In [8]:
def filter_df(df, gender=None, age=None, income=None, region=None):

    """
    Keep only the rows that match every filter that was supplied.

    Each filter is a list-like of accepted values for one column:
    gender -> '성별', age -> '연령대별', income -> '연평균소득추정',
    region -> '지역'. A filter left as None is not applied, and its column
    is not accessed. The input frame is copied first and never modified.

    Returns the filtered DataFrame (a plain copy when no filter is given).
    """

    criteria = (
        ('성별', gender),
        ('연령대별', age),
        ('연평균소득추정', income),
        ('지역', region),
    )

    result = df.copy()
    for column, allowed in criteria:
        if allowed is None:
            continue
        # Narrow the rows kept so far to those whose value is in `allowed`.
        result = result[result[column].isin(allowed)]

    return result

# 데이터프레임 색칠

In [9]:
import seaborn as sns

def color_df(df):
    """
    Display `df` as a table shaded with a light-orange gradient.

    The input is wrapped in a DataFrame and the gradient is applied with
    axis=1. Nothing is returned; the styled table is passed to `display`,
    which is expected to be the notebook's built-in display function.
    """

    palette = sns.light_palette("orange", as_cmap=True)
    styled = pd.DataFrame(df).style.background_gradient(cmap=palette, axis=1)
    display(styled)

# transition matrix 만들기

In [None]:
def transition_matrix(data):

    """
    Estimate the cluster-to-cluster transition probabilities.

    Rows of `data` are grouped by ('가맹점소재지1', '가맹점소재지2', '성별',
    '연령대별', '연평균소득추정', '지역'). Within each group, every pair of
    consecutive rows contributes one transition from the first row's
    'cluster' to the next row's 'cluster'; the last row of a group has no
    successor and contributes nothing.

    NOTE(review): rows are used in the order they appear in `data`; nothing
    here sorts by date, so `data` presumably has to be in chronological
    order already - confirm against the caller.

    Prints the total number of transitions found.

    Returns
    -------
    DataFrame
        Rows are the current state, columns the next state; each row is the
        transition count divided by the row total, so it sums to 1.
    """

    group_keys = ['가맹점소재지1', '가맹점소재지2', '성별', '연령대별', '연평균소득추정', '지역']

    # For every row, the cluster of the following row in the same group
    # (NaN for the last row of a group). This builds all transitions in one
    # pass instead of concatenating one-row frames inside a loop.
    current = data['cluster'].to_numpy()
    following = data.groupby(group_keys)['cluster'].shift(-1).to_numpy()
    has_next = ~pd.isna(following)

    transition_df = pd.DataFrame({
        'state': current[has_next],
        # shift() introduces NaN and so upcasts integers to float; restore
        # the original dtype so the state labels stay e.g. 1, 2, ... not 1.0.
        'next_state': following[has_next].astype(current.dtype),
    })

    # e.g. a group present in all 16 months contributes 15 transitions
    print('total number of transitions: ', len(transition_df))

    cross_tab = pd.crosstab(transition_df['state'], transition_df['next_state'])

    # Row-normalise the counts into probabilities.
    return cross_tab.div(cross_tab.sum(axis=1), axis=0)

In [None]:
from hmmviz import TransGraph

def transition_graph(df):
    """
    Draw `df` as a graph using hmmviz's TransGraph and show the figure.

    `df` is passed straight to TransGraph; it is presumably the matrix
    returned by transition_matrix - confirm against the caller. Nodes and
    edges share one colour mapping keyed by the labels 1..10, and edge labels
    are switched on.
    """

    graph = TransGraph(df)

    # Open a 10x10 inch figure before drawing.
    plt.figure(figsize=(10, 10))

    palette = {
        1: 'darkorange',
        2: 'purple',
        3: 'olive',
        4: 'dimgrey',
        5: 'darkblue',
        6: 'brown',
        7: 'black',
        8: 'red',
        9: 'thistle',
        10: 'salmon',
    }

    graph.draw(
        nodecolors=palette,
        edgecolors=palette,
        edgelabels=True,
        nodefontsize=16,
    )

    plt.show()