ライブラリのインポート．

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm

データの読み込み．ここでは，欠損値のない完全な（complete な）データがすでに用意されているものとする．

In [None]:
# Read the workbook into a DataFrame, using the first spreadsheet column as
# the row index, then show the table in the notebook.
X = pd.read_excel(
    "data.xlsx",
    index_col=0,
)
display(X)

# 連続変数

In [None]:
# Continuous variables come in two groups; each group gets its own figure
# with one histogram panel per variable.
factor1_list = ["x", "y", "z"]
factor2_list = ["u", "v"]

for group in (factor1_list, factor2_list):
    n_panels = len(group)
    plt.figure(figsize=(4 * n_panels, 4))
    for panel, factor in enumerate(group, start=1):
        plt.subplot(1, n_panels, panel)
        plt.hist(X[factor], bins=20, label=factor)
        plt.xlabel(factor, fontsize=18)
        plt.ylabel("frequency", fontsize=18)
    plt.tight_layout()

# ダミー変数

In [None]:
# List describing the ordering of the levels; the data may contain only a
# subset of these values.
dummy1_full_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "AB"]
df_dummy1 = pd.DataFrame(index=dummy1_full_list, columns=["number", "ratio"])
total_rows = len(X)
for dummy1 in dummy1_full_list:
    n_hits = sum(X["dummy1"] == dummy1)
    df_dummy1.at[dummy1, "number"] = n_hits
    df_dummy1.at[dummy1, "ratio"] = n_hits / total_rows

# Keep only the levels that actually occur in the data.
has_observations = df_dummy1["number"] != 0
df_dummy1 = df_dummy1[has_observations]

# The bars are drawn from a reversed copy of the table; the table itself is
# displayed in the listed order.
bars = df_dummy1[::-1]
position = np.arange(len(bars))
plt.barh(position, bars["number"])
plt.xlabel("frequency", fontsize=18)
plt.yticks(position, bars.index)

display(df_dummy1)

# 連続変数 vs 連続変数

In [None]:
# factor1 vs factor2: the variables in factor1_list form the rows of the
# panel grid and those in factor2_list form the columns.
grid_rows = len(factor1_list)
grid_cols = len(factor2_list)
plt.figure(figsize=(4 * grid_cols, 4 * grid_rows))
panel = 0  # subplot numbers run row by row, starting at 1
for factor1 in factor1_list:
    for factor2 in factor2_list:
        panel += 1
        plt.subplot(grid_rows, grid_cols, panel)
        plt.scatter(X[factor2], X[factor1], alpha=0.4)
        plt.xlabel(factor2, fontsize=18)
        plt.ylabel(factor1, fontsize=18)
        plt.title(factor2 + " vs " + factor1, fontsize=24)
plt.tight_layout()


# ダミー変数 vs ダミー変数

In [None]:
dummy1_full_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "AB"]
dummy2_full_list = ["a", "b", "c", "d", "e"]

# Count the rows of X for every (dummy1 level, dummy2 level) combination.
df_dummy1_dummy2 = pd.DataFrame(index=dummy1_full_list, columns=dummy2_full_list)
for dummy1 in dummy1_full_list:
    in_level1 = X["dummy1"] == dummy1
    for dummy2 in dummy2_full_list:
        df_dummy1_dummy2.at[dummy1, dummy2] = sum(in_level1 & (X["dummy2"] == dummy2))

# Remove the columns that contain no data.
empty_columns = [
    column
    for column in df_dummy1_dummy2.columns
    if (df_dummy1_dummy2[column] == 0).all()
]
df_dummy1_dummy2.drop(empty_columns, axis=1, inplace=True)

# Remove the rows that contain no data.
empty_rows = [
    idx
    for idx in df_dummy1_dummy2.index
    if (df_dummy1_dummy2.loc[idx] == 0).all()
]
df_dummy1_dummy2.drop(empty_rows, axis=0, inplace=True)

# Size the figure by the number of remaining cells, draw the annotated heat
# map, and write it to disk.
heat_width = 2 * len(df_dummy1_dummy2.columns)
heat_height = len(df_dummy1_dummy2.index)
plt.figure(figsize=(heat_width, heat_height))
sns.heatmap(df_dummy1_dummy2.astype(int), annot=True, fmt="g", cmap="Blues")
plt.savefig("heat_map.jpg")

# 連続変数 vs ダミー変数

In [None]:
factor_list = ["x", "y", "z"]

dummy_full_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "AB"]
# Index the summary table by this cell's own level list (not by a list left
# over from an earlier cell), so the cell does not depend on execution order.
df_dummy = pd.DataFrame(index=dummy_full_list, columns=["number", "ratio"])
for dummy in dummy_full_list:
    n_hits = sum(X["dummy1"] == dummy)
    df_dummy.at[dummy, "number"] = n_hits
    df_dummy.at[dummy, "ratio"] = n_hits / len(X)
# Keep only the levels that actually occur in the data.
df_dummy = df_dummy[df_dummy["number"] != 0]

# Reverse the level order while plotting; it is restored afterwards.
df_dummy = df_dummy[::-1]
plt.figure(figsize=(4 * len(factor_list), len(df_dummy)))
for idx_factor, factor in enumerate(factor_list):
    plt.subplot(1, len(factor_list), idx_factor + 1)
    # One sample per level: the values of `factor` for the rows in that level.
    x_dummy = [X[X["dummy1"] == level][factor] for level in df_dummy.index]
    plt.boxplot(x_dummy, vert=False, labels=df_dummy.index)
    plt.xlabel(factor, fontsize=14)
    plt.ylabel("dummy1", fontsize=14)
df_dummy = df_dummy[::-1]

# ここまでの結果を関数にする

## 連続変数

In [None]:
def get_hist(X, factor_list, bins=20):
    """Draw one histogram panel per factor in a single-row figure.

    Args:
        X: Table (indexed by column name) holding the data.
        factor_list: Names of the columns of ``X`` to plot.
        bins: Passed to ``plt.hist`` as ``bins``; defaults to 20.

    Returns:
        The value returned by ``plt.hist`` for the LAST factor only, or
        ``None`` when ``factor_list`` is empty (nothing is drawn then).
    """
    # Without this guard an empty list would reach ``return hist`` with the
    # name never assigned.
    if len(factor_list) == 0:
        return None

    plt.figure(figsize=(4 * len(factor_list), 4))
    for idx, factor in enumerate(factor_list):
        plt.subplot(1, len(factor_list), idx + 1)
        hist = plt.hist(X[factor], bins=bins, label=factor)
        plt.xlabel(factor, fontsize=18)
        plt.ylabel("frequency", fontsize=18)
    plt.tight_layout()
    return hist

# Histograms for the first group of continuous variables.
factor1_list = ["x", "y", "z"]
hist_factor1 = get_hist(X, factor_list=factor1_list)


## ダミー変数

In [None]:
def get_bar(X, dummy, dummy_full_list=None):
    """Count the levels of a categorical column and draw them as horizontal bars.

    Args:
        X: Table holding the data.
        dummy: Name of the categorical column of ``X`` to summarise.
        dummy_full_list: Levels in the desired order. When ``None`` or empty,
            the distinct non-missing values of ``X[dummy]`` are used in order
            of first appearance.

    Returns:
        DataFrame indexed by level (in ``dummy_full_list`` order) with the
        columns ``"number"`` (row count) and ``"ratio"`` (count divided by
        ``len(X)``). Levels with a count of zero are omitted.
    """
    # ``None`` replaces the former mutable ``[]`` default; an explicitly
    # passed empty list still triggers the fallback.
    if dummy_full_list is None or len(dummy_full_list) == 0:
        dummy_full_list = X[dummy].dropna().unique()

    # One vectorised pass instead of a Python-level sum per level; levels
    # absent from the data get a count of 0.
    counts = X[dummy].value_counts().reindex(dummy_full_list, fill_value=0)
    df_dummy = pd.DataFrame({"number": counts, "ratio": counts / len(X)})
    df_dummy.index.name = None
    df_dummy = df_dummy[df_dummy["number"] != 0]  # drop levels with no rows

    # Bar chart: drawn from a reversed copy, while the returned table keeps
    # the listed order.
    bars = df_dummy.iloc[::-1]
    position = np.arange(len(bars))
    plt.barh(position, bars["number"])
    plt.xlabel("frequency", fontsize=18)
    plt.yticks(position, bars.index)

    return df_dummy

# Bar chart and count table for "dummy1", with its levels in the listed order.
dummy1_full_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "AB"]
df_dummy1 = get_bar(X, dummy="dummy1", dummy_full_list=dummy1_full_list)

## 連続変数 vs 連続変数

In [None]:
def get_scatter(X, factor1_list, factor2_list):
    """Draw a grid of scatter plots for every factor1/factor2 pair.

    The factors in ``factor1_list`` form the rows of the grid (y axes) and
    those in ``factor2_list`` form the columns (x axes).

    Returns:
        Dict mapping ``"<factor2> vs <factor1>"`` to the two-column slice
        ``X[[factor2, factor1]]`` that was plotted in that panel.
    """
    n_rows = len(factor1_list)
    n_cols = len(factor2_list)
    plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    pair_tables = {}
    panel = 0  # subplot numbers run row by row, starting at 1
    for row_factor in factor1_list:
        for col_factor in factor2_list:
            panel += 1
            pair_name = col_factor + " vs " + row_factor
            pair_tables[pair_name] = X[[col_factor, row_factor]]
            plt.subplot(n_rows, n_cols, panel)
            plt.scatter(X[col_factor], X[row_factor], alpha=0.4)
            plt.xlabel(col_factor, fontsize=18)
            plt.ylabel(row_factor, fontsize=18)
            plt.title(pair_name, fontsize=24)
    plt.tight_layout()
    return pair_tables

# Scatter grid of the first group (rows) against the second group (columns).
factor1_list = ["x", "y", "z"]
factor2_list = ["u", "v"]
df_factor1_factor2 = get_scatter(
    X,
    factor1_list=factor1_list,
    factor2_list=factor2_list,
)
# Bare expression so the notebook shows the returned dict as the cell output.
df_factor1_factor2

## ダミー変数 vs ダミー変数

In [None]:
def get_heat_map(X, dummy1, dummy2, dummy1_full_list=None, dummy2_full_list=None,
                 save_path="heat_map.jpg"):
    """Cross-tabulate two categorical columns and draw the counts as a heat map.

    Args:
        X: Table holding the data.
        dummy1: Name of the column whose levels form the rows.
        dummy2: Name of the column whose levels form the columns.
        dummy1_full_list: Row levels in the desired order. When ``None`` or
            empty, the distinct non-missing values of ``X[dummy1]`` are used.
        dummy2_full_list: Same as ``dummy1_full_list`` but for ``dummy2``.
        save_path: File the figure is written to; ``None`` skips saving.

    Returns:
        Integer count table indexed by the ``dummy1`` levels with the
        ``dummy2`` levels as columns. Rows and columns that are entirely zero
        are removed. When nothing remains, the empty table is returned and no
        figure is drawn or saved.
    """
    if dummy1_full_list is None or len(dummy1_full_list) == 0:
        dummy1_full_list = X[dummy1].dropna().unique()
    if dummy2_full_list is None or len(dummy2_full_list) == 0:
        dummy2_full_list = X[dummy2].dropna().unique()

    # Count every combination in one pass, then lay the counts out in the
    # requested level order; combinations absent from the data become 0.
    df_dummy1_dummy2 = pd.crosstab(X[dummy1], X[dummy2]).reindex(
        index=dummy1_full_list, columns=dummy2_full_list, fill_value=0
    )
    # crosstab labels the axes with the column names; clear them so the
    # returned table carries no axis names.
    df_dummy1_dummy2.index.name = None
    df_dummy1_dummy2.columns.name = None

    # Drop the rows and columns that hold no data. Positional boolean masks
    # are used so no label alignment is involved.
    has_data = df_dummy1_dummy2 != 0
    df_dummy1_dummy2 = df_dummy1_dummy2.loc[
        has_data.any(axis=1).values, has_data.any(axis=0).values
    ]

    # Nothing to draw: a zero-sized figure / empty heat map is not meaningful.
    if df_dummy1_dummy2.empty:
        return df_dummy1_dummy2

    plt.figure(figsize=(len(df_dummy1_dummy2.columns), 0.75 * len(df_dummy1_dummy2.index)))
    sns.heatmap(df_dummy1_dummy2.astype(int), annot=True, fmt="g", cmap="Blues")
    if save_path is not None:
        plt.savefig(save_path)

    return df_dummy1_dummy2

# Heat map of "dummy1" (rows) against "dummy2" (columns), followed by the
# underlying count table.
dummy1_full_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "AB"]
dummy2_full_list = ["a", "b", "c", "d", "e"]
df_dummy1_dummy2 = get_heat_map(
    X,
    "dummy1",
    "dummy2",
    dummy1_full_list=dummy1_full_list,
    dummy2_full_list=dummy2_full_list,
)
display(df_dummy1_dummy2)

## 連続変数 vs ダミー変数

In [None]:
def get_boxplot(X, factor_list, dummy, dummy_full_list=None):
    """Draw horizontal box plots of continuous factors grouped by a categorical column.

    One panel is drawn per factor; each panel holds one box per level of
    ``dummy`` that occurs in ``X``.

    Args:
        X: Table holding the data.
        factor_list: Names of the continuous columns to plot.
        dummy: Name of the categorical column used for grouping.
        dummy_full_list: Levels in the desired order. When ``None`` or empty,
            the distinct non-missing values of ``X[dummy]`` are used.

    Returns:
        DataFrame with the columns ``[dummy] + factor_list`` containing the
        rows of ``X`` that belong to a plotted level, grouped by level in
        ``dummy_full_list`` order and keeping their original index labels.
        When no level has any rows, an empty table with those columns is
        returned and nothing is drawn.
    """
    if dummy_full_list is None or len(dummy_full_list) == 0:
        dummy_full_list = X[dummy].dropna().unique()

    # Count rows per level using the caller's level list (not a global list),
    # and keep only the levels that actually occur.
    counts = X[dummy].value_counts().reindex(dummy_full_list, fill_value=0)
    levels = list(counts.index[counts.values != 0])

    columns = [dummy] + list(factor_list)
    if not levels:
        return X[columns].iloc[0:0]

    # Select each level's rows once; both the plots and the returned table
    # are built from these slices.
    rows_by_level = {level: X.loc[X[dummy] == level, columns] for level in levels}

    # The boxes are drawn in reversed level order.
    plot_levels = levels[::-1]
    plt.figure(figsize=(4 * len(factor_list), len(levels)))
    for idx_factor, factor in enumerate(factor_list):
        plt.subplot(1, len(factor_list), idx_factor + 1)
        samples = [rows_by_level[level][factor] for level in plot_levels]
        plt.boxplot(samples, vert=False, labels=plot_levels)
        plt.xlabel(factor, fontsize=14)
        plt.ylabel(dummy, fontsize=14)

    # Concatenating whole row slices keeps every value attached to its own
    # row, which manual label-range assignment did not guarantee.
    return pd.concat([rows_by_level[level] for level in levels])
    
# Box plots of the continuous variables grouped by "dummy1", followed by the
# table the function returns.
factor_list = ["x", "y", "z"]
dummy_full_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "AB"]

df_factor_dummy = get_boxplot(
    X,
    factor_list,
    dummy="dummy1",
    dummy_full_list=dummy_full_list,
)
display(df_factor_dummy)

In [None]:
# Scratch check: `+` on two lists builds a new list holding the elements of
# the first followed by those of the second.
list1 = ["a"]
list2 = ["b"] * 2
list1 + list2