In [10]:
import pandas as pd
import numpy as np
import pandas as pd

In [3]:
# Toy frame: 2500 rows of random integers drawn from [0, 5) in columns X, Y, Z.
# The draw uses NumPy's global random state and is not seeded, so the values
# differ on every run.
data = pd.DataFrame(
    data=np.random.randint(low=0, high=5, size=(2500, 3)),
    columns=['X', 'Y', 'Z'],
)
# Append the row-wise total of the three columns as a fourth column named 'sum'.
data = data.assign(sum=data.sum(axis=1))
data.head()

Unnamed: 0,X,Y,Z,sum
0,0,1,4,5
1,3,1,0,4
2,4,0,3,7
3,2,2,0,4
4,2,4,2,8


# pcalg
## G2検定
・G2検定はデータを離散化する必要がある  
・データの離散化にはpandasのcutが使える  
・離散化する際のカテゴリ数（ビン数）$k$ は，データ点数 $n$ から「スタージェスの公式」$k = 1 + \log_2 n$ を使って決定できる

In [44]:
# Synthetic CPU-usage readings: `data_size` floats sampled uniformly from [0, 100).
# The draw uses NumPy's global random state and is not seeded, so the values
# differ on every run.
data_size = 100
cpu_usage = np.random.uniform(low=0, high=100, size=data_size)

# Wrap the samples in a single-column frame so later cells can add derived columns.
df = pd.DataFrame(data={"cpu-usage": cpu_usage})
df.head()

Unnamed: 0,cpu-usage
0,62.675118
1,55.678797
2,37.825195
3,54.912529
4,3.699469


In [45]:
# Choose the number of bins with Sturges' formula (1 + log2(n)), then discretise
# the CPU-usage column into that many equal-width bins.
# NOTE(review): int() truncates the result; Sturges' rule is often written with a
# ceiling instead -- confirm which rounding is intended.
n_bins = int(1 + np.log2(data_size))
print("Number of bins: {}".format(n_bins))

# Label the bins 0 .. n_bins-1 so the result is an ordered integer category.
bin_labels = np.arange(n_bins)
discrete_data = pd.cut(df["cpu-usage"], bins=n_bins, labels=bin_labels)
discrete_data

Number of bins: 7


0     4
1     3
2     2
3     3
4     0
     ..
95    1
96    2
97    5
98    2
99    4
Name: cpu-usage, Length: 100, dtype: category
Categories (7, int64): [0 < 1 < 2 < 3 < 4 < 5 < 6]

In [46]:
# Store the discretised series as a new column of `df` (the frame is modified
# in place), then preview the first rows.
df["discrete-cpu-usage"] = discrete_data
df.head()

Unnamed: 0,cpu-usage,discrete-cpu-usage
0,62.675118,4
1,55.678797,3
2,37.825195,2
3,54.912529,3
4,3.699469,0


In [48]:
# pcalgを使ってみる
import pcalg
import networkx as nx
import numpy as np
from gsq.ci_tests import ci_test_dis
from gsq.gsq_testdata import dis_data

# Arrange the bundled discrete sample data as a 10000 x 5 matrix.
dm = np.array(dis_data).reshape(10000, 5)

# Estimate the skeleton graph and the separation sets using the discrete
# conditional-independence test at significance level 0.01.
# NOTE(review): `levels` presumably lists the number of categories of each of
# the five columns -- confirm against the gsq documentation.
g, sep_set = pcalg.estimate_skeleton(
    indep_test_func=ci_test_dis,
    data_matrix=dm,
    alpha=0.01,
    levels=[3, 2, 3, 4, 2],
)
# Turn the skeleton into a CPDAG using the separation sets.
g = pcalg.estimate_cpdag(skel_graph=g, sep_set=sep_set)

# Reference graph the estimate is compared against.
expected_edges = [(0, 2), (1, 2), (1, 3), (4, 3)]
g_answer = nx.DiGraph()
g_answer.add_nodes_from(range(5))
g_answer.add_edges_from(expected_edges)

# Report the estimated edges and whether the graph is isomorphic to the reference.
print('Edges are:', g.edges(), end='')
matches_answer = nx.is_isomorphic(g, g_answer)
print(' => GOOD' if matches_answer else ' => WRONG')
if not matches_answer:
    print('True edges should be:', g_answer.edges())

Edges are: [(0, 2), (1, 2), (1, 3), (4, 3)] => GOOD


## Fisher-Z検定

In [26]:
from citestfz.ci_tests import ci_test_gauss

In [28]:
# Correlation matrix of the columns of `dm` (each column is treated as a variable).
cm = np.corrcoef(dm, rowvar=False)

# Estimate the skeleton graph and the separation sets again, this time with the
# Gaussian (Fisher-Z) conditional-independence test, passing the precomputed
# correlation matrix along.
g, sep_set = pcalg.estimate_skeleton(
    indep_test_func=ci_test_gauss,
    data_matrix=dm,
    alpha=0.01,
    corr_matrix=cm,
)
# Turn the skeleton into a CPDAG using the separation sets.
g = pcalg.estimate_cpdag(skel_graph=g, sep_set=sep_set)

# Reference graph the estimate is compared against.
expected_edges = [(0, 2), (1, 2), (1, 3), (4, 3)]
g_answer = nx.DiGraph()
g_answer.add_nodes_from(range(5))
g_answer.add_edges_from(expected_edges)

# Report the estimated edges and whether the graph is isomorphic to the reference.
print('Edges are:', g.edges(), end='')
matches_answer = nx.is_isomorphic(g, g_answer)
print(' => GOOD' if matches_answer else ' => WRONG')
if not matches_answer:
    print('True edges should be:', g_answer.edges())

Edges are: [(0, 2), (1, 2), (1, 3), (4, 3)] => GOOD
