In [94]:
import pandas as pd
import numpy as np

In [181]:
# 汇总所有样本的相关性计算结果
# 返回所有样本与feature0相关性最高的k个指标
def get_corr_top(cause_nums, top_k=5):
    # cause_nums：根因为n的样本列表
    # all_top_lists：[{'456': ['feature2',...,]}, ..., ]
    all_top_lists = []
    for num in cause_nums:
        file_name = str(num) + '.csv'
        file_path = './train/' + file_name
        one_top = get_top(file_path, top_k)
        all_top_lists.append({str(num): one_top})
    return all_top_lists
    
# 计算指标间的相关性
# 返回与feature0相关性最高的k个指标
def get_top(file, top_k):
    # file：文件路径
    # top_list：k个相关性最高的指标列表
    df = pd.read_csv(file)
    columns = df.columns.to_list()
    zero_columns = df.loc[:, (df==0).any()].columns.tolist()
    for zero_column in zero_columns:
        columns.remove(zero_column)
    corr_result = {}
    for column in columns:
        if column=='Date & Time' or column=='feature0':
            continue
        two_features = df[['feature0', column]].dropna()
        if two_features.shape[0] > 10:
            corr = two_features['feature0'].corr(two_features[column])
            corr_result.update({column: abs(corr)})
    return sorted(corr_result.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)[:top_k]

# 计算指标间的相关性
# 返回两个指标间的相关性
def corr_of_features(file):
    # file：文件路径
    # corr：
    df = pd.read_csv(file)
    columns = df.columns.to_list()
    zero_columns = df.loc[:, (df==0).any()].columns.tolist()
    for zero_column in zero_columns:
        columns.remove(zero_column)
    corr_result = {}
    corr = df['feature13'].corr(df['feature15'])
    return corr

# 计算get_corr_top返回结果的准确性
def get_precision(dict_res):
    both = 0
    only_13 = 0
    only_15 = 0
    for value in dict_res:
        value = [column[0] for column in list(value.values())[0]]
        if (('feature13' in value) and ('feature15' in value)):
            both = both + 1
        if (('feature13' in value) and ('feature15' not in value)):
            only_13 = only_13 + 1
        if (('feature13' not in value) and ('feature15' in value)):
            only_15 = only_15 + 1
    print('both: ', both, ' only_13: ', only_13, ' only_15: ', only_15)

In [170]:
cause1_nums = np.load('cause1_nums.npy')
cause1_nums = cause1_nums.tolist()

In [171]:
result_corr = get_corr_top(cause1_nums, top_k=5)

In [172]:
result_corr

[{'914': []},
 {'915': [('feature19', 0.3561213114284646),
   ('feature15', 0.3227677382757971),
   ('feature13', 0.2899404056925558),
   ('feature18', 0.1825063005412553),
   ('feature17', 0.14877168912473943)]},
 {'916': [('feature2', 0.8148756075282342),
   ('feature17', 0.8057762687004719),
   ('feature1', 0.7666901563004882),
   ('feature36_1', 0.556035100975663),
   ('feature69_4', 0.43588591150690914)]},
 {'917': [('feature13', 0.3402566002186528),
   ('feature19', 0.3080872782354349),
   ('feature15', 0.1776912038666281),
   ('feature14', 0.16411690366332068),
   ('feature61_1', 0.14665153186635804)]},
 {'918': [('feature28_6', 0.5301776096293961),
   ('feature61_6', 0.47399047156306134),
   ('feature60', 0.4496536896719257),
   ('feature69_0', 0.4408220646229884),
   ('feature28_5', 0.41116705836908757)]},
 {'919': [('feature13', 0.37241747757692356),
   ('feature14', 0.34902855396774085),
   ('feature3_4', 0.20575401215455674),
   ('feature17', nan),
   ('feature60', 0.338873

In [164]:
get_precision(result_corr)

both:  9  only_13:  4  only_15:  0


In [182]:
all_lists = []
for num in cause1_nums:
    file_name = str(num) + '.csv'
    file_path = './train/' + file_name
    one_top = corr_of_features(file_path)
    all_lists.append({str(num): one_top})

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


In [183]:
all_lists

[{'914': nan},
 {'915': 0.7405905956080938},
 {'916': 0.8610613499836443},
 {'917': 0.29847961503689807},
 {'918': 0.8645138613200151},
 {'919': 0.8636307387288341},
 {'920': 0.5998816040719429},
 {'921': 0.9960840224210954},
 {'922': 0.9530381193021555},
 {'924': 0.9841346734006807},
 {'925': 0.6728633328339401},
 {'926': 0.6728633328339401},
 {'927': 0.9904948132934203},
 {'928': 0.9224096409564726},
 {'929': 0.223553099795095},
 {'991': 1.0},
 {'993': 0.9866756404152663},
 {'1007': 0.9737282819017531},
 {'1021': nan},
 {'1034': 0.9999999999999999},
 {'1037': 0.775275738504124},
 {'1142': 0.5715962140066325},
 {'1171': nan},
 {'1250': 0.8198977864753487},
 {'1251': 0.7976489898354882},
 {'1252': 0.9197811243235949},
 {'1255': 0.5955383852316396},
 {'1257': 0.6552427193938293},
 {'1258': 0.9705957506421444},
 {'1380': 0.9838095536013971},
 {'1414': 1.0},
 {'1490': 0.664635907935292},
 {'1540': 0.9999999999999999},
 {'1547': -0.9999999999999999},
 {'1573': -0.5113765003036542},
 {'1781

# 测试

In [177]:
file_name = str(916) + '.csv'
file_path = './train/' + file_name
example = pd.read_csv(file_path)

In [178]:
example[['feature0', 'feature13', 'feature15']].dropna()

Unnamed: 0,feature0,feature13,feature15
8,0.23,0.0,0.0
9,7.41,24068.0,171.0
10,8.57,9696.0,104.0
11,11.03,12096.0,141.0
12,13.74,12528.0,152.0
13,18.0,17366.0,165.0
14,24.07,18951.0,187.0
15,29.37,24362.0,240.0
16,37.21,25296.0,229.0
17,45.71,28816.0,250.0


In [180]:
get_top(file_path, 15)

[('feature2', 0.8148756075282342),
 ('feature17', 0.8057762687004719),
 ('feature1', 0.7666901563004882),
 ('feature36_1', 0.556035100975663),
 ('feature69_4', 0.43588591150690914),
 ('feature61_1', 0.43095555804183405),
 ('feature16', 0.4024035892843376),
 ('feature69_5', 0.36998680491307756),
 ('feature36_5', 0.3662180159923495),
 ('feature61_4', 0.356433593052423),
 ('feature69_7', 0.35385345268992824),
 ('feature61_7', 0.3527266209560507),
 ('feature36_3', 0.35194451070393395),
 ('feature28_7', 0.34620097034099695),
 ('feature69_1', 0.32659117923235537)]

In [92]:
example[['feature0', 'feature1']].dropna()

Unnamed: 0,feature0,feature1
0,201.81,16.85
1,210.97,18.38
2,214.88,17.63
3,222.05,18.96
4,225.42,21.96
...,...,...
387,297.62,20.79
388,321.14,17.34
389,308.18,15.78
390,291.63,17.33


In [89]:
result = get_corr_top(cause1_nums)
get_precision(result)

both:  21  only_13:  8  only_15:  1


In [121]:
df = pd.read_csv(file_path)
columns = df.columns.to_list()
# for column in columns:
#     if column=='Date & Time' or column=='feature0':
#         continue
#     two_features = df[df['feature0', column]]
#     break
two_features = df[['feature0', 'feature3_2']].dropna()
two_features

Unnamed: 0,feature0,feature3_2
0,201.81,0.0
1,210.97,0.0
2,214.88,0.0
3,222.05,0.0
4,225.42,0.0
...,...,...
387,297.62,0.0
388,321.14,0.0
389,308.18,0.0
390,291.63,0.0


In [131]:
two_features.loc[:, (two_features==0).any()].columns.tolist()

['feature3_2']

In [140]:
result_dict = get_top(file_path)
result_dict

[('feature13', 0.3402566002186528),
 ('feature19', 0.3080872782354349),
 ('feature15', 0.1776912038666281),
 ('feature14', 0.16411690366332068),
 ('feature61_1', 0.14665153186635804)]

In [133]:
[34, 5, 6, 55].remove(34, 5)

TypeError: remove() takes exactly one argument (2 given)