In [1]:
# Directory (relative to this notebook) holding the cause-effect pair data:
# the `info.csv` metadata file and one whitespace-separated `<pair name>.txt`
# file per pair are read from here.
CE_PAIRS_PATH = '../data/interim/cause_effect_pairs'


In [28]:
import numpy as np
import pandas as pd
import ppscore as pps

# Load the pair metadata. Rows are indexed by pair name (e.g. 'pair0001'),
# which is also the stem of the pair's data file under CE_PAIRS_PATH.
ce_pair_df = pd.read_csv(CE_PAIRS_PATH + '/info.csv', sep=',', index_col='name')
ce_pair_df.columns = ['x', 'y', 'source', 'ground truth']
print(ce_pair_df.head(10).to_markdown())
ce_pair_list = list(ce_pair_df.to_records())

# Predictive power score in each direction, one entry per metadata row and in
# the same order, so the lists can be attached to ce_pair_df as columns below.
x2y_pps_list, y2x_pps_list = [], []
for _ce_pair in ce_pair_list:

    # Field 0 of each record is the index value, i.e. the pair name.
    _pair_path = CE_PAIRS_PATH + f'/{_ce_pair[0]}.txt'
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning from Python 3.12 on).
    _data = pd.read_csv(_pair_path, sep=r'\s+', header=None)

    # only examine the single-factor cases
    if _data.shape[1] != 2:
        # Placeholder for skipped pairs. `np.nan` is used because the
        # `np.NaN` alias was removed in NumPy 2.0.
        x2y_pps, y2x_pps = np.nan, np.nan
    else:
        _data.columns = ['x', 'y']
        x2y_pps = pps.score(_data, 'x', 'y')['ppscore']
        y2x_pps = pps.score(_data, 'y', 'x')['ppscore']

    x2y_pps_list.append(x2y_pps)
    y2x_pps_list.append(y2x_pps)


ce_pair_df['x2y pps'] = x2y_pps_list
ce_pair_df['y2x pps'] = y2x_pps_list



| name     | x         | y              | source   | ground truth   |
|:---------|:----------|:---------------|:---------|:---------------|
| pair0001 | Altitude  | Temperature    | DWD      | ->             |
| pair0002 | Altitude  | Precipitation  | DWD      | ->             |
| pair0003 | Longitude | Temperature    | DWD      | ->             |
| pair0004 | Altitude  | Sunshine hours | DWD      | ->             |
| pair0005 | Age       | Length         | Abalone  | ->             |
| pair0006 | Age       | Shell weight   | Abalone  | ->             |
| pair0007 | Age       | Diameter       | Abalone  | ->             |
| pair0008 | Age       | Height         | Abalone  | ->             |
| pair0009 | Age       | Whole weight   | Abalone  | ->             |
| pair0010 | Age       | Shucked weight | Abalone  | ->             |


In [35]:
# Drop every row that has a missing value in any column (this removes the
# pairs whose scores were left as NaN), then keep only the report columns.
# The dropna/selection chain stays a single expression on purpose.
_report_columns = ['x', 'y', 'x2y pps', 'y2x pps', 'ground truth']
ce_pair_df = ce_pair_df.dropna(axis='index')[_report_columns]
_report_table = ce_pair_df.to_markdown()
print(_report_table)


| name     | x                                | y                                    |    x2y pps |    y2x pps | ground truth   |
|:---------|:---------------------------------|:-------------------------------------|-----------:|-----------:|:---------------|
| pair0001 | Altitude                         | Temperature                          | 0.311821   | 0.480005   | ->             |
| pair0002 | Altitude                         | Precipitation                        | 0.162979   | 0.0868262  | ->             |
| pair0003 | Longitude                        | Temperature                          | 0          | 0.0786545  | ->             |
| pair0004 | Altitude                         | Sunshine hours                       | 0          | 0          | ->             |
| pair0005 | Age                              | Length                               | 0.327973   | 0.167912   | ->             |
| pair0006 | Age                              | Shell weight                         | 0.3

In [41]:
# Signed gap between the two directional scores: positive when x predicts y
# better than y predicts x.
_forward_pps = ce_pair_df['x2y pps']
_backward_pps = ce_pair_df['y2x pps']
ce_pair_df['pps diff'] = _forward_pps - _backward_pps

# Score how well that gap predicts the annotated causal direction.
_score_result = pps.score(ce_pair_df, x='pps diff', y='ground truth')
_ce_pps = _score_result['ppscore']

_summary = (
    'The predictive power score of '
    f'(diff(x2y pps, y2x pps) -> causal effect ground truth) is {_ce_pps}'
)
print(_summary)


The predictive power score of (diff(x2y pps, y2x pps) -> causal effect ground truth) is 0


