In [355]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from selectinf.Simulation.spline import cubic_spline, b_spline
from selectinf.Simulation.H1.nonlinear_H1_helpers import *
from selectinf.RealDataHelpers.rdhelpers import *
from sklearn.decomposition import PCA

In [356]:
fpw = pd.read_csv("fpw.csv", index_col=0)

In [357]:
fpw

Unnamed: 0,day,dep_delay,distance,plane_age,seats,temp,dewp,humid,wind_dir,wind_speed,wind_gust,precip,pressure,visib,sched_dep_hour,sched_arr_hour
0,1,28.0,187,18.0,20,42.0,30.0,62.18,360.0,9.20624,10.594357,0.00,1017.8,10.00,23,0
1,1,-12.0,1576,21.0,200,42.0,30.0,62.18,360.0,9.20624,10.594357,0.00,1017.8,10.00,23,4
2,1,6.0,1041,18.0,20,41.0,30.0,64.63,250.0,9.20624,10.594357,0.00,1015.7,10.00,8,11
3,1,-7.0,1182,24.0,145,41.0,30.0,64.63,250.0,9.20624,10.594357,0.00,1015.7,10.00,8,10
4,1,-4.0,1029,22.0,145,41.0,30.0,64.63,250.0,9.20624,10.594357,0.00,1015.7,10.00,8,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8435,27,-3.0,187,11.0,200,42.0,41.0,96.21,0.0,0.00000,0.000000,0.00,1019.2,1.50,5,6
8436,27,2.0,1089,1.0,185,42.0,41.0,96.21,0.0,0.00000,0.000000,0.00,1019.2,1.50,5,8
8437,16,240.0,1089,7.0,189,26.0,23.0,88.23,330.0,4.60312,5.297178,0.01,1020.9,1.25,5,8
8438,27,-5.0,209,17.0,95,42.0,39.0,89.01,160.0,4.60312,5.297178,0.00,1023.2,10.00,23,0


In [358]:
fpw["precip"].unique()

array([0.    , 0.01  , 0.02  , 0.03  , 0.0001, 0.04  ])

In [359]:
Y = fpw["dep_delay"]
X = fpw.drop(["dep_delay", 'day', "wind_gust"], axis=1)

In [376]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.95,
                                                    random_state=42)
# Standardize
x_train = (x_train - x_train.mean()) / x_train.std()
x_test = (x_test - x_train.mean()) / x_train.std()

# Construct splines

In [377]:
def get_splines(x_nl, x_l, nknots, degree, intercept):
    bs = b_spline(data_nl=np.array(x_nl), data_l=np.array(x_l), 
                  nknots=nknots, degree=degree, intercept=intercept)
    bs.construct_splines(use_quantiles=True, equally_spaced=False, center=False)
    design_train = bs.get_spline_data()
    design_train *= np.sqrt(design_train.shape[0])
    design_train[:, 0] = 1
    # Returning group labels with 0 meaning the intercept (if applicable)
    groups = bs.get_groups()

    return design_train, groups


In [378]:
linear = list(x_train.columns[x_train.nunique() < 40])

In [379]:
x_train.nunique()

distance           56
plane_age          31
seats              23
temp               39
dewp               48
humid             182
wind_dir           34
wind_speed         21
precip              6
pressure          174
visib              17
sched_dep_hour     18
sched_arr_hour     20
dtype: int64

In [380]:
x_train_nl = x_train.drop(linear, axis=1)
x_train_l = x_train[linear]
x_test_nl = x_test.drop(linear, axis=1)
x_test_l = x_test[linear]

In [381]:
design_train, groups_train = (
        get_splines(x_train_nl, x_train_l, nknots=6, degree=2, intercept=True))
design_test, groups_test = (
        get_splines(x_test_nl, x_test_l, nknots=6, degree=2, intercept=True))

Equally spaced quantile knots used.
Equally spaced quantile knots used.


In [382]:
# Steps:
# 1. Transform training fully for naive and MLE
# 2. Transform 90% of training for data splitting's selection
# 3. Use 10% of remaining to get the transformed PCs for data splitting's inference
# 4. 

In [383]:
design_train

array([[  1.        ,  -0.16992383,   1.20894571, ...,   9.67651294,
        -18.73071828, -18.03147071],
       [  1.        ,  -1.52320332,  -0.58029123, ...,   9.67651294,
        -31.10175656, -21.82376284],
       [  1.        ,  -0.55177734,  -0.1985028 , ...,   9.67651294,
         30.75343487, -59.74668405],
       ...,
       [  1.        ,  -0.12219291,   1.11424005, ..., -24.9026521 ,
          1.88767887,   4.72228201],
       [  1.        ,  -1.52320332,  -0.58029123, ...,  -4.15515308,
          6.01135829,  12.30686626],
       [  1.        ,  -0.2286495 ,   1.31091077, ...,   9.67651294,
        -14.60703885, -14.23917859]])

In [384]:
design_train.T @ design_train

array([[ 3.67000000e+02, -1.81122388e+02,  2.30346530e+02,
        -2.27841228e+02, -6.39262000e+01, -2.70706142e+02,
         1.44802282e+02, -1.75079678e+02,  6.41689396e+01,
         0.00000000e+00, -6.11066753e-13,  3.04112291e-12,
        -3.97903932e-13, -2.27373675e-13, -2.11386464e-13,
         7.38964445e-13,  1.51345603e-12,  4.58300065e-13],
       [-1.81122388e+02,  3.67000000e+02,  8.88178420e-15,
         1.48840631e+02,  3.65639193e+01,  1.45958587e+02,
        -8.44870264e+01,  7.33157831e+01, -3.28115888e+01,
         3.51491808e+01, -2.42976417e+03,  4.74999302e+02,
        -2.75377927e+02, -5.31769435e+01,  4.25329840e+01,
        -8.00465776e+01,  1.23246727e+02, -9.18033929e+01],
       [ 2.30346530e+02,  8.88178420e-15,  3.67000000e+02,
        -1.16333009e+02, -4.55006294e+01, -1.55077021e+02,
         8.33730363e+01, -1.23068516e+02,  1.22208946e+01,
         1.17203341e+03, -4.47720660e+03,  1.09370504e+02,
        -5.37250290e+02, -2.52840538e+02, -7.19885106e

In [385]:
design_train

array([[  1.        ,  -0.16992383,   1.20894571, ...,   9.67651294,
        -18.73071828, -18.03147071],
       [  1.        ,  -1.52320332,  -0.58029123, ...,   9.67651294,
        -31.10175656, -21.82376284],
       [  1.        ,  -0.55177734,  -0.1985028 , ...,   9.67651294,
         30.75343487, -59.74668405],
       ...,
       [  1.        ,  -0.12219291,   1.11424005, ..., -24.9026521 ,
          1.88767887,   4.72228201],
       [  1.        ,  -1.52320332,  -0.58029123, ...,  -4.15515308,
          6.01135829,  12.30686626],
       [  1.        ,  -0.2286495 ,   1.31091077, ...,   9.67651294,
        -14.60703885, -14.23917859]])

In [438]:
const = group_lasso.gaussian
result_naive, nonzero_naive, selected_groups_naive \
    = naive_inference_real_data(X=design_train, Y=np.array(y_train), 
                                raw_data=np.array(x_train), 
                                groups=groups_train, const=const,
                                n_features=x_train.shape[1], 
                                intercept=True, weight_frac=5, level=0.9,
                                mode="weakhierarchy", root_n_scaled=True)

Selected groups: [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
Naive Selected Groups: 11


In [413]:
pd.DataFrame(result_naive)[pd.DataFrame(result_naive)['pval'] < 0.1]

Unnamed: 0,i,j,CI_l,CI_u,beta_hat,pval
20,3,6,0.967577,18.233321,9.600449,0.066663
23,3,11,-18.491357,-2.815682,-10.65352,0.024991
25,4,6,1.093364,19.707564,10.400464,0.065352
28,4,9,-17.55974,-2.941687,-10.250713,0.020732
30,4,11,-15.172237,-0.39069,-7.781463,0.082514
35,5,9,-16.624708,-2.635061,-9.629885,0.023186
46,7,10,-17.717474,-2.876981,-10.297228,0.022108
52,9,10,6.110416,21.010464,13.56044,0.002685


In [439]:
result_MLE, nonzero_MLE, selected_groups_MLE\
    = MLE_inference_real_data(X=design_train, Y=np.array(y_train), 
                              raw_data=np.array(x_train), groups=groups_train, 
                              n_features=x_train.shape[1], 
                              intercept=True, weight_frac=5, level=0.9, 
                              mode="weakhierarchy", 
                              root_n_scaled=True, proportion=0.9)

Selected groups: [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
MLE Selected Groups: 11


In [440]:
pd.DataFrame(result_MLE)[pd.DataFrame(result_MLE)['pval'] < 0.1]

Unnamed: 0,i,j,CI_l,CI_u,beta_hat,pval
14,1,7,-16.861629,-0.800609,-8.831119,0.070476
16,1,9,2.09633,17.609547,9.852939,0.036672
20,2,3,0.050667,16.277308,8.163988,0.097899
24,2,7,-16.934544,-1.002255,-8.9684,0.064055
37,3,11,-29.132414,-6.191193,-17.661803,0.01132
38,3,12,-30.135588,-6.616663,-18.376126,0.010159
42,4,8,1.766295,42.332865,22.04958,0.073762
43,4,9,-39.958886,-11.654092,-25.806489,0.002706
44,4,10,-49.477189,-0.672676,-25.074933,0.09099
45,4,11,-30.608469,-7.095236,-18.851853,0.008351


# Validate significant interactions on a holdout set

In [441]:
def validate(x_test, design_test, y_test, nonzero, selected_groups, 
             groups = None, n_features=None, intercept=True, mode="allpairs", level=0.9):
    X_E = design_test[:, nonzero]
    active_flag = np.zeros(np.unique(groups).shape[0])
    active_flag[selected_groups] = 1.
    raw_data=np.array(x_test)

    if intercept:
        active_vars_flag = active_flag[1:]
    else:
        active_vars_flag = active_flag

    data_interaction = {}
    task_idx = []
    for i in range(n_features):
        for j in range(i + 1, n_features):
            if mode == "allpairs":
                task_idx.append((i, j))
                data_interaction[(i, j)] = raw_data[:, i] * raw_data[:, j]
            elif mode == 'weakhierarchy':
                if active_vars_flag[i] or active_vars_flag[j]:
                    task_idx.append((i, j))
                    data_interaction[(i, j)] = raw_data[:, i] * raw_data[:, j]
            elif mode == 'stronghierarchy':
                if active_vars_flag[i] and active_vars_flag[j]:
                    task_idx.append((i, j))
                    data_interaction[(i, j)] = raw_data[:, i] * raw_data[:, j]

    result_dict = interaction_t_tests_all(X_E, y_test, n_features,
                                          active_vars_flag, data_interaction,
                                          level=level, mode=mode)

    return result_dict

In [442]:
result_naive_validate = validate(x_test, design_test, y_test, nonzero_naive, 
                                 selected_groups_naive, groups = groups_train, 
                                 n_features=x_train.shape[1],
                                 intercept=True, mode="weakhierarchy", level=0.9)

In [443]:
pd.DataFrame(result_naive_validate)

Unnamed: 0,i,j,CI_l,CI_u,beta_hat,pval
0,0,3,-0.000013,0.000076,0.000032,0.237806
1,0,4,-0.000036,0.000066,0.000015,0.622868
2,0,5,-0.000035,0.000008,-0.000014,0.292419
3,0,6,-0.000002,0.000010,0.000004,0.281871
4,0,7,-0.000105,0.000131,0.000013,0.856069
...,...,...,...,...,...,...
70,9,11,-0.060874,-0.015162,-0.038018,0.006212
71,9,12,-0.046554,-0.007463,-0.027008,0.023013
72,10,11,-0.123986,0.026534,-0.048726,0.286838
73,10,12,-0.137401,-0.009214,-0.073307,0.059894


In [444]:
result_MLE_validate = validate(x_test, design_test, y_test, nonzero_MLE, 
                                 selected_groups_MLE, groups = groups_train, 
                                 n_features=x_train.shape[1],
                                 intercept=True, mode="weakhierarchy", level=0.9)

In [445]:
pd.DataFrame(result_MLE_validate)

Unnamed: 0,i,j,CI_l,CI_u,beta_hat,pval
0,0,3,-0.000013,0.000076,0.000032,0.237806
1,0,4,-0.000036,0.000066,0.000015,0.622868
2,0,5,-0.000035,0.000008,-0.000014,0.292419
3,0,6,-0.000002,0.000010,0.000004,0.281871
4,0,7,-0.000105,0.000131,0.000013,0.856069
...,...,...,...,...,...,...
70,9,11,-0.060874,-0.015162,-0.038018,0.006212
71,9,12,-0.046554,-0.007463,-0.027008,0.023013
72,10,11,-0.123986,0.026534,-0.048726,0.286838
73,10,12,-0.137401,-0.009214,-0.073307,0.059894


In [446]:
naive_df = pd.DataFrame(result_naive)
naive_significant = \
    [(naive_df["i"][k], naive_df["j"][k]) for k in range(naive_df.shape[0])
     if naive_df["pval"][k] < 0.1]
naive_insig = \
    [(naive_df["i"][k], naive_df["j"][k]) for k in range(naive_df.shape[0])
     if naive_df["pval"][k] >= 0.1]

In [447]:
naive_test_df = pd.DataFrame(result_naive_validate)
naive_test_significant = \
    [(naive_test_df["i"][k], naive_test_df["j"][k]) for k in range(naive_test_df.shape[0])
     if naive_test_df["pval"][k] < 0.1]
naive_test_insig = \
    [(naive_test_df["i"][k], naive_test_df["j"][k]) for k in range(naive_test_df.shape[0])
     if naive_test_df["pval"][k] >= 0.1]

In [448]:
naive_all = [(naive_test_df["i"][k], naive_test_df["j"][k]) 
             for k in range(naive_test_df.shape[0])]

In [449]:
len(set(naive_test_significant).intersection(set(naive_significant))) / len(set(naive_significant))

0.4166666666666667

In [450]:
len(set(naive_test_insig).intersection(set(naive_insig))) / len(set(naive_test_insig))

0.8292682926829268

In [451]:
MLE_df = pd.DataFrame(result_MLE)
MLE_significant = \
    [(MLE_df["i"][k], MLE_df["j"][k]) for k in range(MLE_df.shape[0])
     if MLE_df["pval"][k] < 0.1]
MLE_insig = \
    [(MLE_df["i"][k], MLE_df["j"][k]) for k in range(MLE_df.shape[0])
     if MLE_df["pval"][k] >= 0.1]

In [452]:
len(MLE_significant)

18

In [453]:
MLE_test_df = pd.DataFrame(result_MLE_validate)
MLE_test_significant = \
    [(MLE_test_df["i"][k], MLE_test_df["j"][k]) for k in range(MLE_test_df.shape[0])
     if MLE_test_df["pval"][k] < 0.1]
MLE_test_insig = \
    [(MLE_test_df["i"][k], MLE_test_df["j"][k]) for k in range(MLE_test_df.shape[0])
     if MLE_test_df["pval"][k] >= 0.1]

In [454]:
MLE_all = [(MLE_test_df["i"][k], MLE_test_df["j"][k]) 
             for k in range(MLE_test_df.shape[0])]

In [455]:
alpha = 0.1
naive_train = [(naive_df["pval"][k] < alpha) for k in range(naive_df.shape[0])]
naive_test = [(naive_test_df["pval"][k] < alpha) for k in range(naive_test_df.shape[0])]

In [456]:
from sklearn.metrics import confusion_matrix
# tn, fp, 
# fn, tp
confusion_matrix(naive_test, naive_train)

array([[34,  7],
       [29,  5]])

In [457]:
from sklearn.metrics import f1_score
f1_score(naive_test, naive_train)

0.21739130434782608

In [458]:
MLE_train = [(MLE_df["pval"][k] < alpha) for k in range(MLE_df.shape[0])]
MLE_test = [(MLE_test_df["pval"][k] < alpha) for k in range(MLE_test_df.shape[0])]

In [459]:
from sklearn.metrics import confusion_matrix
# tn, fp, 
# fn, tp
confusion_matrix(MLE_test, MLE_train)

array([[35,  6],
       [22, 12]])

In [460]:
from sklearn.metrics import f1_score
f1_score(MLE_test, MLE_train)

0.46153846153846156

In [461]:
len(set(MLE_test_significant).intersection(set(MLE_significant)))# / len(MLE_test_significant)

12

In [437]:
len(set(MLE_test_insig).intersection(set(MLE_insig)))# / len(set(MLE_test_insig))

15