In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from selectinf.Simulation.spline import cubic_spline, b_spline
from selectinf.Simulation.H1.nonlinear_H1_helpers import *
from selectinf.RealDataHelpers.rdhelpers import *
from sklearn.decomposition import PCA
# Kept after the star imports so these explicit names always win.
from sklearn.metrics import confusion_matrix, f1_score

In [19]:
# Load the dataset from fpw.csv, using the first CSV column as the row index.
fpw = pd.read_csv("fpw.csv", index_col=0)

In [20]:
# Preview the loaded frame (the rendered output truncates the middle rows).
fpw

Unnamed: 0,day,dep_delay,distance,plane_age,seats,temp,dewp,humid,wind_dir,wind_speed,wind_gust,precip,pressure,visib,sched_dep_hour,sched_arr_hour
0,1,28.0,187,18.0,20,42.0,30.0,62.18,360.0,9.20624,10.594357,0.00,1017.8,10.00,23,0
1,1,-12.0,1576,21.0,200,42.0,30.0,62.18,360.0,9.20624,10.594357,0.00,1017.8,10.00,23,4
2,1,6.0,1041,18.0,20,41.0,30.0,64.63,250.0,9.20624,10.594357,0.00,1015.7,10.00,8,11
3,1,-7.0,1182,24.0,145,41.0,30.0,64.63,250.0,9.20624,10.594357,0.00,1015.7,10.00,8,10
4,1,-4.0,1029,22.0,145,41.0,30.0,64.63,250.0,9.20624,10.594357,0.00,1015.7,10.00,8,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8435,27,-3.0,187,11.0,200,42.0,41.0,96.21,0.0,0.00000,0.000000,0.00,1019.2,1.50,5,6
8436,27,2.0,1089,1.0,185,42.0,41.0,96.21,0.0,0.00000,0.000000,0.00,1019.2,1.50,5,8
8437,16,240.0,1089,7.0,189,26.0,23.0,88.23,330.0,4.60312,5.297178,0.01,1020.9,1.25,5,8
8438,27,-5.0,209,17.0,95,42.0,39.0,89.01,160.0,4.60312,5.297178,0.00,1023.2,10.00,23,0


In [33]:
# Inspect the distinct values taken by the `precip` column.
fpw["precip"].unique()

array([0.    , 0.01  , 0.02  , 0.03  , 0.0001, 0.04  ])

In [73]:
# Predictors: every column except the response, `day` and `wind_gust`.
X = fpw.drop(columns=["dep_delay", "day", "wind_gust"])
# Response: the departure-delay column.
Y = fpw.loc[:, "dep_delay"]

In [161]:
# Hold out 95% of the rows; only the remaining 5% forms the training split.
# `train_test_split` is already imported in the first cell, so it is not
# re-imported here. The fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.95, random_state=48105
)

# Construct splines

In [162]:
def get_splines(x_nl, x_l, nknots, degree, intercept):
    """Build a rescaled B-spline design matrix together with its group labels.

    Parameters
    ----------
    x_nl : array-like
        Data forwarded to ``b_spline`` as ``data_nl``.
    x_l : array-like
        Data forwarded to ``b_spline`` as ``data_l``.
    nknots, degree, intercept
        Forwarded unchanged to ``b_spline``.

    Returns
    -------
    design : array
        ``bs.get_spline_data()`` multiplied by ``sqrt(number of rows)``.
        When ``intercept`` is truthy, column 0 is reset to exactly 1.
    groups
        The value of ``bs.get_groups()``; label 0 means the intercept
        (if applicable).
    """
    bs = b_spline(data_nl=np.array(x_nl), data_l=np.array(x_l),
                  nknots=nknots, degree=degree, intercept=intercept)
    bs.construct_splines(use_quantiles=True, equally_spaced=False, center=False)

    # Scale out of place so the array owned by `bs` is never mutated.
    spline_data = bs.get_spline_data()
    design = spline_data * np.sqrt(spline_data.shape[0])

    # Only restore the constant column when one was requested; without this
    # guard the first real feature column would be overwritten with ones.
    # Assumes the intercept column, when present, is column 0 — TODO confirm
    # against b_spline.
    if intercept:
        design[:, 0] = 1

    # Returning group labels with 0 meaning the intercept (if applicable)
    groups = bs.get_groups()

    return design, groups


In [163]:
# Columns with fewer than 40 distinct training values are kept in `linear`.
n_unique = x_train.nunique()
linear = n_unique[n_unique < 40].index.tolist()

In [164]:
# Columns listed in `linear` go to the *_l frames; all other columns go to
# the *_nl frames. The same partition is applied to both splits.
x_train_l, x_test_l = x_train[linear], x_test[linear]
x_train_nl = x_train.drop(columns=linear)
x_test_nl = x_test.drop(columns=linear)

In [165]:
# Build the spline designs for both splits with identical settings.
# NOTE(review): get_splines requests quantile-based knots, and each call sees
# only its own split, so train and test presumably use different knot
# locations — confirm this is intended.
spline_settings = {"nknots": 6, "degree": 2, "intercept": True}
design_train, groups_train = get_splines(x_train_nl, x_train_l, **spline_settings)
design_test, groups_test = get_splines(x_test_nl, x_test_l, **spline_settings)

Equally spaced quantile knots used.
Equally spaced quantile knots used.


In [166]:
# Steps:
# 1. Transform the full training set for the naive and MLE methods.
# 2. Transform 90% of the training set for data splitting's selection step.
# 3. Use 10% of remaining to get the transformed PCs for data splitting's inference step.
# 4. TODO: this step was left unspecified.

In [167]:
# Display the training design matrix.
design_train

array([[ 1.00000000e+00, -2.82382270e-01, -1.48607054e+00, ...,
         1.91572441e+02,  3.44830393e+02,  3.83144881e+02],
       [ 1.00000000e+00, -3.42889987e-02, -8.64822114e-01, ...,
         1.91572441e+02,  3.63987637e+02,  4.21459369e+02],
       [ 1.00000000e+00, -3.42889987e-02, -8.64822114e-01, ...,
         1.91572441e+02,  3.63987637e+02,  4.21459369e+02],
       ...,
       [ 1.00000000e+00, -3.89797575e-01, -1.66885261e+00, ...,
         1.91572441e+02,  2.68201417e+02,  3.06515905e+02],
       [ 1.00000000e+00, -4.59079262e-02, -2.64939519e-01, ...,
         1.91572441e+02,  2.68201417e+02,  3.25673149e+02],
       [ 1.00000000e+00, -1.12802153e+00,  5.93502449e-01, ...,
         2.87358661e+01,  2.10729685e+02,  2.68201417e+02]])

In [168]:
# Gram matrix (X^T X) of the training design matrix.
design_train.T @ design_train

array([[ 3.67000000e+02, -1.49938522e+02, -2.33783054e+02,
        -2.26743972e+02,  5.18235806e+01, -2.83006308e+02,
         8.23416159e+01, -1.72277663e+02,  6.28134187e+01,
         9.29892627e+04,  1.25449297e+06,  2.57783708e+05,
         1.47913081e+06,  7.85049988e+04,  1.11954934e+01,
         6.01677311e+04,  9.29317909e+04,  1.01954853e+05],
       [-1.49938522e+02,  3.67000000e+02,  4.66293670e-15,
         7.23137294e+01, -3.21406281e+01,  1.00349896e+02,
        -2.46329941e+01,  5.76178230e+01, -2.39668142e+01,
        -4.04820620e+04, -7.25755138e+05, -1.04259554e+05,
        -6.03296104e+05, -2.83158113e+04, -3.88672624e+00,
        -2.50936209e+04, -3.61776670e+04, -4.18720024e+04],
       [-2.33783054e+02,  4.66293670e-15,  3.67000000e+02,
         1.59543581e+02, -2.83567808e+01,  2.03261335e+02,
        -5.05196139e+01,  1.18236029e+02, -6.97937733e+00,
        -6.71350067e+04, -3.39591129e+05, -1.64094760e+05,
        -9.41442633e+05, -5.06960275e+04, -8.71644001e

In [169]:
# Re-display the training design matrix.
design_train

array([[ 1.00000000e+00, -2.82382270e-01, -1.48607054e+00, ...,
         1.91572441e+02,  3.44830393e+02,  3.83144881e+02],
       [ 1.00000000e+00, -3.42889987e-02, -8.64822114e-01, ...,
         1.91572441e+02,  3.63987637e+02,  4.21459369e+02],
       [ 1.00000000e+00, -3.42889987e-02, -8.64822114e-01, ...,
         1.91572441e+02,  3.63987637e+02,  4.21459369e+02],
       ...,
       [ 1.00000000e+00, -3.89797575e-01, -1.66885261e+00, ...,
         1.91572441e+02,  2.68201417e+02,  3.06515905e+02],
       [ 1.00000000e+00, -4.59079262e-02, -2.64939519e-01, ...,
         1.91572441e+02,  2.68201417e+02,  3.25673149e+02],
       [ 1.00000000e+00, -1.12802153e+00,  5.93502449e-01, ...,
         2.87358661e+01,  2.10729685e+02,  2.68201417e+02]])

In [192]:
# Run the naive inference helper on the training split
# (level 0.9, weak-hierarchy mode, full weight fraction).
const = group_lasso.gaussian
(result_naive,
 nonzero_naive,
 selected_groups_naive) = naive_inference_real_data(
    X=design_train,
    Y=np.array(y_train),
    raw_data=np.array(x_train),
    groups=groups_train,
    const=const,
    n_features=x_train.shape[1],
    intercept=True,
    weight_frac=1,
    level=0.9,
    mode="weakhierarchy",
    root_n_scaled=False,
)

Selected groups: [0, 4, 5, 6, 7, 8, 9, 11, 12, 13]
Naive Selected Groups: 10


In [193]:
# Show only the naive results whose p-value is below 0.1.
pd.DataFrame(result_naive).loc[lambda frame: frame["pval"] < 0.1]

Unnamed: 0,i,j,CI_l,CI_u,beta_hat,pval
1,0,4,5.998946e-06,0.000385,0.000195,0.088874
2,0,5,7.855609e-08,0.00015,7.5e-05,0.098763
5,0,8,0.1483645,1.673687,0.911026,0.048841
10,1,4,-0.08540047,-0.003719,-0.04456,0.071967
33,3,10,0.02967577,0.420132,0.224904,0.057454
67,9,11,0.04022288,0.203263,0.121743,0.013783


In [194]:
# Run the MLE inference helper on the training split with the same settings
# as the naive call, plus proportion=0.9.
(result_MLE,
 nonzero_MLE,
 selected_groups_MLE) = MLE_inference_real_data(
    X=design_train,
    Y=np.array(y_train),
    raw_data=np.array(x_train),
    groups=groups_train,
    n_features=x_train.shape[1],
    intercept=True,
    weight_frac=1,
    level=0.9,
    mode="weakhierarchy",
    root_n_scaled=False,
    proportion=0.9,
)

Selected groups: [0, 4, 5, 6, 7, 8, 9, 11, 12, 13]
MLE Selected Groups: 10


In [195]:
# Show only the MLE results whose p-value is below 0.1.
pd.DataFrame(result_MLE).loc[lambda frame: frame["pval"] < 0.1]

Unnamed: 0,i,j,CI_l,CI_u,beta_hat,pval
1,0,4,0.000096,0.001125,0.000611,5.111640e-02
2,0,5,0.000134,0.000569,0.000352,7.892693e-03
3,0,6,-0.000908,-0.000801,-0.000855,6.431335e-151
4,0,7,-0.012682,-0.010208,-0.011445,2.648483e-52
5,0,8,-6.270124,-2.781589,-4.525857,1.973154e-05
...,...,...,...,...,...,...
67,9,11,-9.759472,-9.552653,-9.656063,0.000000e+00
68,9,12,-6.285534,-6.120508,-6.203021,0.000000e+00
69,10,11,-9.899123,-9.167095,-9.533109,0.000000e+00
70,10,12,-12.948571,-12.304919,-12.626745,0.000000e+00


# Validate significant interactions on a holdout set

In [196]:
def validate(x_test, design_test, y_test, nonzero, selected_groups,
             groups=None, n_features=None, intercept=True, mode="allpairs", level=0.9):
    """Re-test candidate pairwise interactions on a holdout split.

    Parameters
    ----------
    x_test : array-like
        Raw holdout features; column ``i`` times column ``j`` forms the
        interaction term for the pair ``(i, j)``.
    design_test : array
        Holdout design matrix; only the columns indexed by ``nonzero`` are used.
    y_test : array-like
        Holdout response, converted to an ndarray before being passed on.
    nonzero
        Column selector applied to ``design_test``.
    selected_groups
        Indices of the selected groups, used to index the active-group flags.
    groups
        Group labels of the design columns. Required, despite the ``None``
        default kept for signature compatibility.
    n_features : int, optional
        Number of raw features to pair up. Defaults to the number of columns
        of ``x_test``.
    intercept : bool
        If true, the first group flag is dropped when forming the
        per-variable flags.
    mode : {"allpairs", "weakhierarchy", "stronghierarchy"}
        Which pairs are tested: all of them, those with at least one active
        variable, or those with both variables active.
    level : float
        Forwarded to ``interaction_t_tests_all``.

    Returns
    -------
    The value returned by ``interaction_t_tests_all``, unchanged.

    Raises
    ------
    ValueError
        If ``groups`` is None or ``mode`` is not one of the supported values.
    """
    supported_modes = ("allpairs", "weakhierarchy", "stronghierarchy")
    if mode not in supported_modes:
        # Previously an unknown mode silently produced an empty task set.
        raise ValueError(
            f"mode must be one of {supported_modes}, got {mode!r}")
    if groups is None:
        # np.unique(None) would yield a single-element array and fail later
        # with an unrelated IndexError.
        raise ValueError("groups must be provided")

    raw_data = np.asarray(x_test)
    if n_features is None:
        n_features = raw_data.shape[1]

    X_E = design_test[:, nonzero]

    # One flag per group; selected groups are marked with 1.
    active_flag = np.zeros(np.unique(groups).shape[0])
    active_flag[selected_groups] = 1.
    # With an intercept, the first flag belongs to the intercept group.
    active_vars_flag = active_flag[1:] if intercept else active_flag

    data_interaction = {}
    for i in range(n_features):
        for j in range(i + 1, n_features):
            if mode == "allpairs":
                keep = True
            elif mode == "weakhierarchy":
                keep = active_vars_flag[i] or active_vars_flag[j]
            else:  # "stronghierarchy"
                keep = active_vars_flag[i] and active_vars_flag[j]
            if keep:
                data_interaction[(i, j)] = raw_data[:, i] * raw_data[:, j]

    # Pass a plain ndarray, matching how the training-side helpers are called
    # and avoiding any pandas index alignment inside the helper.
    result_dict = interaction_t_tests_all(X_E, np.asarray(y_test), n_features,
                                          active_vars_flag, data_interaction,
                                          level=level, mode=mode)

    return result_dict

In [197]:
# Re-test, on the holdout split, the interactions implied by the naive selection.
result_naive_validate = validate(
    x_test, design_test, y_test,
    nonzero=nonzero_naive,
    selected_groups=selected_groups_naive,
    groups=groups_train,
    n_features=x_train.shape[1],
    intercept=True,
    mode="weakhierarchy",
    level=0.9,
)

In [198]:
# Tabulate the holdout results for the naive selection.
pd.DataFrame(result_naive_validate)

Unnamed: 0,i,j,CI_l,CI_u,beta_hat,pval
0,0,3,-0.000016,0.000078,0.000031,0.285025
1,0,4,-0.000039,0.000070,0.000016,0.637015
2,0,5,-0.000040,0.000006,-0.000017,0.218457
3,0,6,-0.000002,0.000010,0.000004,0.295011
4,0,7,-0.000125,0.000121,-0.000002,0.977187
...,...,...,...,...,...,...
67,9,11,-0.060116,-0.012799,-0.036458,0.011245
68,9,12,-0.045121,-0.004087,-0.024604,0.048525
69,10,11,-0.128064,0.029340,-0.049362,0.302172
70,10,12,-0.147773,-0.013460,-0.080617,0.048292


In [199]:
# Re-test, on the holdout split, the interactions implied by the MLE selection.
result_MLE_validate = validate(
    x_test, design_test, y_test,
    nonzero=nonzero_MLE,
    selected_groups=selected_groups_MLE,
    groups=groups_train,
    n_features=x_train.shape[1],
    intercept=True,
    mode="weakhierarchy",
    level=0.9,
)

In [200]:
# Tabulate the holdout results for the MLE selection.
pd.DataFrame(result_MLE_validate)

Unnamed: 0,i,j,CI_l,CI_u,beta_hat,pval
0,0,3,-0.000016,0.000078,0.000031,0.285025
1,0,4,-0.000039,0.000070,0.000016,0.637015
2,0,5,-0.000040,0.000006,-0.000017,0.218457
3,0,6,-0.000002,0.000010,0.000004,0.295011
4,0,7,-0.000125,0.000121,-0.000002,0.977187
...,...,...,...,...,...,...
67,9,11,-0.060116,-0.012799,-0.036458,0.011245
68,9,12,-0.045121,-0.004087,-0.024604,0.048525
69,10,11,-0.128064,0.029340,-0.049362,0.302172
70,10,12,-0.147773,-0.013460,-0.080617,0.048292


In [201]:
# Split the naive training results into (i, j) pairs with p-value below 0.1
# and pairs with p-value at or above 0.1.
naive_df = pd.DataFrame(result_naive)
naive_significant = [
    (i, j)
    for i, j, pval in zip(naive_df["i"].to_numpy(),
                          naive_df["j"].to_numpy(),
                          naive_df["pval"].to_numpy())
    if pval < 0.1
]
naive_insig = [
    (i, j)
    for i, j, pval in zip(naive_df["i"].to_numpy(),
                          naive_df["j"].to_numpy(),
                          naive_df["pval"].to_numpy())
    if pval >= 0.1
]

In [202]:
# Same split as for the training results, applied to the holdout results
# of the naive selection.
naive_test_df = pd.DataFrame(result_naive_validate)
naive_test_significant = [
    (i, j)
    for i, j, pval in zip(naive_test_df["i"].to_numpy(),
                          naive_test_df["j"].to_numpy(),
                          naive_test_df["pval"].to_numpy())
    if pval < 0.1
]
naive_test_insig = [
    (i, j)
    for i, j, pval in zip(naive_test_df["i"].to_numpy(),
                          naive_test_df["j"].to_numpy(),
                          naive_test_df["pval"].to_numpy())
    if pval >= 0.1
]

In [203]:
# Every (i, j) pair present in the naive holdout results, in row order.
naive_all = list(zip(naive_test_df["i"].to_numpy(),
                     naive_test_df["j"].to_numpy()))

In [204]:
# Share of training-significant pairs that are also significant on the holdout split.
len(set(naive_significant) & set(naive_test_significant)) / len(set(naive_significant))

0.3333333333333333

In [205]:
# Share of holdout-insignificant pairs that were also insignificant on the training split.
len(set(naive_insig) & set(naive_test_insig)) / len(set(naive_test_insig))

0.9

In [206]:
# Split the MLE training results into (i, j) pairs with p-value below 0.1
# and pairs with p-value at or above 0.1.
MLE_df = pd.DataFrame(result_MLE)
MLE_significant = [
    (i, j)
    for i, j, pval in zip(MLE_df["i"].to_numpy(),
                          MLE_df["j"].to_numpy(),
                          MLE_df["pval"].to_numpy())
    if pval < 0.1
]
MLE_insig = [
    (i, j)
    for i, j, pval in zip(MLE_df["i"].to_numpy(),
                          MLE_df["j"].to_numpy(),
                          MLE_df["pval"].to_numpy())
    if pval >= 0.1
]

In [207]:
# Same split as for the training results, applied to the holdout results
# of the MLE selection.
MLE_test_df = pd.DataFrame(result_MLE_validate)
MLE_test_significant = [
    (i, j)
    for i, j, pval in zip(MLE_test_df["i"].to_numpy(),
                          MLE_test_df["j"].to_numpy(),
                          MLE_test_df["pval"].to_numpy())
    if pval < 0.1
]
MLE_test_insig = [
    (i, j)
    for i, j, pval in zip(MLE_test_df["i"].to_numpy(),
                          MLE_test_df["j"].to_numpy(),
                          MLE_test_df["pval"].to_numpy())
    if pval >= 0.1
]

In [208]:
# Every (i, j) pair present in the MLE holdout results, in row order.
MLE_all = list(zip(MLE_test_df["i"].to_numpy(),
                   MLE_test_df["j"].to_numpy()))

In [209]:
alpha = 0.1
# Align training and holdout p-values on the (i, j) pair instead of relying on
# both frames having the same rows in the same order. `validate="one_to_one"`
# makes pandas raise if a pair is duplicated on either side.
naive_pairs = naive_df.merge(naive_test_df, on=["i", "j"],
                             suffixes=("_train", "_test"),
                             validate="one_to_one")
# Per-pair significance decisions at level `alpha`, in matching order.
naive_train = (naive_pairs["pval_train"] < alpha).tolist()
naive_test = (naive_pairs["pval_test"] < alpha).tolist()

In [210]:
# Holdout decisions are passed as the reference labels (rows) and training
# decisions as the predictions (columns), so the layout is:
# tn, fp,
# fn, tp
# `confusion_matrix` is imported in the notebook's first cell.
confusion_matrix(naive_test, naive_train)

array([[36,  4],
       [30,  2]])

In [216]:
# F1 score of the naive training decisions against the holdout decisions.
# `f1_score` is imported in the notebook's first cell.
f1_score(naive_test, naive_train)

0.10526315789473684

In [211]:
# Align training and holdout p-values on the (i, j) pair instead of relying on
# both frames having the same rows in the same order. `validate="one_to_one"`
# makes pandas raise if a pair is duplicated on either side.
MLE_pairs = MLE_df.merge(MLE_test_df, on=["i", "j"],
                         suffixes=("_train", "_test"),
                         validate="one_to_one")
# Per-pair significance decisions at level `alpha`, in matching order.
MLE_train = (MLE_pairs["pval_train"] < alpha).tolist()
MLE_test = (MLE_pairs["pval_test"] < alpha).tolist()

In [212]:
# Holdout decisions are passed as the reference labels (rows) and training
# decisions as the predictions (columns), so the layout is:
# tn, fp,
# fn, tp
# `confusion_matrix` is imported in the notebook's first cell.
confusion_matrix(MLE_test, MLE_train)

array([[ 4, 36],
       [ 2, 30]])

In [214]:
# F1 score of the MLE training decisions against the holdout decisions.
# `f1_score` is imported in the notebook's first cell.
f1_score(MLE_test, MLE_train)

0.6122448979591837

In [213]:
# Number of pairs significant on both the training and the holdout split (MLE).
len(set(MLE_significant) & set(MLE_test_significant))

30

In [64]:
# Number of pairs insignificant on both the training and the holdout split (MLE).
# NOTE(review): this cell's saved execution count is much lower than the
# surrounding cells, so its stored output may be stale — re-run to refresh.
len(set(MLE_insig) & set(MLE_test_insig))

106