In [1]:
import numpy as np
from sklearn.svm import SVR, SVC, LinearSVR, LinearSVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.calibration import CalibratedClassifierCV
from data_generation import get_data, g_0, m_0
from dml_algorithm import dml_ate

In [5]:
# Seeded generator; it is the object handed to get_data(...) in the cells below.
rng = np.random.default_rng(42)

In [32]:
# Draw N observations and hold out 20% of them for the visual checks below.
N = 500
y_data, d_data, x_data = get_data(N, rng)
(
    y_train, y_test,
    d_train, d_test,
    x_train, x_test,
) = train_test_split(
    y_data, d_data, x_data,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Outcome regressions, one per treatment arm: linear SVR ...
model_g0 = LinearSVR(dual='auto', max_iter=2500)
model_g1 = LinearSVR(dual='auto', max_iter=2500)
# ... and SVR with its default settings.
model_g0_nl = SVR()
model_g1_nl = SVR()

# Treatment classifiers that expose predict_proba:
# a linear SVC wrapped in CalibratedClassifierCV (optional: n_jobs=-1) ...
model_m = CalibratedClassifierCV(
    estimator=LinearSVC(dual='auto', max_iter=2500)
)
# ... and an SVC with its built-in probability estimates enabled.
model_m_nl = SVC(probability=True, random_state=42)

In [33]:
%%time
# Fit each classifier on the training split and print its class-1 probability
# for the first 20 test rows (SVC first, then the calibrated linear SVC).
for clf in (model_m_nl, model_m):
    clf.fit(x_train, d_train)
    print(clf.predict_proba(x_test)[:20, 1])

# Reference values from m_0 (imported from data_generation; presumably the
# true propensity score -- TODO confirm).
print(m_0(x_test[:20]))

[0.20616222 0.48191053 0.80500452 0.14827234 0.35374326 0.15227984
 0.45363401 0.75366221 0.20784173 0.07876629 0.07526102 0.82241337
 0.18787036 0.82666651 0.46930685 0.10085445 0.5968767  0.61820791
 0.08937497 0.6272459 ]
[0.26064935 0.52283606 0.8706997  0.26024082 0.22946942 0.13067742
 0.41072264 0.61396534 0.15104785 0.13644403 0.07639933 0.6142506
 0.33894458 0.73694633 0.42284718 0.08460405 0.82201806 0.40374624
 0.07297841 0.34627699]
[0.16465136 0.29591761 0.99321981 0.22196489 0.25091268 0.09449871
 0.52666606 0.8384573  0.11335196 0.08531746 0.0487229  0.67460123
 0.21047555 0.52894896 0.39774866 0.05527185 0.93581175 0.5346337
 0.0431292  0.93009875]
CPU times: total: 109 ms
Wall time: 128 ms


In [30]:
%%time
# Fit both outcome regressors on the d == 0 training rows only and print their
# predictions for the first 20 test rows (SVR first, then LinearSVR).
for reg in (model_g0_nl, model_g0):
    reg.fit(x_train[d_train == 0], y_train[d_train == 0])
    print(reg.predict(x_test[:20]))

# Reference values from g_0 with d = 0 (imported from data_generation;
# presumably the true outcome function -- TODO confirm).
print(g_0(0, x_test[:20]))

[10.47716628  7.73409231  6.46016792  5.95428982  8.8291173   7.1627878
  3.40736756  0.24695816  6.50644919  5.80975088  4.52308587  3.75535651
  7.72381709  1.73496596  2.5745025   6.07583292  3.07167522  6.46187477
  3.09845883  5.15547289]
[10.2159737   5.37702912  6.25284053  6.12665864  8.7665275   7.61665537
  4.1994701  -0.20306386  4.23214008  6.35352659  4.94887086  3.61963059
  5.72675918  2.066882    2.99060531  5.75120088  3.33227346  6.91585736
  4.26560899  5.35120698]
[10.20339689  7.44669562  5.49039872  5.832386    8.70385726  7.07663241
  3.43980833 -0.40490245  6.93449924  5.79251758  4.5309165   3.60693093
  7.69295896  1.76317445  2.65161581  6.20005277  3.08596869  6.38886038
  3.27747993  5.03170534]
CPU times: total: 3.56 s
Wall time: 3.68 s


In [31]:
%%time
# Same comparison as for the d == 0 arm, now restricted to d == 1 training rows:
# SVR first, then LinearSVR, each printed for the first 20 test rows.
for reg in (model_g1_nl, model_g1):
    reg.fit(x_train[d_train == 1], y_train[d_train == 1])
    print(reg.predict(x_test[:20]))

# Reference values from g_0 with d = 1 (imported from data_generation;
# presumably the true outcome function -- TODO confirm).
print(g_0(1, x_test[:20]))

[10.89607559  6.73233185  5.05222714  7.11811649  6.54557223  7.47402865
  1.67399861  0.17162879  5.46133247  5.8237022   5.97104698  2.98411551
  7.88774833  2.71552733  1.3120564   6.42046094  4.81142543  6.17313181
  3.0982366   4.3211792 ]
[10.91822266  4.91519779  6.21893339  7.28624021  7.45619255  8.07313221
  2.37523483  0.05689454  2.93245697  6.32390484  6.56635289  3.21225859
  5.95278023  2.95750777  1.79578565  5.93824639  5.25682568  6.31390585
  3.90603817  3.86599109]
[10.70538616  6.78424164  4.5524114   7.22046549  6.87038265  7.30067778
  1.6432539   0.22121259  6.16873487  5.43925362  5.94519746  3.1095561
  7.49192411  2.57835408  1.30874581  6.49050901  4.87694642  5.9446495
  2.94501352  4.05768644]
CPU times: total: 2.23 s
Wall time: 2.33 s


In [52]:
# Fresh, smaller sample with the same 80/20 split as before.
N = 250
y_data, d_data, x_data = get_data(N, rng)
(
    y_train, y_test,
    d_train, d_test,
    x_train, x_test,
) = train_test_split(
    y_data, d_data, x_data,
    test_size=0.2,
    random_state=42,
)

# Two calibrated linear SVCs that differ only in the calibration method:
# the CalibratedClassifierCV default versus method='isotonic'.
model_m = CalibratedClassifierCV(
    estimator=LinearSVC(dual='auto', max_iter=2500)
)
model_m_2 = CalibratedClassifierCV(
    estimator=LinearSVC(dual='auto', max_iter=2500),
    method='isotonic',
)

In [53]:
%%time
# Fit both calibrated classifiers and print their class-1 probabilities for the
# first 20 test rows (default calibration first, then isotonic).
for clf in (model_m, model_m_2):
    clf.fit(x_train, d_train)
    print(clf.predict_proba(x_test)[:20, 1])

# Reference values from m_0 (imported from data_generation; presumably the
# true propensity score -- TODO confirm).
print(m_0(x_test[:20]))

[0.75782366 0.1081392  0.4380588  0.13226565 0.22477492 0.32252746
 0.52142833 0.55220563 0.3364116  0.61823012 0.76432348 0.29846205
 0.46542094 0.15033277 0.65025305 0.4393908  0.85164284 0.57002081
 0.63447986 0.04004093]
[0.93333333 0.         0.4512605  0.         0.09203475 0.30238095
 0.4512605  0.4912605  0.33952381 0.67239993 0.93333333 0.27507003
 0.4912605  0.05714286 0.72466527 0.4512605  0.96012949 0.54226408
 0.62380952 0.        ]
[0.8485524  0.09058125 0.30348166 0.02835181 0.13868599 0.39836533
 0.46815584 0.49492956 0.23402002 0.82336364 0.70270092 0.35112246
 0.48414538 0.03198229 0.78706381 0.32661086 0.83445733 0.65141855
 0.6412689  0.00732812]
CPU times: total: 109 ms
Wall time: 115 ms


In [None]:
# Hyperparameters to tune: C, max_iter

In [None]:
# Coarse candidate grid for the SVM parameter C.
param_grid = dict(C=[0.1, 1, 10, 100])

In [255]:
%%time
# Large-sample run: linear SVR outcome models and a calibrated linear SVC,
# all with C=5, passed to dml_ate with 5 folds.
N = 16000
y_data, d_data, x_data = get_data(N, rng)

model_g0 = LinearSVR(C=5, dual='auto', max_iter=5000, random_state=42)
model_g1 = LinearSVR(C=5, dual='auto', max_iter=5000, random_state=42)
model_g = [model_g0, model_g1]

# optional: n_jobs=-1 on CalibratedClassifierCV
model_m = CalibratedClassifierCV(
    estimator=LinearSVC(C=5, dual='auto', max_iter=5000, random_state=42)
)

# Kept as the last expression so the notebook displays the returned value.
dml_ate(
    y_data, d_data, x_data,
    model_g, model_m,
    K=5, classical=True, inference=True, alpha=0.05,
)

CPU times: total: 3.02 s
Wall time: 3 s


(array([0.51908536, 0.55552273, 0.45258778]),
 5.451014995427961,
 array([0.43462256, 0.60354816]))

In [46]:
%%time
# NOTE: y_data, d_data and x_data are not created here; this cell uses whatever
# an earlier executed cell left in the kernel.
model_g0 = LinearSVR(dual='auto', max_iter=2500, random_state=42)
model_g1 = LinearSVR(dual='auto', max_iter=2500, random_state=42)
model_g = [model_g0, model_g1]

# Treatment model: SVC with built-in probability estimates.
model_m = SVC(probability=True, random_state=42)

# Kept as the last expression so the notebook displays the returned value.
dml_ate(
    y_data, d_data, x_data,
    model_g, model_m,
    K=5, classical=True, inference=True, alpha=0.05,
)

CPU times: total: 984 ms
Wall time: 986 ms


(array([0.5423967 , 0.64183653, 0.10269771]),
 4.224612415082437,
 array([0.28055733, 0.80423608]))

In [193]:
def svm_cv(y_data, d_data, x_data, cv=5, param_grid=None):
    """Grid-search SVM hyperparameters for the outcome and treatment models.

    Three cross-validated grid searches are run:

    * a ``LinearSVR`` outcome regression on the rows with ``d_data == 0``,
    * a ``LinearSVR`` outcome regression on the rows with ``d_data == 1``,
    * an ``SVC(kernel='linear', probability=True)`` classifier of ``d_data``
      on all rows.

    Parameters
    ----------
    y_data : array
        Outcome values; must support boolean-mask indexing.
    d_data : array
        Treatment indicator, compared against 0 and 1 to select the arms.
    x_data : array
        Covariates; must support boolean-mask indexing along the first axis.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``GridSearchCV``.
    param_grid : dict, optional
        Grid forwarded to every ``GridSearchCV``. Defaults to
        ``{'C': [0.1, 1, 3, 10, 5]}``, the grid that used to be hard-coded.

    Returns
    -------
    dict
        Keys ``'g0'``, ``'g1'`` and ``'m'``, each mapping to the
        ``best_params_`` dict of the corresponding search.
    """
    if param_grid is None:
        # Same values and order as the previously hard-coded grid.
        param_grid = {'C': [0.1, 1, 3, 10, 5]}

    svm_params_dict = {}

    # Outcome regressions: one search per treatment arm, scored by MSE.
    for d in [0, 1]:
        arm = d_data == d  # build the arm mask once, use it for X and y
        grid_search_g = GridSearchCV(
            estimator=LinearSVR(dual='auto', max_iter=5000, random_state=42),
            param_grid=param_grid,
            cv=cv,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
        )
        grid_search_g.fit(X=x_data[arm], y=y_data[arm])
        svm_params_dict[f'g{d}'] = grid_search_g.best_params_

    # Treatment classifier on the full sample, scored by the Brier score,
    # which needs predict_proba and hence probability=True.
    # NOTE(review): the dml_ate cells in this notebook use
    # CalibratedClassifierCV(LinearSVC) or a default-kernel SVC for the
    # treatment model -- confirm that C tuned on this linear SVC carries over.
    grid_search_m = GridSearchCV(
        estimator=SVC(kernel='linear', probability=True, random_state=42),
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        scoring='neg_brier_score',
    )
    grid_search_m.fit(X=x_data, y=d_data)
    svm_params_dict['m'] = grid_search_m.best_params_

    return svm_params_dict

In [198]:
# Tune at increasing sample sizes and print the selected parameters for each.
# The larger sizes (2000, 4000, 8000, 16000) are currently switched off.
for N in (250, 500, 1000):
    y_data, d_data, x_data = get_data(N, rng)
    svm_params_dict = svm_cv(y_data, d_data, x_data)
    print(svm_params_dict)

{'g0': {'C': 10}, 'g1': {'C': 5}, 'm': {'C': 10}}
{'g0': {'C': 5}, 'g1': {'C': 3}, 'm': {'C': 10}}




{'g0': {'C': 10}, 'g1': {'C': 10}, 'm': {'C': 1}}


In [None]:
# Conclusion from the grid search above: choose C=5.