In [3]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from datamarket import *

In [4]:
df = pd.read_csv("3D_spatial_network.txt", names=['id', 'f1', 'f2', 'result'], header=None)

In [2]:
def feature_partition_bits(f, n):
    f1min = df[f].min()
    f1max = df[f].max()
    step = (f1max - f1min)/n

    f_bits = []
    left = f1min
    right = f1min + step
    for i in range(n):
        f_bits.append((df[f] >= left) & (df[f] <= right))
        right += step
        left += step
    return f_bits

In [5]:
# test data
f1_bits = feature_partition_bits("f1",8)
f2_bits = feature_partition_bits("f2",8)
test = df[f1_bits[0] & f2_bits[0]].sample(n=2000, random_state=5)
testdata = agg_dataset()
testdata.load(test.copy(), ["f1","f2","result"], [], "")
testdata.compute_agg(True)
testdata.covariance

cov:s::f1                   8.300611
cov:s::f2                  56.656966
cov:s::result               9.060399
cov:c                       1.000000
cov:Q::f1,:f1              68.916825
cov:Q::f1,:f2             470.289354
cov:Q::f1,:result          76.085592
cov:Q::f2,:f2            3210.013646
cov:Q::f2,:result         513.567046
cov:Q::result,:result     241.062451
dtype: float64

In [35]:
f1_bits = feature_partition_bits("f1",8)
f2_bits = feature_partition_bits("f2",8)
sellers = []
partition = []
for i in range(len(f1_bits)):
    for j in range(len(f2_bits)):
        bit1 = f1_bits[i]
        bit2 = f2_bits[j]
        p = df[bit1 & bit2]
        if len(p) == 0:
            continue
        partition.append(p.copy())
        aggdata = agg_dataset()
        aggdata.load(p, ["f1","f2","result"], [], "")
        aggdata.compute_agg(True)
        sellers.append((aggdata, str(i) + "," + str(j)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['cov:c'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['cov:Q:' + tablename + ":" + attributes[i] + ","+ tablename + ":" + attributes[j]] = self.data[attributes[i]] * self.data[attributes[j]]


In [36]:
%%time
## our factorized
bought = set()
# bought.add(0)
# cur_cov = sellers[0][0].covariance.copy()
cur_cov = None
for _ in range(1):
    best_id = 0
    best_r2 = -100
    for i in range(len(sellers)):
        if i in bought:
            continue
        seller, name = sellers[i]
        sellerc = seller.covariance.copy()
        if cur_cov is not None:
            sellerc += cur_cov
        parameter = linear_regression(sellerc,[":f1",":f2"], ":result")
        curr2 = r2(testdata.covariance, [":f1",":f2"], ":result", parameter)
#         print(curr2,name)
        if curr2 > best_r2:
            best_r2, best_id = curr2, i
            
    print(best_id)
    bought.add(best_id)
    if cur_cov is not None:
        cur_cov += sellers[i][0].covariance
    else:
        cur_cov = sellers[i][0].covariance.copy()

0
CPU times: user 9.36 ms, sys: 250 µs, total: 9.61 ms
Wall time: 8.41 ms


In [86]:
%%time
# acquisition
current = partition[0].sample(n=10)
current["label"] = 1
bought = set()
for _ in range(1):
    best_id = 0
    best_r2 = -100
    for i in range(len(partition)):
        if i in bought:
            continue
        new = partition[i].copy()
        new["label"] = 0
        train = pd.concat([current, new])
        X_train = train[['id', 'f1', 'f2','result']]
        y_train = train['label']
        neigh = KNeighborsClassifier(n_neighbors=3)
        neigh.fit(X_train, y_train)
        curr2 = neigh.score(X_train, y_train)
        print(curr2, i)
        if curr2 > best_r2:
            best_r2, best_id = curr2, i

    print(best_id)
    bought.add(best_id)
    new = partition[best_id].copy()
    new["label"] = 0
    current = pd.concat([current, new])

0.9979083873666597 0
0.9987295134036336 1
0.998831229546517 2
0.9921259842519685 3
0.9985513544835579 4
0.9993167843315873 5
0.9993109151047409 6
0.9989326375711575 7
0.9990344694409578 8
0.9989254244573393 9
0.9983385944509054 10
0.9991264849755416 11
0.9992252266212133 12
0.9991500934897162 13
0.9990186457311089 14
0.9992229992229992 15
0.9992097984986171 16
0.9829545454545454 17
0.999155476733384 18
0.9993137994922117 19
0.9996563455788858 20
0.9997147177131772 21
0.9991472427515634 22
0.9993150215768203 23
0.9994637207057435 24
0.9986796936889358 25
0.9988795518207283 26
0.998903388529444 27
0.9991450068399452 28
0.9993948562783661 29
0.9993651196749412 30
0.9994915857440643 31
0.9980447534216815 32
0.9969731365872115 33
0.9994384546271339 34
0.9982638888888888 35
0.9989467524868344 36
0.9991928044280443 37
0.995937711577522 38
21
CPU times: user 9.72 s, sys: 15.8 ms, total: 9.74 s
Wall time: 9.72 s


In [70]:
# build train from bought
train = pd.concat([partition[i] for i in bought])
train

Unnamed: 0,id,f1,f2,result
3897,102587265,8.523908,56.645724,22.494156
3898,102587265,8.523499,56.646003,20.712217
4056,102587286,8.523766,56.646918,19.912918
4057,102587286,8.523638,56.646542,20.189207
4058,102587286,8.523524,56.646492,20.155700
...,...,...,...,...
434005,126527872,8.173375,56.630836,1.764131
434006,126527872,8.172764,56.630902,1.338380
434039,126527886,8.177638,56.622903,1.355110
434040,126527886,8.177341,56.622909,1.438666


In [38]:
time = 2*60

In [87]:
# model performance with autoML 
X_train = train[['id', 'f1', 'f2']]
y_train = train['result']
X_test = test[['id', 'f1', 'f2']]
y_test = test['result']

automl = autosklearn.regression.AutoSklearnRegressor(
# automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=int(time),
    per_run_time_limit=int(time/2),
    memory_limit=23072
#     ,metric=autosklearn.metrics.r2
)

automl.fit(X_train, y_train, X_test, y_test, dataset_name='buyer')



AutoSklearnRegressor(memory_limit=23072, per_run_time_limit=60,
                     time_left_for_this_task=120)

In [88]:
train_predictions = automl.predict(X_train)
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
test_predictions = automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

Train R2 score: 0.7734465131487456
Test R2 score: -0.23158420316571227


In [6]:
train_predictions = automl.predict(X_train)
print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
test_predictions = automl.predict(X_test)
print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

Train R2 score: 0.9947082885941484
Test R2 score: 0.9945875449332469
