In [1]:
import pickle
import pandas as pd
import numpy as np
import gc; gc.enable()
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack, load_npz

In [2]:
'''
with open('all_features.pickle', 'rb') as handle:
    all_features = pickle.load(handle)
'''      
all_features = load_npz('all_features_tfidf_wordbin.npz')

In [3]:
train_y = pd.read_csv("regression_target.csv")
train_y = train_y['deal_probability'].values
train_len = train_y.shape[0]
print(train_len)

1503424


In [4]:
train_features = all_features[:train_len, :]
del all_features; gc.collect()

0

In [5]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, GridSearchCV, ParameterGrid, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from copy import deepcopy as cp
from wordbatch.models import FM_FTRL

In [6]:
def clip_rmse(ground_truth, predictions):
    predictions = np.clip(predictions, 0., 1.)
    return mean_squared_error(ground_truth, predictions)**.5

clip_rmse_scorer = make_scorer(clip_rmse, greater_is_better=False)

In [7]:
def simple_train_val_split_eval(default_params, X, y, params, partial_sample=False):
    train_len = int(X.shape[0]*.7)
    tr_X, tr_y = X[:train_len, :], y[:train_len]
    val_X, val_y = X[train_len:], y[train_len:]
    
    if partial_sample is True:
        tr_ix = np.random.choice(tr_X.shape[0], tr_X.shape[0]//3, replace=False)
        val_ix = np.random.choice(val_X.shape[0], val_X.shape[0]//3, replace=False)
        tr_X, tr_y = tr_X[tr_ix,:], tr_y[tr_ix]
        val_X, val_y = val_X[val_ix,:], val_y[val_ix]
        print('Selected hyper-tune data size', tr_X.shape, val_X.shape)
        
    min_score = None
    best_param = None
    for param in list(ParameterGrid(params)):
        use_params = cp(default_params)
        use_params.update(param)
        print('Fitting params:\n', use_params)
        md = FM_FTRL(**use_params)
        md.fit(tr_X, tr_y)
        score = clip_rmse(val_y, md.predict(val_X))
        print(param, score)
        
        if min_score is None or score < min_score:
            best_param = param
            min_score = score
            
    print('Best param:', best_param, '\nscore:', min_score)

In [8]:
fmftrl_default_params = {
    'alpha': .01,
    'beta': .005,
    'L1': 0.001,
    'L2': 0.001,
    'D': train_features.shape[1],
    'D_fm': 300,
    'iters': 10,
    'seed': 719,
    'threads': 4,
    'verbose': 1
}

try_params = {
        'alpha': [.01, .001]
    }
simple_train_val_split_eval(fmftrl_default_params, train_features, train_y, try_params, partial_sample=False)

Fitting params:
 {'alpha': 0.01, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 636327.004940277
Total e: 235132.94028387446
Total e: 175332.5894421764
Total e: 154481.12244606065
Total e: 141686.70929741857
Total e: 132557.69075082542
Total e: 125267.06776935364
Total e: 119167.5663631571
Total e: 114086.10989210679
Total e: 109717.29289420268
{'alpha': 0.01} 0.242059566817
Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 458183.9442180746
Total e: 204431.8109849891
Total e: 159945.88394221477
Total e: 143303.8425522989
Total e: 132892.97345545428
Total e: 125030.5741896349
Total e: 118894.94277172902
Total e: 113809.93202282806
Total e: 109348.53509172547
Total e: 105442.78278070014
{'alpha': 0.001} 0.236284889637
Best param: {'alpha': 0.001} 
score: 0.236284889637


In [9]:
fmftrl_default_params = {
    'alpha': .01,
    'beta': .005,
    'L1': 0.001,
    'L2': 0.001,
    'D': train_features.shape[1],
    'D_fm': 300,
    'iters': 10,
    'seed': 719,
    'threads': 4,
    'verbose': 1
}

try_params = {
        'alpha': [.0001, .00001]
    }
simple_train_val_split_eval(fmftrl_default_params, train_features, train_y, try_params, partial_sample=False)

Fitting params:
 {'alpha': 0.0001, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 575201.5954545944
Total e: 217213.0746967959
Total e: 166083.68505608293
Total e: 148567.08432699947
Total e: 137160.9652003984
Total e: 128751.38361592202
Total e: 122325.73253314437
Total e: 117243.10719233766
Total e: 112736.7382568866
Total e: 108774.81372474357
{'alpha': 0.0001} 0.237906822074
Fitting params:
 {'alpha': 1e-05, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 393384.6140814069
Total e: 194937.89528567882
Total e: 154475.16403716442
Total e: 139292.78641304455
Total e: 129583.86787521164
Total e: 122273.93124948024
Total e: 116762.9975703149
Total e: 112058.26814575402
Total e: 108027.80971623589
Total e: 104452.73419492895
{'alpha': 1e-05} 0.236402466435
Best param: {'alpha': 1e-05} 
score: 0.236402466435


In [10]:
fmftrl_default_params = {
    'alpha': .001,
    'beta': .005,
    'L1': 0.001,
    'L2': 0.001,
    'D': train_features.shape[1],
    'D_fm': 300,
    'iters': 10,
    'seed': 719,
    'threads': 4,
    'verbose': 1
}
try_params = {
        'beta': [.005, .001]
    }
simple_train_val_split_eval(fmftrl_default_params, train_features, train_y, try_params, partial_sample=False)

Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 458183.9442180746
Total e: 204431.8109849891
Total e: 159945.88394221477
Total e: 143303.8425522989
Total e: 132892.97345545428
Total e: 125030.5741896349
Total e: 118894.94277172902
Total e: 113809.93202282806
Total e: 109348.53509172547
Total e: 105442.78278070014
{'beta': 0.005} 0.236284889637
Fitting params:
 {'alpha': 0.001, 'beta': 0.001, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 516337.33213770576
Total e: 210757.46772281724
Total e: 162959.07636385184
Total e: 145946.5412742552
Total e: 134972.4171264977
Total e: 126787.20994988024
Total e: 120429.86115444085
Total e: 115128.97660252305
Total e: 110597.82720057227
Total e: 106527.59171322595
{'beta': 0.001} 0.23694130465
Best param: {'beta': 0.005} 
score: 0.236284889637


In [8]:
fmftrl_default_params = {
    'alpha': .001,
    'beta': .005,
    'L1': 0.001,
    'L2': 0.001,
    'D': train_features.shape[1],
    'D_fm': 300,
    'iters': 10,
    'seed': 719,
    'threads': 4,
    'verbose': 1
}
try_params = {
        'L1': [.01, .001, .0001,],
        'L2': [.1, .01, .001,]
    }
simple_train_val_split_eval(fmftrl_default_params, train_features, train_y, try_params, partial_sample=False)

Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.01, 'L2': 0.1, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 585943.5553826669
Total e: 221108.4109844882
Total e: 168785.6758551783
Total e: 150331.50321681742
Total e: 138481.93554321054
Total e: 129818.11890855429
Total e: 123191.50896594283
Total e: 117727.62333249913
Total e: 113238.80704869708
Total e: 108889.29237727553
{'L1': 0.01, 'L2': 0.1} 0.238470949659
Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.01, 'L2': 0.01, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 583844.6480165233
Total e: 221075.8833842148
Total e: 168537.57221193553
Total e: 150209.55612816263
Total e: 138274.52868723293
Total e: 129753.91199681953
Total e: 123047.67309904358
Total e: 117668.41668663583
Total e: 113023.55245148033
Total e: 108784.02926847078
{'L1': 0.01, 'L2': 0.01} 0.237486869195
Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.01, 

In [9]:
fmftrl_default_params = {
    'alpha': .001,
    'beta': .005,
    'L1': 0.001,
    'L2': 0.001,
    'D': train_features.shape[1],
    'D_fm': 300,
    'iters': 10,
    'seed': 719,
    'threads': 4,
    'verbose': 1
}
try_params = {
        'D_fm': [100, 200, 300, 600],
        'iters': [10, 20, 40, 60]
    }
simple_train_val_split_eval(fmftrl_default_params, train_features, train_y, try_params, partial_sample=False)

Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 100, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 383686.4933770968
Total e: 182114.35886245815
Total e: 156496.18344336632
Total e: 145423.1448625286
Total e: 138071.98041545044
Total e: 132380.21303639593
Total e: 127692.83976819918
Total e: 123574.01848601697
Total e: 120047.78101599037
Total e: 116755.17674463452
{'D_fm': 100, 'iters': 10} 0.235219455787
Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 100, 'iters': 20, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 383686.4933770968
Total e: 182114.35886245815
Total e: 156496.18344336632
Total e: 145423.1448625286
Total e: 138071.98041545044
Total e: 132380.21303639593
Total e: 127692.83976819918
Total e: 123574.01848601697
Total e: 120047.78101599037
Total e: 116755.17674463452
Total e: 113707.56551620738
Total e: 110805.7369277009
Total e: 108180.05761175629
Total e: 

Total e: 56439.712197594024
Total e: 55751.024073776374
Total e: 55012.47971945498
Total e: 54335.90902143959
Total e: 53728.16052464922
Total e: 53107.54286517698
Total e: 52475.91369772498
Total e: 51909.86834737266
Total e: 51332.378539334066
Total e: 50708.182263945964
Total e: 50183.053396849486
Total e: 49672.184807908954
Total e: 49162.19668034911
Total e: 48646.13172789874
Total e: 48171.081057463605
{'D_fm': 200, 'iters': 60} 0.254522040416
Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001, 'D': 132042, 'D_fm': 300, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 458183.9442180746
Total e: 204431.8109849891
Total e: 159945.88394221477
Total e: 143303.8425522989
Total e: 132892.97345545428
Total e: 125030.5741896349
Total e: 118894.94277172902
Total e: 113809.93202282806
Total e: 109348.53509172547
Total e: 105442.78278070014
{'D_fm': 300, 'iters': 10} 0.236284889637
Fitting params:
 {'alpha': 0.001, 'beta': 0.005, 'L1': 0.001, 'L2': 0.001

Total e: 65764.8163699461
Total e: 64502.15991516608
Total e: 63313.74953228273
Total e: 62207.229188442565
Total e: 61058.9881644795
Total e: 60057.821648752535
Total e: 59019.918290068716
Total e: 58106.86585628787
Total e: 57246.614607342985
Total e: 56422.87718477387
Total e: 55561.46918776913
Total e: 54599.98714012368
Total e: 53845.494669290354
Total e: 52936.51792621544
Total e: 52322.607614424895
Total e: 51573.63402602079
Total e: 50878.03959755323
Total e: 50216.16451586206
Total e: 49601.12488648208
Total e: 48930.57163374158
Total e: 48274.80607409874
Total e: 47600.47217544711
Total e: 47079.065888869554
Total e: 46498.62746549279
Total e: 46034.594117500514
Total e: 45397.22482921833
Total e: 44939.65229738757
Total e: 44455.38410434633
Total e: 44016.51030381474
Total e: 43490.33776420381
Total e: 43140.42996019797
Total e: 42654.58855982141
{'D_fm': 600, 'iters': 60} 0.250009901156
Best param: {'D_fm': 100, 'iters': 10} 
score: 0.235219455787


In [14]:
fmftrl_default_params = {
    'alpha': .01,
    'beta': .0001,
    'L1': 0.01,
    'L2': 0.1,
    'D': train_features.shape[1],
    'D_fm': 200,
    'iters': 10,
    'seed': 719,
    'threads': 4,
    'verbose': 1
}
try_params = {
        'iters': [5, 10, 15, 20, 40, 60]
    }
simple_train_val_split_eval(fmftrl_default_params, train_features, train_y, try_params, partial_sample=False)

Fitting params:
 {'alpha': 0.01, 'beta': 0.0001, 'L1': 0.01, 'L2': 0.1, 'D': 290474, 'D_fm': 200, 'iters': 5, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 163029.8667666383
Total e: 157443.96998070326
Total e: 155538.34910638587
Total e: 154218.02366526844
Total e: 153172.2825135037
{'iters': 5} 0.225058620947
Fitting params:
 {'alpha': 0.01, 'beta': 0.0001, 'L1': 0.01, 'L2': 0.1, 'D': 290474, 'D_fm': 200, 'iters': 10, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 163029.8667666383
Total e: 157443.96998070326
Total e: 155538.34910638587
Total e: 154218.02366526844
Total e: 153172.2825135037
Total e: 152288.66449432465
Total e: 151513.26701948917
Total e: 150815.27689637433
Total e: 150176.71307262758
Total e: 149584.32416828081
{'iters': 10} 0.224836844616
Fitting params:
 {'alpha': 0.01, 'beta': 0.0001, 'L1': 0.01, 'L2': 0.1, 'D': 290474, 'D_fm': 200, 'iters': 15, 'seed': 719, 'threads': 4, 'verbose': 1}
Total e: 163029.8667666383
Total e: 157443.96998070326
Total e: 155538.3

In [1]:
train_len = int(train_features.shape[0]*.7)
tr_X, tr_y = train_features[:train_len, :], train_y[:train_len]
val_X, val_y = train_features[train_len:], train_y[train_len:]

NameError: name 'train_features' is not defined

In [14]:
fmftrl_default_params = {
    'alpha': .01,
    'beta': .0001,
    'L1': 0.001,
    'L2': 0.01,
    'D': train_features.shape[1],
    'D_fm': 100,
    'iters': 10,
    'seed': 719,
    'threads': 4,
    'verbose': 1
}

md = FM_FTRL(**fmftrl_default_params)
md.fit(tr_X, tr_y)

Total e: 164785.8085042756
Total e: 158601.4623680105
Total e: 156963.69157599422
Total e: 155895.15532400517
Total e: 155042.48267711815
Total e: 154363.56260225087
Total e: 153770.30537769498
Total e: 153242.55270082917
Total e: 152748.40941903414
Total e: 152300.87144869703


<wordbatch.models.fm_ftrl.FM_FTRL at 0x68e3910>

In [15]:
pd.DataFrame(md.predict(val_X), columns=['fmftrl_pred']).to_csv('fmftrl_cv_charwb24_val_pred.csv', index=False)

In [16]:
del train_features, tr_X, val_X; gc.collect()

6

In [25]:
all_features = load_npz('all_features_cv_charwb24.npz')
train_len = train_y.shape[0]
test_features = all_features[train_len:, :]
del all_features; gc.collect()

278

In [26]:
test_features.shape[0]

508438

In [27]:
test_pred = md.predict(test_features)

In [28]:
pd.DataFrame(test_pred, columns=['fmftrl_pred']).to_csv('fmftrl_cv_charwb24_test_pred.csv', index=False)

In [29]:
test_df = pd.read_csv("data/test.csv", usecols=['item_id'])

In [31]:
pd.DataFrame(np.clip(test_pred,0,1), 
             index=test_df.item_id,
             columns=['deal_probability']).to_csv('fmftrl_cv_charwb24_submission.csv')