In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.sparse import csc_matrix
import gc
from sklearn.cluster import KMeans, MiniBatchKMeans
from MulticoreTSNE import MulticoreTSNE as TSNE

In [3]:
def click_counts(df):
    n_samples = df.shape[0]
    
    # count clicks by ['ip', 'device', 'os']
    click_counts_per_hour = df.groupby(
        by=['ip', 'device', 'os', 'day', 'hour']
    ).size()
    
    # to dataframe
    click_counts_per_hour = pd.DataFrame(
        click_counts_per_hour, columns=['count']
    ).reset_index()

    # label encode: (day, hour)
    click_counts_per_hour['period'] = (
        list(zip(click_counts_per_hour.day, click_counts_per_hour.hour))
    )
    click_counts_per_hour['period'] = np.unique(
        click_counts_per_hour['period'], return_inverse=True
    )[1]
    
    click_features = csc_matrix(
        (click_counts_per_hour['count'].values,
         (np.arange(click_counts_per_hour.shape[0]), 
          click_counts_per_hour.period.values))
    )

    n_periods = click_counts_per_hour.period.max() + 1
    click_feature_df = pd.DataFrame(
        click_features.toarray(),
        columns=['period_{}'.format(i) for i in range(n_periods)]
    )
    
    click_feature_df = (
        pd.concat(
            [click_counts_per_hour, click_feature_df], axis=1
        )
        .groupby(by=['ip', 'device', 'os'])
        .sum()
        .drop(['day', 'hour', 'count', 'period'], axis=1)
    )
    
    # df.shape[0] != click_feature_df.shape[0]
    return click_feature_df

In [4]:
train_df = pd.read_hdf('../input/X_train_add_supplement.h5', 'table')
test_df  = pd.read_hdf('../input/X_test_add_supplement.h5',  'table')

df = train_df.append(test_df)

del test_df
gc.collect()

22

In [5]:
%%time
click_feature_df = click_counts(df)

CPU times: user 6min 43s, sys: 7min 14s, total: 13min 57s
Wall time: 7min 24s


In [None]:
%%time
kmeans = KMeans(
    n_clusters=10, 
    max_iter=100000,
    verbose=True,
    n_jobs=-1,
    random_state=0)
kmeans.fit(click_feature_df)

Initialization complete
start iteration
done sorting
end inner loop
Initialization complete
Iteration 0, inertia 5868536171.948034
start iteration
done sorting
end inner loop
start iteration
done sorting
end inner loop
Initialization complete
Iteration 1, inertia 5614545932.335568
start iteration
done sorting
end inner loop
Iteration 0, inertia 5829025062.427956
start iteration
done sorting
start iteration
done sorting
end inner loop
end inner loop
Initialization complete
Iteration 2, inertia 5574000152.867366
start iteration
done sorting
end inner loop
start iteration
done sorting
end inner loop
Iteration 1, inertia 5601959667.870991
start iteration
done sorting
Iteration 0, inertia 6228595340.588758
start iteration
done sorting
end inner loop
end inner loop
Initialization complete
Iteration 3, inertia 5559330578.528966
start iteration
done sorting
end inner loop
Iteration 0, inertia 5845643678.270263
start iteration
done sorting
end inner loop
start iteration
done sorting
end inner l

done sorting
end inner loop
Iteration 13, inertia 5551121364.262697
center shift 4.758146e-03 within tolerance 6.626910e-03
Iteration 10, inertia 5466795397.535383
start iteration
done sorting
end inner loop
Iteration 5, inertia 5560074354.087405
start iteration
done sorting
end inner loop
Iteration 11, inertia 5644502206.908624
start iteration
done sorting
end inner loop
Iteration 6, inertia 5525894386.535979
start iteration
done sorting
end inner loop
Iteration 7, inertia 5511374067.535176
start iteration
done sorting
Iteration 12, inertia 5516593503.390648
start iteration
done sorting
end inner loop
end inner loop
Iteration 8, inertia 5441801532.863988
start iteration
done sorting
end inner loop
Iteration 10, inertia 5612342151.1117935
start iteration
done sorting
end inner loop
Iteration 9, inertia 5506518578.963912
start iteration
done sorting
end inner loop
Iteration 11, inertia 5465764369.223319
start iteration
done sorting
end inner loop
Iteration 6, inertia 5556023212.394148
s

start iteration
done sorting
end inner loop
end inner loop
Iteration 18, inertia 5438975245.156374
start iteration
done sorting
Iteration 22, inertia 5642526012.254283
start iteration
done sorting
end inner loop
Iteration 21, inertia 5461437912.857111
start iteration
done sorting
end inner loop
end inner loop
Iteration 19, inertia 5502880865.678667
start iteration
done sorting
end inner loop
Iteration 16, inertia 5531155323.215569
start iteration
done sorting
end inner loop
Iteration 17, inertia 5489766700.506715
start iteration
done sorting
end inner loop
Iteration 18, inertia 5498889020.773729
start iteration
done sorting
Iteration 21, inertia 5608160729.105337
start iteration
done sorting
end inner loop
end inner loop
Iteration 23, inertia 5513747469.797394
start iteration
done sorting
Iteration 23, inertia 5642483425.033106
start iteration
done sorting
end inner loop
end inner loop
Iteration 19, inertia 5438908477.39936
start iteration
done sorting
Iteration 22, inertia 5461299420.

start iteration
done sorting
end inner loop
end inner loop
Iteration 33, inertia 5508417550.450853
start iteration
done sorting
end inner loop
Iteration 29, inertia 5438536046.463227
start iteration
done sorting
Iteration 32, inertia 5460438144.50208
start iteration
done sorting
end inner loop
end inner loop
Iteration 28, inertia 5469913746.170865
start iteration
done sorting
end inner loop
Iteration 30, inertia 5501398084.630771
start iteration
done sorting
end inner loop
Iteration 27, inertia 5530810504.197149
start iteration
done sorting
end inner loop
Iteration 32, inertia 5607655893.629024
start iteration
done sorting
end inner loop
Iteration 28, inertia 5465809193.866609
start iteration
done sorting
Iteration 34, inertia 5642351651.184437
start iteration
done sorting
Iteration 34, inertia 5507829120.880835
start iteration
done sorting
end inner loop
end inner loop
end inner loop
Iteration 33, inertia 5460401626.202297
start iteration
done sorting
end inner loop
Iteration 30, iner

Iteration 43, inertia 5460220281.50052
start iteration
done sorting
Iteration 42, inertia 5607614396.997421
start iteration
done sorting
end inner loop
end inner loop
Iteration 40, inertia 5438463700.963422
start iteration
done sorting
Iteration 38, inertia 5461452069.799151
start iteration
done sorting
end inner loop
Iteration 41, inertia 5501324251.198929
start iteration
done sorting
end inner loop
end inner loop
Iteration 44, inertia 5499181695.174289
start iteration
done sorting
end inner loop
Iteration 39, inertia 5463647848.054809
start iteration
done sorting
end inner loop
Iteration 45, inertia 5642338587.492973
start iteration
done sorting
end inner loop
Iteration 38, inertia 5530597114.179378
start iteration
done sorting
Iteration 43, inertia 5607614140.459546
start iteration
done sorting
end inner loop
end inner loop
Iteration 44, inertia 5460210642.550228
start iteration
done sorting
end inner loop
Iteration 39, inertia 5460806003.053927
start iteration
done sorting
Iteratio

start iteration
done sorting
end inner loop
Iteration 56, inertia 5642310523.862844
start iteration
done sorting
Iteration 49, inertia 5530576900.859292
start iteration
done sorting
end inner loop
end inner loop
Iteration 55, inertia 5460184359.307229
center shift 5.663580e-02 within tolerance 6.626910e-03
Iteration 52, inertia 5438422550.875961
start iteration
done sorting
Iteration 56, inertia 5483913392.133927
start iteration
done sorting
end inner loop
Iteration 50, inertia 5458905549.10889
start iteration
done sorting
end inner loop
end inner loop
Iteration 51, inertia 5459173401.56727
start iteration
done sorting
end inner loop
Iteration 54, inertia 5501300392.385991
start iteration
done sorting
Iteration 50, inertia 5530576589.256201
start iteration
done sorting
end inner loop
Iteration 53, inertia 5438421120.7804785
start iteration
done sorting
end inner loop
Iteration 57, inertia 5642302248.075179
start iteration
done sorting
end inner loop
Iteration 51, inertia 5458717148.361

Process ForkPoolWorker-28:
Process ForkPoolWorker-15:
Process ForkPoolWorker-27:
Process ForkPoolWorker-20:
Process ForkPoolWorker-22:
Process ForkPoolWorker-19:
Process ForkPoolWorker-16:
Process ForkPoolWorker-17:
Process ForkPoolWorker-24:
Process ForkPoolWorker-25:
Process ForkPoolWorker-29:
Process ForkPoolWorker-26:
Process ForkPoolWorker-12:
Process ForkPoolWorker-21:
Process ForkPoolWorker-31:
Process ForkPoolWorker-32:
Process ForkPoolWorker-30:
Process ForkPoolWorker-13:
Process ForkPoolWorker-1:
Process ForkPoolWorker-11:
Process ForkPoolWorker-14:
Process ForkPoolWorker-18:
Process ForkPoolWorker-4:
Process ForkPoolWorker-23:
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (mo

  File "/home/g8da_tetsuya/miniconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/g8da_tetsuya/miniconda3/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
  File "/home/g8da_tetsuya/miniconda3/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
  File "/home/g8da_tetsuya/miniconda3/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
  File "/home/g8da_tetsuya/miniconda3/lib/python3.6/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
  File "/home/g8da_tetsuya/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
KeyboardInterrupt
  File "/home/g8da_tetsuya/miniconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
KeyboardInterrupt
  File "/home/g8da_tetsuya/miniconda3/lib/python3.6/site-packages/sklearn/external