In [77]:
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

x_threshold = 0.025
y_threshold = 0.0125

# area 10km by 10 km is divided into grids of size 20x40
grid_size = 1.0
x_step = 0.5
y_step = 0.25

In [116]:
def prepare_data(df):
    """
    Feature engineering
    """

    minute = df.time % 60
    df['hour'] = df['time'].div(60).map(int)
    df.drop(['time'], axis=1, inplace=True)
    df['weekday'] = df['hour'].div(24).map(int)
    df['month'] = df['weekday'].div(30).map(int)
    df['year'] = (df['weekday'].div(365).map(int) + 1) * 10.0
    df['hour'] = ((df['hour'] % 24 + 1) + minute.div(60.0)) * 4.0
    df['weekday'] = (df['weekday'] % 7 + 1) * 3.0
    df['month'] = (df['month'] % 12 + 1) * 2.0
    df['accuracy'] = np.log10(df['accuracy']) * 10.0

    return df


def process_one_cell(df_train, df_test, th, x_min, y_min, x_max, y_max, method='rf'):
    """   
    Classification inside one grid cell.
    """

    x_min_th = x_min - x_threshold
    y_min_th = y_min - y_threshold
    x_max_th = x_max + x_threshold
    y_max_th = y_max + y_threshold

    # Working on df_train, getting few extra points outside this grid
    df_cell_train = df_train[(df_train['x'] >= x_min_th)
                             & (df_train['x'] <= x_max_th)
                             & (df_train['y'] >= y_min_th)
                             & (df_train['y'] <= y_max_th)]

    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    # Feature engineering on x and y for test
    df_cell_train.loc[:, 'x'] *= 500.0
    df_cell_train.loc[:, 'y'] *= 1000.0

    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test[(df_test['x'] >= x_min_th) & (df_test['x'] <= x_max_th) &
                           (df_test['y'] >= y_min_th) & (df_test['y'] <= y_max_th)]
    row_ids = df_cell_test.index
    # Feature engineering on x and y for test
    df_cell_test.loc[:, 'x'] *= 500.0
    df_cell_test.loc[:, 'y'] *= 1000.0

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.drop.values if mode=='test' else df_cell_test.drop(['place_id'], axis=1).values

    # Applying the classifier
    if method=='rf':
        clf = RandomForestClassifier(n_estimators=3, max_depth=None, n_jobs=-1, min_samples_split=4,
                                 random_state=0)
    elif method=='xgb':
        clf=XGBClassifier(learning_rate=0.04, n_estimators=10, objective='multi:softprob', max_depth=3, seed=0)
        
    elif method=='knn':
        def calculate_distance(distances):
            return distances ** -2

        
        numNeighbors=np.floor(np.sqrt(len(df_cell_train))/5.1282).astype(int)
#           numNeighbors=36
        clf=KNeighborsClassifier(n_neighbors=numNeighbors, weights=calculate_distance, metric='manhattan', n_jobs=-1)

    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    
    le_labels=np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    pred_labels = le.inverse_transform(le_labels)
    pred_confs=[y_pred[i,le_labels[i]] for i in xrange(len(y_pred))]

    return pred_labels, pred_confs, row_ids


def process_grid(df_train, df_test, th, method='rf', note=''):
    """
    Iterates over all grid cells, aggregates the results and makes the
    submission.
    """
    preds = np.zeros((df_test.shape[0], 3), dtype=int)
    confs = np.zeros((df_test.shape[0], 3), dtype=float)

    iterations_x = int(grid_size / x_step) # 20
    iterations_y = int(grid_size / y_step) # 40

    sTime=time.time()
    for i in range(iterations_x):
        print(i)
        x_min = x_step * i
        x_max = x_step * i + x_step
        x_min = round(x_min, 4)
        x_max = round(x_max, 4)
        if x_max == grid_size:
            x_max += 0.001

        for j in range(iterations_y):
            y_min = y_step * j
            y_max = y_step * j + y_step
            y_min = round(y_min, 4)
            y_max = round(y_max, 4)
            if y_max == grid_size:
                y_max += 0.001

            # Applying classifier to one grid cell
            pred_labels, pred_confs, row_ids = process_one_cell(df_train, df_test, th, x_min, y_min, x_max, y_max, method=method)

            # Updating predictions
            preds[row_ids] = pred_labels
            confs[row_ids]=pred_confs
            

            
        print time.time()-sTime
        sTime=time.time()
    if mode=='test':
        print 'Generating submission files'
        # Auxiliary dataframe with the 3 best predictions for each sample
        df_aux = pd.DataFrame(preds, dtype=str, columns=['pred0', 'pred1', 'pred2'])

        # Concatenating the 3 predictions for each sample
        ds_sub = df_aux.pred0.str.cat([df_aux.pred1, df_aux.pred2], sep=' ')

        # Writting to csv
        ds_sub.name = 'place_id'
        resultFile=time.strftime('%c')+'-'+method+'-'+note
        ds_sub.to_csv(resultFile+'submit.csv', index=True, header=True, index_label='row_id')

        df_confs = pd.DataFrame(confs, columns=['conf0', 'conf1', 'conf2'])
        df_confs = pd.concat([df_aux, df_confs], axis=1)  
        df_confs.to_csv(resultFile+'confidence.csv', index=True, index_label='row_id')
    elif mode=='valid':
        print 'Generating validation file'

In [113]:
print('Loading data')
df_train = pd.read_csv('../input/train.csv',
                       usecols=['row_id', 'x', 'y', 'accuracy', 'time', 'place_id'],
                       index_col=0)
df_test = pd.read_csv('../input/test.csv',
                      usecols=['row_id', 'x', 'y', 'accuracy', 'time'],
                      index_col=0)

div=int(0.7*len(df_train))
df_validation_train=df_train[:div]
df_validation_test=df_train[div:]
df_validation_test.reset_index(inplace=True)

Loading data


In [119]:
df_validation_test.head()
df_validation_train.head()

Unnamed: 0_level_0,x,y,accuracy,time,place_id
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.7941,9.0809,54,470702,8523065625
1,5.9567,4.7968,13,186555,1757726713
2,8.3078,7.0407,74,322648,1137537235
3,7.3665,2.5165,65,704587,6567393236
4,4.0961,1.1307,31,472130,7440663949


In [114]:
print('Preparing train data')
df_train = prepare_data(df_train)
print(df_train.shape)
# add data for periodic time that hit the boundary
pd.options.mode.chained_assignment = None
add_data = df_train[df_train.hour < 6]
add_data.hour += 96
df_train = df_train.append(add_data)

add_data = df_train[df_train.hour > 98]
add_data.hour -= 96
df_train = df_train.append(add_data)

print(df_train.shape)
print('Preparing test data')
df_test = prepare_data(df_test)

Preparing train data
(29118021, 8)
(30916803, 8)
Preparing test data


In [117]:
# Solving classification problems inside each grid cell
th = 8  # Keeping place_ids with more than th samples.
mode='valid'
method='knn'
note=''


if mode=='test':
    process_grid(df_train, df_test, th, method=method)
if mode=='valid':
    process_grid(df_validation_train, df_validation_test, th, method=method, note=note)

0


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    157     pkg_name = mod_name.rpartition('.')[0]
    158     main_globals = sys.modules["__main__"].__dict__
    159     if alter_argv:
    160         sys.argv[0] = fname
    161     return _run_code(code, main_globals, None,
--> 162                      "__main__", fname, loader, pkg_name)
        fname = '/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    163 
    164 def run_module(mod_name, init_globals=None,
    165                run_name=None, alter_sys=False):
    166     """Execute a module's code without importing it

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x7f7b102b0eb0, file "/...2.7/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/home/zhenji...python2.7/site-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x7f7b102b0eb0, file "/...2.7/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/home/zhenji...python2.7/site-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    591         
    592         If a global instance already exists, this reinitializes and starts it
    593         """
    594         app = cls.instance(**kwargs)
    595         app.initialize(argv)
--> 596         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    597 
    598 #-----------------------------------------------------------------------------
    599 # utility functions, for convenience
    600 #-----------------------------------------------------------------------------

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    437         
    438         if self.poller is not None:
    439             self.poller.start()
    440         self.kernel.start()
    441         try:
--> 442             ioloop.IOLoop.instance().start()
    443         except KeyboardInterrupt:
    444             pass
    445 
    446 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {u'allow_stdin': True, u'code': u"# Solving classification problems inside each ...f_validation_test, th, method=method, note=note)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-06-29T01:11:48.983282', u'msg_id': u'EE7A3F99F75F431285610E9F8D679F58', u'msg_type': u'execute_request', u'session': u'A56F275B17704CA8AE2106EF95721D45', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'EE7A3F99F75F431285610E9F8D679F58', 'msg_type': u'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['A56F275B17704CA8AE2106EF95721D45']
        msg = {'buffers': [], 'content': {u'allow_stdin': True, u'code': u"# Solving classification problems inside each ...f_validation_test, th, method=method, note=note)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-06-29T01:11:48.983282', u'msg_id': u'EE7A3F99F75F431285610E9F8D679F58', u'msg_type': u'execute_request', u'session': u'A56F275B17704CA8AE2106EF95721D45', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'EE7A3F99F75F431285610E9F8D679F58', 'msg_type': u'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['A56F275B17704CA8AE2106EF95721D45'], parent={'buffers': [], 'content': {u'allow_stdin': True, u'code': u"# Solving classification problems inside each ...f_validation_test, th, method=method, note=note)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-06-29T01:11:48.983282', u'msg_id': u'EE7A3F99F75F431285610E9F8D679F58', u'msg_type': u'execute_request', u'session': u'A56F275B17704CA8AE2106EF95721D45', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'EE7A3F99F75F431285610E9F8D679F58', 'msg_type': u'execute_request', 'parent_header': {}})
    386         if not silent:
    387             self.execution_count += 1
    388             self._publish_execute_input(code, parent, self.execution_count)
    389 
    390         reply_content = self.do_execute(code, silent, store_history,
--> 391                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    392 
    393         # Flush output before sending the reply.
    394         sys.stdout.flush()
    395         sys.stderr.flush()

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u"# Solving classification problems inside each ...f_validation_test, th, method=method, note=note)", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    194 
    195         reply_content = {}
    196         # FIXME: the shell calls the exception handler itself.
    197         shell._reply_content = None
    198         try:
--> 199             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u"# Solving classification problems inside each ...f_validation_test, th, method=method, note=note)"
        store_history = True
        silent = False
    200         except:
    201             status = u'error'
    202             # FIXME: this code right now isn't being used yet by default,
    203             # because the run_cell() call above directly fires off exception

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u"# Solving classification problems inside each ...f_validation_test, th, method=method, note=note)", store_history=True, silent=False, shell_futures=True)
   2718                 self.displayhook.exec_result = result
   2719 
   2720                 # Execute the user code
   2721                 interactivity = "none" if silent else self.ast_node_interactivity
   2722                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2723                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2724 
   2725                 # Reset this so later displayed values do not modify the
   2726                 # ExecutionResult
   2727                 self.displayhook.exec_result = None

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.If object>, <_ast.If object>], cell_name='<ipython-input-117-191b5e24ef8f>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2820 
   2821         try:
   2822             for i, node in enumerate(to_run_exec):
   2823                 mod = ast.Module([node])
   2824                 code = compiler(mod, cell_name, "exec")
-> 2825                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f7a31013eb0, file "<ipython-input-117-191b5e24ef8f>", line 10>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   2826                     return True
   2827 
   2828             for i, node in enumerate(to_run_interactive):
   2829                 mod = ast.Interactive([node])

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f7a31013eb0, file "<ipython-input-117-191b5e24ef8f>", line 10>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2880         outflag = 1  # happens in more places, so it's easier as default
   2881         try:
   2882             try:
   2883                 self.hooks.pre_run_code_hook()
   2884                 #rprint('Running code', repr(code_obj)) # dbg
-> 2885                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f7a31013eb0, file "<ipython-input-117-191b5e24ef8f>", line 10>
        self.user_global_ns = {'In': ['', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u"confs=pd.read_csv('pred_conf.csv')", u'confs.head()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'def prepare_data(df):\n    """\n    Feature en..._conf.csv\', index=True, index_label=\'row_id\')', u'# Solving classification problems inside each ...th samples.\nprocess_grid(df_train, df_test, th)', u'import numpy as np\nimport pandas as pd\nimpor...0\ngrid_size = 10.0\nx_step = 0.5\ny_step = 0.25', u'def prepare_data(df):\n    """\n    Feature en..._conf.csv\', index=True, index_label=\'row_id\')', u"print('Loading data')\ndf_train = pd.read_csv(...y', 'time'],\n                      index_col=0)", u"print('Preparing train data')\ndf_train = prep...ing test data')\ndf_test = prepare_data(df_test)", u"print('Preparing train data')\ndf_train = prep...ing test data')\ndf_test = prepare_data(df_test)", u'df_train.head()', u'# Solving classification problems inside each ...th samples.\nprocess_grid(df_train, df_test, th)', u'def prepare_data(df):\n    """\n    Feature en..._conf.csv\', index=True, index_label=\'row_id\')', ...], 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'LabelEncoder': <class 'sklearn.preprocessing.label.LabelEncoder'>, 'Out': {6:    row_id          l1          l2          l3  c...4  3661789744  2634800426  4764406629   0   0   0, 17:              x       y   accuracy    place_id   ...3617  7440663949  87.333333     18.0   22.0  10.0, 27:    row_id            l1            l2           ...00000  
2  0.000000  
3  0.000000  
4  0.000000  , 35:    row_id        c1        c2        c3
0       ... 0.083333
9       9  0.222222  0.166667  0.166667, 40:    row_id   c1   c2   c3
0       0  0.0  0.0  0....8       8  0.0  0.0  0.0
9       9  0.0  0.0  0.0, 41:     row_id   c1   c2   c3
0        0  0.0  0.0  ...      18  0.0  0.0  0.0
19      19  0.0  0.0  0.0, 42:      row_id        c1        c2        c3
0     ...00000  0.000000  0.000000

[200 rows x 4 columns], 48:      row_id          l1          l2          l3 ...00000  0.000000  0.000000

[200 rows x 7 columns], 52:    row_id          l1          l2          l3   ...4800426  4764406629  0.476190  0.222222  0.111111, 56:    row_id       pred0       pred1       pred2   ...4800426  4764406629  0.476190  0.222222  0.111111, ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'XGBClassifier': <class 'xgboost.sklearn.XGBClassifier'>, '_':      row_id                          place_id
0 ...                    0 0 0

[200 rows x 2 columns], '_17':              x       y   accuracy    place_id   ...3617  7440663949  87.333333     18.0   22.0  10.0, '_27':    row_id            l1            l2           ...00000  
2  0.000000  
3  0.000000  
4  0.000000  , '_35':    row_id        c1        c2        c3
0       ... 0.083333
9       9  0.222222  0.166667  0.166667, ...}
        self.user_ns = {'In': ['', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u"confs=pd.read_csv('pred_conf.csv')", u'confs.head()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'import numpy as np\nimport pandas as pd\nimpor...h)\n\n\nif __name__ == \'__main__\':\n    main()', u'def prepare_data(df):\n    """\n    Feature en..._conf.csv\', index=True, index_label=\'row_id\')', u'# Solving classification problems inside each ...th samples.\nprocess_grid(df_train, df_test, th)', u'import numpy as np\nimport pandas as pd\nimpor...0\ngrid_size = 10.0\nx_step = 0.5\ny_step = 0.25', u'def prepare_data(df):\n    """\n    Feature en..._conf.csv\', index=True, index_label=\'row_id\')', u"print('Loading data')\ndf_train = pd.read_csv(...y', 'time'],\n                      index_col=0)", u"print('Preparing train data')\ndf_train = prep...ing test data')\ndf_test = prepare_data(df_test)", u"print('Preparing train data')\ndf_train = prep...ing test data')\ndf_test = prepare_data(df_test)", u'df_train.head()', u'# Solving classification problems inside each ...th samples.\nprocess_grid(df_train, df_test, th)', u'def prepare_data(df):\n    """\n    Feature en..._conf.csv\', index=True, index_label=\'row_id\')', ...], 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'LabelEncoder': <class 'sklearn.preprocessing.label.LabelEncoder'>, 'Out': {6:    row_id          l1          l2          l3  c...4  3661789744  2634800426  4764406629   0   0   0, 17:              x       y   accuracy    place_id   ...3617  7440663949  87.333333     18.0   22.0  10.0, 27:    row_id            l1            l2           ...00000  
2  0.000000  
3  0.000000  
4  0.000000  , 35:    row_id        c1        c2        c3
0       ... 0.083333
9       9  0.222222  0.166667  0.166667, 40:    row_id   c1   c2   c3
0       0  0.0  0.0  0....8       8  0.0  0.0  0.0
9       9  0.0  0.0  0.0, 41:     row_id   c1   c2   c3
0        0  0.0  0.0  ...      18  0.0  0.0  0.0
19      19  0.0  0.0  0.0, 42:      row_id        c1        c2        c3
0     ...00000  0.000000  0.000000

[200 rows x 4 columns], 48:      row_id          l1          l2          l3 ...00000  0.000000  0.000000

[200 rows x 7 columns], 52:    row_id          l1          l2          l3   ...4800426  4764406629  0.476190  0.222222  0.111111, 56:    row_id       pred0       pred1       pred2   ...4800426  4764406629  0.476190  0.222222  0.111111, ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'XGBClassifier': <class 'xgboost.sklearn.XGBClassifier'>, '_':      row_id                          place_id
0 ...                    0 0 0

[200 rows x 2 columns], '_17':              x       y   accuracy    place_id   ...3617  7440663949  87.333333     18.0   22.0  10.0, '_27':    row_id            l1            l2           ...00000  
2  0.000000  
3  0.000000  
4  0.000000  , '_35':    row_id        c1        c2        c3
0       ... 0.083333
9       9  0.222222  0.166667  0.166667, ...}
   2886             finally:
   2887                 # Reset our crash handler in place
   2888                 sys.excepthook = old_excepthook
   2889         except SystemExit as e:

...........................................................................
/home/zhenjie/kaggle/facebookCheckIns/models/<ipython-input-117-191b5e24ef8f> in <module>()
      6 
      7 
      8 if mode=='test':
      9     process_grid(df_train, df_test, th, method=method)
     10 if mode=='valid':
---> 11     process_grid(df_validation_train, df_validation_test, th, method=method, note=note)
     12 
     13 
     14 
     15 

...........................................................................
/home/zhenjie/kaggle/facebookCheckIns/models/<ipython-input-116-73568381af0a> in process_grid(df_train=               x       y  accuracy    time    pl...  690137  2571942010

[20382614 rows x 5 columns], df_test=           row_id       x       y  accuracy    t...7  102842  7028698129

[8735407 rows x 6 columns], th=8, method='knn', note='')
    109             y_max = round(y_max, 4)
    110             if y_max == grid_size:
    111                 y_max += 0.001
    112 
    113             # Applying classifier to one grid cell
--> 114             pred_labels, pred_confs, row_ids = process_one_cell(df_train, df_test, th, x_min, y_min, x_max, y_max, method=method)
    115 
    116             # Updating predictions
    117             preds[row_ids] = pred_labels
    118             confs[row_ids]=pred_confs

...........................................................................
/home/zhenjie/kaggle/facebookCheckIns/models/<ipython-input-116-73568381af0a> in process_one_cell(df_train=               x       y  accuracy    time    pl...  690137  2571942010

[20382614 rows x 5 columns], df_test=           row_id       x       y  accuracy    t...7  102842  7028698129

[8735407 rows x 6 columns], th=8, x_min=0.0, y_min=0.0, x_max=0.5, y_max=0.25, method='knn')
     70         numNeighbors=np.floor(np.sqrt(len(df_cell_train))/5.1282).astype(int)
     71 #           numNeighbors=36
     72         clf=KNeighborsClassifier(n_neighbors=numNeighbors, weights=calculate_distance, metric='manhattan', n_jobs=-1)
     73 
     74     clf.fit(X, y)
---> 75     y_pred = clf.predict_proba(X_test)
     76     
     77     le_labels=np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
     78     pred_labels = le.inverse_transform(le_labels)
     79     pred_confs=[y_pred[i,le_labels[i]] for i in xrange(len(y_pred))]

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/sklearn/neighbors/classification.py in predict_proba(self=KNeighborsClassifier(algorithm='auto', leaf_size...=<function calculate_distance at 0x7f79f192d1b8>), X=array([[  2.03833650e+07,   1.66950000e+02,   1....01,
          1.20000000e+01,   3.93765000e+05]]))
    187             The class probabilities of the input samples. Classes are ordered
    188             by lexicographic order.
    189         """
    190         X = check_array(X, accept_sparse='csr')
    191 
--> 192         neigh_dist, neigh_ind = self.kneighbors(X)
        neigh_dist = undefined
        neigh_ind = undefined
        self.kneighbors = <bound method KNeighborsClassifier.kneighbors of...<function calculate_distance at 0x7f79f192d1b8>)>
        X = array([[  2.03833650e+07,   1.66950000e+02,   1....01,
          1.20000000e+01,   3.93765000e+05]])
    193 
    194         classes_ = self.classes_
    195         _y = self._y
    196         if not self.outputs_2d_:

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/sklearn/neighbors/base.py in kneighbors(self=KNeighborsClassifier(algorithm='auto', leaf_size...=<function calculate_distance at 0x7f79f192d1b8>), X=array([[  2.03833650e+07,   1.66950000e+02,   1....01,
          1.20000000e+01,   3.93765000e+05]]), n_neighbors=33, return_distance=True)
    394                     "%s does not work with sparse matrices. Densify the data, "
    395                     "or set algorithm='brute'" % self._fit_method)
    396             result = Parallel(n_jobs, backend='threading')(
    397                 delayed(self._tree.query, check_pickle=False)(
    398                     X[s], n_neighbors, return_distance)
--> 399                 for s in gen_even_slices(X.shape[0], n_jobs)
        X.shape = (13521, 5)
        n_jobs = 8
    400             )
    401             if return_distance:
    402                 dist, neigh_ind = tuple(zip(*result))
    403                 result = np.vstack(dist), np.vstack(neigh_ind)

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=8), iterable=<generator object <genexpr>>)
    805             if pre_dispatch == "all" or n_jobs == 1:
    806                 # The iterable was consumed all at once by the above for loop.
    807                 # No need to wait for async callbacks to trigger to
    808                 # consumption.
    809                 self._iterating = False
--> 810             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=8)>
    811             # Make sure that we get a last message telling us we are done
    812             elapsed_time = time.time() - self._start_time
    813             self._print('Done %3i out of %3i | elapsed: %s finished',
    814                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Wed Jun 29 01:11:49 2016
PID: 6620                 Python 2.7.11: /home/zhenjie/anaconda2/bin/python
...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <built-in method query of sklearn.neighbors.kd_tree.KDTree object>
        args = (array([[  2.03833650e+07,   1.66950000e+02,   1....02,
          5.10000000e+01,   7.25787000e+05]]), 33, True)
        kwargs = {}
        self.items = [(<built-in method query of sklearn.neighbors.kd_tree.KDTree object>, (array([[  2.03833650e+07,   1.66950000e+02,   1....02,
          5.10000000e+01,   7.25787000e+05]]), 33, True), {})]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/home/zhenjie/anaconda2/lib/python2.7/site-packages/sklearn/neighbors/kd_tree.so in sklearn.neighbors.kd_tree.BinaryTree.query (sklearn/neighbors/kd_tree.c:10451)()
   1290 
   1291 
   1292 
   1293 
   1294 
-> 1295 
   1296 
   1297 
   1298 
   1299 

ValueError: query data dimension must match training data dimension
___________________________________________________________________________

In [95]:
confs=pd.read_csv('Wed Jun 29 00:43:39 2016-knn-submit.csv')

In [96]:
confs.head(200)

Unnamed: 0,row_id,place_id
0,0,0 0 0
1,1,0 0 0
2,2,0 0 0
3,3,0 0 0
4,4,0 0 0
5,5,8370753254 9727638738 4322188315
6,6,0 0 0
7,7,0 0 0
8,8,0 0 0
9,9,0 0 0
