# BUG: TopK issues #1918

- issue: https://github.com/ibis-project/ibis/issues/1918
- doc: https://docs.ibis-project.org/sql.html?highlight=topk#top-k-operations

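Problems observed in the runs below:

- `string_col.topk(3)` returns a different set of values depending on the backend; all displayed counts are equal (730), so which tied values are kept is backend-dependent.
- The `pandas` backend raises `NotImplementedError`, and `pyspark` reports that there is no translation rule for `TopK`.
- Using `topk(3)` as a table filter: the SQLite query is not executed because of its high time / memory consumption, `clickhouse` fails while compiling the expression, and `pandas` / `parquet` raise `NotImplementedError`.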



In [1]:
import os
from pathlib import Path 
import traceback

import ibis
import pandas as pd

# local settings
from settings import conf, data_directory, post_connection_spark

In [2]:
# Fail fast when the local test data is missing: both the data directory
# and the SQLite database file inside it must exist.
for required_path in (data_directory, data_directory / 'ibis_testing.db'):
    assert os.path.exists(required_path)

# Post-connection hooks keyed by backend name; the two Spark-based backends
# share the same hook.
post_connection = dict.fromkeys(('spark', 'pyspark'), post_connection_spark)

## Check Top 3 operation

In [3]:
# Compile and execute `string_col.topk(3)` on every configured backend.
for backend_name in conf:
    # banner identifying the backend in the cell output
    print('\n#', '=' * 80)
    print('#', backend_name.center(80, '.'))
    print('#', '=' * 80)

    # connect, then run the backend-specific post-connection hook, if any
    backend = getattr(ibis, backend_name)
    con = backend.connect(**conf[backend_name])
    if backend_name in post_connection:
        setup_hook = post_connection[backend_name]
        setup_hook(con)

    # top 3 values of `string_col`, taken from the table sorted by that column
    table = con.table('functional_alltypes')
    alltypes = table.sort_by(table.string_col)
    expr = alltypes.string_col.topk(3)

    # show the compiled form first, then the result; report failures
    # instead of aborting so the remaining backends still run
    try:
        print(expr.compile())
        display(expr.execute())
    except NotImplementedError:
        print('[II] NotImplementedError')
    except Exception as e:
        print('[EE]', e)


# ...................................omniscidb....................................
SELECT *
FROM (
  SELECT "string_col", count("string_col") AS "count"
  FROM functional_alltypes
  GROUP BY string_col
  ORDER BY "string_col"
) t0
ORDER BY "count" DESC
LIMIT 3


Unnamed: 0,string_col,count
0,6,730
1,2,730
2,0,730



# .....................................pandas.....................................
ref_0
PandasTable[table]
  name: functional_alltypes
  schema:
    index : int64
    Unnamed: 0 : int64
    id : int64
    bool_col : boolean
    tinyint_col : int64
    smallint_col : int64
    int_col : int64
    bigint_col : int64
    float_col : float64
    double_col : float64
    date_string_col : string
    string_col : string
    timestamp_col : timestamp
    year : int64
    month : int64

ref_1
Selection[table]
  table:
    Table: ref_0
  sort_keys:
    SortKey[array-sort]
      expr:
        string_col = Column[string*] 'string_col' from table
          ref_0
      ascending:
        True

TopK[topk]
  string_col = Column[string*] 'string_col' from table
    ref_1
  k:
    3
  by:
    count = Count[int64]
      string_col = Column[string*] 'string_col' from table
        ref_1
      where:
        None
[II] NotImplementedError

# ......................................csv......................

Unnamed: 0,string_col,count
0,0,730
1,1,730
2,2,730



# ....................................postgres....................................
SELECT t0.string_col, t0.count 
FROM (SELECT t1.string_col AS string_col, count(t1.string_col) AS count 
FROM functional_alltypes AS t1 GROUP BY t1.string_col ORDER BY t1.string_col) AS t0 ORDER BY t0.count DESC 
 LIMIT %(param_1)s


Unnamed: 0,string_col,count
0,1,730
1,2,730
2,0,730



# .....................................mysql......................................
SELECT t0.string_col, t0.count 
FROM (SELECT t1.string_col AS string_col, count(t1.string_col) AS count 
FROM functional_alltypes AS t1 GROUP BY t1.string_col ORDER BY t1.string_col) AS t0 ORDER BY t0.count DESC 
 LIMIT %(param_1)s


Unnamed: 0,string_col,count
0,0,730
1,1,730
2,2,730



# ...................................clickhouse...................................
SELECT *
FROM (
  SELECT `string_col`, count(`string_col`) AS `count`
  FROM ibis_testing.`functional_alltypes`
  GROUP BY `string_col`
  ORDER BY `string_col`
) t0
ORDER BY `count` DESC
LIMIT 3


Unnamed: 0,string_col,count
0,0,730
1,1,730
2,2,730



# .....................................impala.....................................
SELECT *
FROM (
  SELECT `string_col`, count(`string_col`) AS `count`
  FROM ibis_testing.`functional_alltypes`
  GROUP BY 1
  ORDER BY `string_col`
) t0
ORDER BY `count` DESC
LIMIT 3


Unnamed: 0,string_col,count
0,3,730
1,1,730
2,4,730



# ....................................pyspark.....................................
[EE] No translation rule for <class 'ibis.expr.operations.TopK'>

# .....................................spark......................................
SELECT *
FROM (
  SELECT `string_col`, count(`string_col`) AS `count`
  FROM functional_alltypes
  GROUP BY 1
  ORDER BY `string_col`
) t0
ORDER BY `count` DESC
LIMIT 3


Unnamed: 0,string_col,count
0,0,730
1,3,730
2,8,730


## Check Top 3 operation as table filter

In [4]:
# case 1: SQLite
sqlite_db = str(data_directory / 'ibis_testing.db')
con = ibis.sqlite.connect(sqlite_db)

# use the top-3 expression as a filter on the (sorted) table itself
table = con.table('functional_alltypes')
alltypes = table.sort_by(table.string_col)
top3_filter = alltypes.string_col.topk(3)
expr = alltypes[top3_filter]

# only the compiled SQL is printed; execution stays disabled because of its
# high time / memory consumption
print('>', expr.compile())
# display(expr.execute())

> SELECT t0."index", t0."Unnamed: 0", t0.id, t0.bool_col, t0.tinyint_col, t0.smallint_col, t0.int_col, t0.bigint_col, t0.float_col, t0.double_col, t0.date_string_col, t0.string_col, t0.timestamp_col, t0.year, t0.month 
FROM (SELECT t2."index" AS "index", t2."Unnamed: 0" AS "Unnamed: 0", t2.id AS id, t2.bool_col AS bool_col, t2.tinyint_col AS tinyint_col, t2.smallint_col AS smallint_col, t2.int_col AS int_col, t2.bigint_col AS bigint_col, t2.float_col AS float_col, t2.double_col AS double_col, t2.date_string_col AS date_string_col, t2.string_col AS string_col, t2.timestamp_col AS timestamp_col, t2.year AS year, t2.month AS month 
FROM base.functional_alltypes AS t2 ORDER BY t2.string_col) AS t0, (SELECT t0."index" AS "index", t0."Unnamed: 0" AS "Unnamed: 0", t0.id AS id, t0.bool_col AS bool_col, t0.tinyint_col AS tinyint_col, t0.smallint_col AS smallint_col, t0.int_col AS int_col, t0.bigint_col AS bigint_col, t0.float_col AS float_col, t0.double_col AS double_col, t0.date_string_col AS 

In [5]:
# Use the top-3 expression as a table filter on every configured backend
# except SQLite, which is handled in its own "case 1" cell.
#
# `conf` is iterated directly (instead of building a set and subtracting
# {'sqlite'}) so the backends are visited in the order `conf` defines:
# set iteration order for strings can change between interpreter runs,
# which would reshuffle the output sections on every re-run.
for backend_name in conf:
    if backend_name == 'sqlite':
        continue

    # banner identifying the backend in the cell output
    print('\n#', '=' * 80)
    print('#', backend_name.center(80, '.'))
    print('#', '=' * 80)

    # connect, then run the backend-specific post-connection hook, if any
    con = getattr(ibis, backend_name).connect(**conf[backend_name])
    if backend_name in post_connection:
        post_connection[backend_name](con)

    alltypes = con.table('functional_alltypes')
    expr = alltypes[alltypes.string_col.topk(3)]

    # show the compiled form first, then the result; report failures
    # instead of aborting so the remaining backends still run
    try:
        print('>', expr.compile())
        display(expr.execute())
    except NotImplementedError:
        print('[II] NotImplementedError')
    except Exception:
        # full traceback: the message alone does not show where it failed
        print('[EE]', traceback.format_exc())


# ...................................clickhouse...................................
[EE] Traceback (most recent call last):
  File "<ipython-input-5-24e49116336e>", line 15, in <module>
    print('>', expr.compile())
  File "/home/xmn/dev/quansight/ibis-project/ibis/ibis/expr/types.py", line 215, in compile
    return compile(self, limit=limit, params=params)
  File "/home/xmn/dev/quansight/ibis-project/ibis/ibis/client.py", line 370, in compile
    return backend.compile(expr, limit=limit, params=params, **kwargs)
  File "/home/xmn/dev/quansight/ibis-project/ibis/ibis/client.py", line 233, in compile
    return query_ast.compile()
  File "/home/xmn/dev/quansight/ibis-project/ibis/ibis/sql/compiler.py", line 54, in compile
    compiled_queries = [q.compile() for q in self.queries]
  File "/home/xmn/dev/quansight/ibis-project/ibis/ibis/sql/compiler.py", line 54, in <listcomp>
    compiled_queries = [q.compile() for q in self.queries]
  File "/home/xmn/dev/quansight/ibis-project/ibis/ibi

Unnamed: 0.1,index,Unnamed: 0,id,bool_col,tinyint_col,smallint_col,int_col,bigint_col,float_col,double_col,date_string_col,string_col,timestamp_col,year,month
0,3,3,6693,False,3,3,3,30,3.3,30.3,11/01/10,3,2010-11-01 04:03:00.300,2010,11
1,7,7,6697,False,7,7,7,70,7.7,70.7,11/01/10,7,2010-11-01 04:07:00.210,2010,11
2,8,8,6698,True,8,8,8,80,8.8,80.8,11/01/10,8,2010-11-01 04:08:00.280,2010,11
3,13,13,6703,False,3,3,3,30,3.3,30.3,11/02/10,3,2010-11-02 04:13:00.480,2010,11
4,17,17,6707,False,7,7,7,70,7.7,70.7,11/02/10,7,2010-11-02 04:17:00.660,2010,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,7287,7287,3947,False,7,7,7,70,7.7,70.7,01/30/10,7,2010-01-30 08:57:13.260,2010,1
2186,7288,7288,3948,True,8,8,8,80,8.8,80.8,01/30/10,8,2010-01-30 08:58:13.330,2010,1
2187,7293,7293,3953,False,3,3,3,30,3.3,30.3,01/31/10,3,2010-01-31 09:03:13.530,2010,1
2188,7297,7297,3957,False,7,7,7,70,7.7,70.7,01/31/10,7,2010-01-31 09:07:13.710,2010,1



# ....................................parquet.....................................
> ref_0
ParquetTable[table]
  name: functional_alltypes
  schema:
    index : int64
    Unnamed: 0 : int64
    id : int64
    bool_col : boolean
    tinyint_col : int8
    smallint_col : int16
    int_col : int32
    bigint_col : int64
    float_col : float32
    double_col : float64
    date_string_col : string
    string_col : string
    timestamp_col : timestamp
    year : int64
    month : int64

Selection[table]
  table:
    Table: ref_0
  predicates:
    SummaryFilter[boolean*]
      expr:
        TopK[topk]
          string_col = Column[string*] 'string_col' from table
            ref_0
          k:
            3
          by:
            count = Count[int64]
              string_col = Column[string*] 'string_col' from table
                ref_0
              where:
                None
[II] NotImplementedError

# ....................................postgres....................................
>

Unnamed: 0,index,Unnamed__0,id,bool_col,tinyint_col,smallint_col,int_col,bigint_col,float_col,double_col,date_string_col,string_col,timestamp_col,year_,month_,rowid
0,0,0,6690,True,0,0,0,0,0.0,0.0,11/01/10,0,2010-11-01 00:00:00,2010,11,0
1,2,2,6692,True,2,2,2,20,2.2,20.2,11/01/10,2,2010-11-01 00:02:00,2010,11,2
2,6,6,6696,True,6,6,6,60,6.6,60.6,11/01/10,6,2010-11-01 00:06:00,2010,11,6
3,10,10,6700,True,0,0,0,0,0.0,0.0,11/02/10,0,2010-11-02 00:10:00,2010,11,10
4,12,12,6702,True,2,2,2,20,2.2,20.2,11/02/10,2,2010-11-02 00:12:00,2010,11,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,7282,7282,3942,True,2,2,2,20,2.2,20.2,01/30/10,2,2010-01-30 04:52:13,2010,1,7282
2186,7286,7286,3946,True,6,6,6,60,6.6,60.6,01/30/10,6,2010-01-30 04:56:13,2010,1,7286
2187,7290,7290,3950,True,0,0,0,0,0.0,0.0,01/31/10,0,2010-01-31 05:00:13,2010,1,7290
2188,7292,7292,3952,True,2,2,2,20,2.2,20.2,01/31/10,2,2010-01-31 05:02:13,2010,1,7292



# .....................................mysql......................................
> SELECT t0.`index`, t0.`Unnamed: 0`, t0.id, t0.bool_col, t0.tinyint_col, t0.smallint_col, t0.int_col, t0.bigint_col, t0.float_col, t0.double_col, t0.date_string_col, t0.string_col, t0.timestamp_col, t0.year, t0.month 
FROM functional_alltypes AS t0, (SELECT t0.`index` AS `index`, t0.`Unnamed: 0` AS `Unnamed: 0`, t0.id AS id, t0.bool_col AS bool_col, t0.tinyint_col AS tinyint_col, t0.smallint_col AS smallint_col, t0.int_col AS int_col, t0.bigint_col AS bigint_col, t0.float_col AS float_col, t0.double_col AS double_col, t0.date_string_col AS date_string_col, t0.string_col AS string_col, t0.timestamp_col AS timestamp_col, t0.year AS year, t0.month AS month 
FROM functional_alltypes AS t0 
WHERE EXISTS (SELECT 1 
FROM (SELECT t2.string_col AS string_col, t2.count AS count 
FROM (SELECT t0.string_col AS string_col, count(t0.string_col) AS count 
FROM functional_alltypes AS t0 GROUP BY t0.string_col) AS t2 O

Unnamed: 0,id,bool_col,tinyint_col,smallint_col,int_col,bigint_col,float_col,double_col,date_string_col,string_col,timestamp_col,year,month
0,5773,False,3,3,3,30,3.3,30.3,08/01/10,3,2010-08-01 00:03:00.300,2010,8
1,5776,True,6,6,6,60,6.6,60.6,08/01/10,6,2010-08-01 00:06:00.150,2010,8
2,5779,False,9,9,9,90,9.9,90.9,08/01/10,9,2010-08-01 00:09:00.360,2010,8
3,5783,False,3,3,3,30,3.3,30.3,08/02/10,3,2010-08-02 00:13:00.480,2010,8
4,5786,True,6,6,6,60,6.6,60.6,08/02/10,6,2010-08-02 00:16:00.600,2010,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,576,True,6,6,6,60,6.6,60.6,02/27/09,6,2009-02-27 04:26:11.850,2009,2
2186,579,False,9,9,9,90,9.9,90.9,02/27/09,9,2009-02-27 04:29:12.600,2009,2
2187,583,False,3,3,3,30,3.3,30.3,02/28/09,3,2009-02-28 04:33:12.180,2009,2
2188,586,True,6,6,6,60,6.6,60.6,02/28/09,6,2009-02-28 04:36:12.300,2009,2



# .....................................pandas.....................................
> ref_0
PandasTable[table]
  name: functional_alltypes
  schema:
    index : int64
    Unnamed: 0 : int64
    id : int64
    bool_col : boolean
    tinyint_col : int64
    smallint_col : int64
    int_col : int64
    bigint_col : int64
    float_col : float64
    double_col : float64
    date_string_col : string
    string_col : string
    timestamp_col : timestamp
    year : int64
    month : int64

Selection[table]
  table:
    Table: ref_0
  predicates:
    SummaryFilter[boolean*]
      expr:
        TopK[topk]
          string_col = Column[string*] 'string_col' from table
            ref_0
          k:
            3
          by:
            count = Count[int64]
              string_col = Column[string*] 'string_col' from table
                ref_0
              where:
                None
[II] NotImplementedError
