In [1]:
import datetime as dt
import os

import cudf
import cupy as cp
import dask_cudf
import pandas as pd

# Create a cuDF Series; the None entry is shown as a null (<NA>) value.
s = cudf.Series([1,2,3,None,4])
s


0       1
1       2
2       3
3    <NA>
4       4
dtype: int64

In [2]:
# Wrap the cuDF Series in a 2-partition Dask-cuDF Series, then compute()
# it so the values can be displayed.
ds = dask_cudf.from_cudf(s, npartitions=2)
ds.compute()

0       1
1       2
2       3
3    <NA>
4       4
dtype: int64

In [3]:
# 20-row cuDF DataFrame: 'a' and 'c' count up 0..19, 'b' counts down 19..0.
df = cudf.DataFrame(
    {
        'a': list(range(20)),
        'b': list(range(19, -1, -1)),
        'c': list(range(20)),
    }
)
df


Unnamed: 0,a,b,c
0,0,19,0
1,1,18,1
2,2,17,2
3,3,16,3
4,4,15,4
5,5,14,5
6,6,13,6
7,7,12,7
8,8,11,8
9,9,10,9


In [4]:
# Dask-cuDF version of df, split into 2 partitions; compute() materializes
# it for display.
ddf = dask_cudf.from_cudf(df, npartitions=2)
ddf.compute()


Unnamed: 0,a,b,c
0,0,19,0
1,1,18,1
2,2,17,2
3,3,16,3
4,4,15,4
5,5,14,5
6,6,13,6
7,7,12,7
8,8,11,8
9,9,10,9


In [5]:
# Build a pandas DataFrame (column 'b' has one missing value) and convert
# it to a cuDF DataFrame.
pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]})
gdf = cudf.DataFrame.from_pandas(pdf)
gdf


Unnamed: 0,a,b
0,0,0.1
1,1,0.2
2,2,
3,3,0.3


In [6]:
# Same data as a 2-partition Dask-cuDF DataFrame.
dask_gdf = dask_cudf.from_cudf(gdf, npartitions=2)
dask_gdf.compute()

Unnamed: 0,a,b
0,0,0.1
1,1,0.2
2,2,
3,3,0.3


In [7]:
# First two rows of the cuDF DataFrame.
df.head(2)

Unnamed: 0,a,b,c
0,0,19,0
1,1,18,1


In [8]:
# First two rows of the Dask-cuDF DataFrame; note that no explicit
# .compute() call is used here.
ddf.head(2)

Unnamed: 0,a,b,c
0,0,19,0
1,1,18,1


In [9]:
# Sort the rows by the values of column 'b'.
df.sort_values(by='b')

Unnamed: 0,a,b,c
19,19,0,19
18,18,1,18
17,17,2,17
16,16,3,16
15,15,4,15
14,14,5,14
13,13,6,13
12,12,7,12
11,11,8,11
10,10,9,10


In [10]:
# Same sort on the Dask-cuDF DataFrame, materialized with compute().
ddf.sort_values(by='b').compute()

Unnamed: 0,a,b,c
19,19,0,19
18,18,1,18
17,17,2,17
16,16,3,16
15,15,4,15
14,14,5,14
13,13,6,13
12,12,7,12
11,11,8,11
10,10,9,10


In [11]:
# Select a single column; the result is a Series named 'a'.
df['a']

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
Name: a, dtype: int64

In [12]:
# Same column selection on the Dask-cuDF DataFrame.
ddf['a'].compute()

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
Name: a, dtype: int64

In [13]:
# Label-based selection: index labels 2 through 5 (both ends included),
# columns 'a' and 'b'.
df.loc[2:5, ['a', 'b']]

Unnamed: 0,a,b
2,2,17
3,3,16
4,4,15
5,5,14


In [14]:
# Same label-based selection on the Dask-cuDF DataFrame.
ddf.loc[2:5, ['a', 'b']].compute()

Unnamed: 0,a,b
2,2,17
3,3,16
4,4,15
5,5,14


In [15]:
# Position-based selection of the first row, returned as a Series.
df.iloc[0]

a     0
b    19
c     0
Name: 0, dtype: int64

In [16]:
# Position-based slice: first three rows and first two columns.
df.iloc[0:3, 0:2]

Unnamed: 0,a,b
0,0,19
1,1,18
2,2,17


In [17]:
# Slicing with [] selects rows; the end is exclusive (rows 3 and 4).
df[3:5]

Unnamed: 0,a,b,c
3,3,16,3
4,4,15,4


In [18]:
# The same row slicing on the Series created in the first cell.
s[3:5]

3    <NA>
4       4
dtype: int64

In [19]:
# Boolean-mask filtering: keep the rows where column 'b' is greater than 15.
df[df.b > 15]

Unnamed: 0,a,b,c
0,0,19,0
1,1,18,1
2,2,17,2
3,3,16,3


In [20]:
# Same boolean-mask filter on the Dask-cuDF DataFrame.
ddf[ddf.b > 15].compute()

Unnamed: 0,a,b,c
0,0,19,0
1,1,18,1
2,2,17,2
3,3,16,3


In [21]:
# Filter rows with a query expression string.
df.query("b == 3")

Unnamed: 0,a,b,c
16,16,3,16


In [22]:
# Same query on the Dask-cuDF DataFrame.
ddf.query("b == 3").compute()

Unnamed: 0,a,b,c
16,16,3,16


In [23]:
# A Python variable can be referenced inside the query string with '@'.
cudf_comparator = 3
df.query("b == @cudf_comparator")


Unnamed: 0,a,b,c
16,16,3,16


In [24]:
# With Dask-cuDF the value is handed over explicitly through local_dict and
# referenced in the query string under the key used there ('val').
dask_cudf_comparator = 3
ddf.query("b == @val", local_dict={'val':dask_cudf_comparator}).compute()

Unnamed: 0,a,b,c
16,16,3,16


In [25]:
# Keep the rows whose 'a' value is one of the listed values.
df[df.a.isin([0, 5])]

Unnamed: 0,a,b,c
0,0,19,0
5,5,14,5


In [26]:
# Two parallel lists (outer labels, inner labels) paired element-wise into
# the tuples used below to build a MultiIndex.
arrays = [['a', 'a', 'b', 'b'], [1, 2, 3, 4]]
tuples = list(zip(arrays[0], arrays[1]))
tuples

[('a', 1), ('a', 2), ('b', 3), ('b', 4)]

In [27]:
# Build a two-level cuDF MultiIndex from the (outer, inner) tuples.
idx = cudf.MultiIndex.from_tuples(tuples)
idx

MultiIndex(levels=[0    a
1    b
dtype: object, 0    1
1    2
2    3
3    4
dtype: int64],
codes=   0  1
0  0  0
1  0  1
2  1  2
3  1  3)

In [28]:
# DataFrame of random values with the MultiIndex as its row index.
# No random seed is set in the cells shown, so the values will presumably
# differ from the displayed output on a re-run.
gdf1 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)})
gdf1.index = idx
gdf1


Unnamed: 0,Unnamed: 1,first,second
a,1,0.840915,0.099584
a,2,0.501978,0.636936
b,3,0.067916,0.056704
b,4,0.956406,0.562991


In [29]:
# Transposed random frame (rows 'first'/'second'), with the MultiIndex
# used as its column labels instead.
gdf2 = cudf.DataFrame({'first': cp.random.rand(4), 'second': cp.random.rand(4)}).T
gdf2.columns = idx
gdf2

Unnamed: 0_level_0,a,a,b,b
Unnamed: 0_level_1,1,2,3,4
first,0.251515,0.90316,0.521797,0.679775
second,0.648363,0.263586,0.979282,0.527026


In [30]:
# Look up a row by its full MultiIndex key (outer label, inner label).
gdf1.loc[('b', 3)]

Unnamed: 0,Unnamed: 1,first,second
b,3,0.067916,0.056704


In [31]:
# Another full-key lookup on the MultiIndex.
gdf1.loc[('a', 2)]

Unnamed: 0,Unnamed: 1,first,second
a,2,0.501978,0.636936


In [32]:
# Missing data can be replaced by using the fillna method.
# Here the single null in s is replaced by 999.
s.fillna(999)

0      1
1      2
2      3
3    999
4      4
dtype: int64

In [33]:
# Same null replacement on the Dask-cuDF Series.
ds.fillna(999).compute()

0      1
1      2
2      3
3    999
4      4
dtype: int64

In [34]:
# Calculating descriptive statistics
# The displayed mean (2.5) shows that the null entry of s is skipped.
s.mean(), s.var()

(2.5, 1.666666666666666)

In [35]:
# Same statistics on the Dask-cuDF Series; each one is computed separately.
ds.mean().compute(), ds.var().compute()

(2.5, 1.6666666666666667)

In [36]:
def add_ten(num):
    """Return ``num`` increased by ten."""
    increment = 10
    return num + increment

# Apply add_ten to every element of column 'a'.
# NOTE(review): Series.applymap appears to be deprecated in newer cuDF
# releases in favor of Series.apply — confirm against the installed version.
df['a'].applymap(add_ten)

0     10
1     11
2     12
3     13
4     14
5     15
6     16
7     17
8     18
9     19
10    20
11    21
12    22
13    23
14    24
15    25
16    26
17    27
18    28
19    29
Name: a, dtype: int64

In [37]:
# Dask-cuDF: map_partitions calls add_ten once per partition of column 'a'.
ddf['a'].map_partitions(add_ten).compute()

0     10
1     11
2     12
3     13
4     14
5     15
6     16
7     17
8     18
9     19
10    20
11    21
12    22
13    23
14    24
15    25
16    26
17    27
18    28
19    29
Name: a, dtype: int64

In [38]:
# Counting the number of occurrences of each unique value of a variable.
# Every value in column 'a' is unique, so every count is 1.
df.a.value_counts()

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
Name: a, dtype: int32

In [39]:
# Same value counts on the Dask-cuDF DataFrame.
ddf.a.value_counts().compute()

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
Name: a, dtype: int64

In [40]:
# Rebind s to a Series of strings (with one null) and lower-case each
# entry through the .str accessor.
s = cudf.Series(['A', 'B', 'C', 'Aaba', 'Baca', None, 'CABA', 'dog', 'cat'])
s.str.lower()


0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: object

In [41]:
# Rebind ds to the Dask-cuDF version of the string Series and lower-case it.
ds = dask_cudf.from_cudf(s, npartitions=2)
ds.str.lower().compute()


0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: object

In [42]:
# Concatenating Series and DataFrames row-wise.
# s is rebound to a numeric Series here; the original index labels are
# kept in the result, so labels 0..4 appear twice.
s = cudf.Series([1, 2, 3, None, 5])
cudf.concat([s, s])

0       1
1       2
2       3
3    <NA>
4       5
0       1
1       2
2       3
3    <NA>
4       5
dtype: int64

In [43]:
# Same row-wise concatenation with Dask-cuDF Series.
ds2 = dask_cudf.from_cudf(s, npartitions=2)
dask_cudf.concat([ds2, ds2]).compute()


0       1
1       2
2       3
3    <NA>
4       5
0       1
1       2
2       3
3    <NA>
4       5
dtype: int64

In [44]:
# Left join example: build the two input frames.
# df_a holds five keys; df_b only holds three of them ('a', 'c', 'e').
df_a = cudf.DataFrame({
    'key': ['a', 'b', 'c', 'd', 'e'],
    'vals_a': [float(n) for n in range(10, 15)],
})

df_b = cudf.DataFrame({
    'key': ['a', 'c', 'e'],
    'vals_b': [float(n) for n in range(100, 103)],
})

df_a

Unnamed: 0,key,vals_a
0,a,10.0
1,b,11.0
2,c,12.0
3,d,13.0
4,e,14.0


In [45]:
# Right-hand frame of the join.
df_b

Unnamed: 0,key,vals_b
0,a,100.0
1,c,101.0
2,e,102.0


In [46]:
# Left join on 'key': every row of df_a is kept; vals_b is null for keys
# that are missing from df_b ('b' and 'd').
merged = df_a.merge(df_b, on=['key'], how='left')
merged

Unnamed: 0,key,vals_a,vals_b
0,a,10.0,100.0
1,c,12.0,101.0
2,e,14.0,102.0
3,b,11.0,
4,d,13.0,


In [47]:
# Same left join with Dask-cuDF DataFrames.
ddf_a = dask_cudf.from_cudf(df_a, npartitions=2)
ddf_b = dask_cudf.from_cudf(df_b, npartitions=2)

# Note: this rebinds `merged`, replacing the cuDF result of the previous cell.
merged = ddf_a.merge(ddf_b, on=['key'], how='left').compute()
merged


Unnamed: 0,key,vals_a,vals_b
0,a,10.0,100.0
1,c,12.0,101.0
2,b,11.0,
0,e,14.0,102.0
1,d,13.0,


In [48]:
# Appending values from another Series or array-like object.
# NOTE(review): Series.append appears to have been removed from newer cuDF
# releases (cudf.concat is the replacement) — confirm against the installed
# version.
s.append(s)

0       1
1       2
2       3
3    <NA>
4       5
0       1
1       2
2       3
3    <NA>
4       5
dtype: int64

In [49]:
# Same append on the Dask-cuDF Series.
ds2.append(ds2).compute()

0       1
1       2
2       3
3    <NA>
4       5
0       1
1       2
2       3
3    <NA>
4       5
dtype: int64

In [50]:
# Add two 0/1 flag columns to df (modified in place) to group on:
# agg_col1 marks every 2nd row position, agg_col2 every 3rd.
df['agg_col1'] = [int(pos % 2 == 0) for pos in range(len(df))]
df['agg_col2'] = [int(pos % 3 == 0) for pos in range(len(df))]
df

Unnamed: 0,a,b,c,agg_col1,agg_col2
0,0,19,0,1,1
1,1,18,1,0,0
2,2,17,2,1,0
3,3,16,3,0,1
4,4,15,4,1,0
5,5,14,5,0,0
6,6,13,6,1,1
7,7,12,7,0,0
8,8,11,8,1,0
9,9,10,9,0,1


In [51]:
# Group by one key column and sum every other column per group.
df.groupby('agg_col1').sum()

Unnamed: 0_level_0,a,b,c,agg_col2
agg_col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100,90,100,3
1,90,100,90,4


In [52]:
# Dask-cuDF counterpart of the previous cell. ddf is rebuilt from df so it
# contains the new agg_col1/agg_col2 columns, and the grouped sum is run on
# the Dask-cuDF frame itself (not on df) and materialized with compute().
ddf = dask_cudf.from_cudf(df, npartitions=2)
ddf.groupby('agg_col1').sum().compute()

Unnamed: 0_level_0,a,b,c,agg_col2
agg_col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100,90,100,3
1,90,100,90,4


In [53]:
# Group by two key columns; the result is indexed by both keys.
df.groupby(['agg_col1', 'agg_col2']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
agg_col1,agg_col2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,73,60,73
0,1,27,30,27
1,0,54,60,54
1,1,36,40,36


In [54]:
# Same two-key grouped sum with Dask-cuDF; in the displayed output the
# groups come back in a different order than in the cuDF result.
ddf.groupby(['agg_col1', 'agg_col2']).sum().compute()

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
agg_col1,agg_col2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,36,40,36
0,0,73,60,73
1,0,54,60,54
0,1,27,30,27


In [55]:
# Apply a different aggregation to each column: max of 'a', mean of 'b',
# sum of 'c'.
df.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'})

Unnamed: 0_level_0,a,b,c
agg_col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,19,9.0,100
1,18,10.0,90


In [56]:
# Same per-column aggregations with Dask-cuDF.
ddf.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}).compute()

Unnamed: 0_level_0,a,b,c
agg_col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,18,10.0,90
0,19,9.0,100


In [57]:
# Transpose: build a small 3x2 frame to transpose in the next cell.
sample = cudf.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
sample


Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [58]:
# Swap rows and columns: column names become the index.
sample.transpose()

Unnamed: 0,0,1,2
a,1,2,3
b,4,5,6


In [59]:
# Time series example: 72 consecutive daily dates starting 2018-11-20,
# each paired with a random value. (The `datetime` import used by the
# next cell lives in the import cell at the top of the notebook.)
date_df = cudf.DataFrame()
date_df['date'] = pd.date_range('11/20/2018', periods=72, freq='D')
date_df['value'] = cp.random.sample(len(date_df))
date_df

Unnamed: 0,date,value
0,2018-11-20,0.374472
1,2018-11-21,0.826395
2,2018-11-22,0.099401
3,2018-11-23,0.341341
4,2018-11-24,0.161186
...,...,...
67,2019-01-26,0.489820
68,2019-01-27,0.826598
69,2019-01-28,0.974970
70,2019-01-29,0.266897


In [60]:
# Filter on a datetime column: keep rows dated on or before search_date,
# which is referenced from the query string with '@'.
search_date = dt.datetime.strptime('2018-11-23', '%Y-%m-%d')
date_df.query('date <= @search_date')

Unnamed: 0,date,value
0,2018-11-20,0.374472
1,2018-11-21,0.826395
2,2018-11-22,0.099401
3,2018-11-23,0.341341


In [61]:
# Same date filter with Dask-cuDF; the variable is passed through local_dict.
date_ddf = dask_cudf.from_cudf(date_df, npartitions=2)
date_ddf.query('date <= @search_date', local_dict={'search_date':search_date}).compute()

Unnamed: 0,date,value
0,2018-11-20,0.374472
1,2018-11-21,0.826395
2,2018-11-22,0.099401
3,2018-11-23,0.341341


In [62]:
# Categoricals: rebind gdf to a new frame and convert its string column
# 'grade' to the 'category' dtype.
gdf = cudf.DataFrame({"id": [1, 2, 3, 4, 5, 6], "grade":['a', 'b', 'b', 'a', 'a', 'e']})
gdf['grade'] = gdf['grade'].astype('category')
gdf


Unnamed: 0,id,grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [63]:
# Dask-cuDF version of the categorical frame.
dgdf = dask_cudf.from_cudf(gdf, npartitions=2)
dgdf.compute()


Unnamed: 0,id,grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [64]:
# The distinct category labels of the 'grade' column.
gdf.grade.cat.categories

StringIndex(['a' 'b' 'e'], dtype='object')

In [65]:
# The integer code stored for each row's category.
gdf.grade.cat.codes

0    0
1    1
2    1
3    0
4    0
5    2
dtype: uint8

In [66]:
# Same category codes from the Dask-cuDF DataFrame.
dgdf.grade.cat.codes.compute()

0    0
1    1
2    1
3    0
4    0
5    2
dtype: uint8

In [67]:
# Converting a cuDF and Dask-cuDF DataFrame to a pandas DataFrame.
# First, the cuDF rows that are converted in the next cell.
df.head()

Unnamed: 0,a,b,c,agg_col1,agg_col2
0,0,19,0,1,1
1,1,18,1,0,0
2,2,17,2,1,0
3,3,16,3,0,1
4,4,15,4,1,0


In [68]:
# cuDF -> pandas conversion of the first rows.
df.head().to_pandas()

Unnamed: 0,a,b,c,agg_col1,agg_col2
0,0,19,0,1,1
1,1,18,1,0,0
2,2,17,2,1,0
3,3,16,3,0,1
4,4,15,4,1,0


In [69]:
# Dask-cuDF -> pandas. head() already returns the first rows as a concrete
# frame, so there is no need to compute() the whole collection first.
ddf.head().to_pandas()

Unnamed: 0,a,b,c,agg_col1,agg_col2
0,0,19,0,1,1
1,1,18,1,0,0
2,2,17,2,1,0
3,3,16,3,0,1
4,4,15,4,1,0


In [70]:
# Converting a cuDF or Dask-cuDF DataFrame to a numpy ndarray.
# NOTE(review): DataFrame.as_matrix appears to have been removed from newer
# cuDF releases — confirm against the installed version.
df.as_matrix()

array([[ 0, 19,  0,  1,  1],
       [ 1, 18,  1,  0,  0],
       [ 2, 17,  2,  1,  0],
       [ 3, 16,  3,  0,  1],
       [ 4, 15,  4,  1,  0],
       [ 5, 14,  5,  0,  0],
       [ 6, 13,  6,  1,  1],
       [ 7, 12,  7,  0,  0],
       [ 8, 11,  8,  1,  0],
       [ 9, 10,  9,  0,  1],
       [10,  9, 10,  1,  0],
       [11,  8, 11,  0,  0],
       [12,  7, 12,  1,  1],
       [13,  6, 13,  0,  0],
       [14,  5, 14,  1,  0],
       [15,  4, 15,  0,  1],
       [16,  3, 16,  1,  0],
       [17,  2, 17,  0,  0],
       [18,  1, 18,  1,  1],
       [19,  0, 19,  0,  0]])

In [71]:
# Dask-cuDF: materialize to a cuDF DataFrame first, then convert as above.
ddf.compute().as_matrix()

array([[ 0, 19,  0,  1,  1],
       [ 1, 18,  1,  0,  0],
       [ 2, 17,  2,  1,  0],
       [ 3, 16,  3,  0,  1],
       [ 4, 15,  4,  1,  0],
       [ 5, 14,  5,  0,  0],
       [ 6, 13,  6,  1,  1],
       [ 7, 12,  7,  0,  0],
       [ 8, 11,  8,  1,  0],
       [ 9, 10,  9,  0,  1],
       [10,  9, 10,  1,  0],
       [11,  8, 11,  0,  0],
       [12,  7, 12,  1,  1],
       [13,  6, 13,  0,  0],
       [14,  5, 14,  1,  0],
       [15,  4, 15,  0,  1],
       [16,  3, 16,  1,  0],
       [17,  2, 17,  0,  0],
       [18,  1, 18,  1,  1],
       [19,  0, 19,  0,  0]])

In [72]:
# Convert a single column to a 1-D array.
# NOTE(review): Series.to_array appears to have been removed from newer
# cuDF releases — confirm against the installed version.
df['a'].to_array()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [73]:
# Dask-cuDF: compute the column to a cuDF Series, then convert as above.
ddf['a'].compute().to_array()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [74]:
# Converting a cuDF or Dask-cuDF DataFrame to a PyArrow Table.
df.to_arrow()

pyarrow.Table
a: int64
b: int64
c: int64
agg_col1: int64
agg_col2: int64

In [75]:
# Dask-cuDF: materialize to cuDF first, then convert to a PyArrow Table.
ddf.compute().to_arrow()

pyarrow.Table
a: int64
b: int64
c: int64
agg_col1: int64
agg_col2: int64
index: int64

In [76]:
# Make sure the output directory exists. exist_ok=True keeps the cell safe
# to re-run and avoids the check-then-create gap of exists() + mkdir().
os.makedirs('example_output', exist_ok=True)

# Write the cuDF DataFrame to CSV without the index column.
df.to_csv('example_output/foo.csv', index=False)


In [77]:
# Dask-cuDF: materialize to cuDF first, then write one CSV file.
ddf.compute().to_csv('example_output/foo_dask.csv', index=False)

In [78]:
# Read the CSV back with cuDF. Note: this rebinds df to the frame that was
# loaded from disk.
df = cudf.read_csv('example_output/foo.csv')
df


Unnamed: 0,a,b,c,agg_col1,agg_col2
0,0,19,0,1,1
1,1,18,1,0,0
2,2,17,2,1,0
3,3,16,3,0,1
4,4,15,4,1,0
5,5,14,5,0,0
6,6,13,6,1,1
7,7,12,7,0,0
8,8,11,8,1,0
9,9,10,9,0,1


In [79]:
# Read a single CSV file with Dask-cuDF (rebinds ddf).
ddf = dask_cudf.read_csv('example_output/foo_dask.csv')
ddf.compute()

Unnamed: 0,a,b,c,agg_col1,agg_col2
0,0,19,0,1,1
1,1,18,1,0,0
2,2,17,2,1,0
3,3,16,3,0,1
4,4,15,4,1,0
5,5,14,5,0,0
6,6,13,6,1,1
7,7,12,7,0,0
8,8,11,8,1,0
9,9,10,9,0,1


In [80]:
# Read every CSV matching the glob pattern into one Dask-cuDF DataFrame;
# both foo.csv and foo_dask.csv were written to example_output/ above.
ddf = dask_cudf.read_csv('example_output/*.csv')
ddf.compute()


Unnamed: 0,a,b,c,agg_col1,agg_col2
0,0,19,0,1,1
1,1,18,1,0,0
2,2,17,2,1,0
3,3,16,3,0,1
4,4,15,4,1,0
5,5,14,5,0,0
6,6,13,6,1,1
7,7,12,7,0,0
8,8,11,8,1,0
9,9,10,9,0,1


In [81]:
# Writing to parquet files, using the CPU via PyArrow.
# The cuDF DataFrame df is written to example_output/temp_parquet.
df.to_parquet('example_output/temp_parquet')

In [82]:
# Reading parquet files with a GPU-accelerated parquet reader.
# This rebinds df to the frame read back from the file written above.
df = cudf.read_parquet('example_output/temp_parquet')
df


Unnamed: 0,a,b,c,agg_col1,agg_col2
0,0,19,0,1,1
1,1,18,1,0,0
2,2,17,2,1,0
3,3,16,3,0,1
4,4,15,4,1,0
5,5,14,5,0,0
6,6,13,6,1,1
7,7,12,7,0,0
8,8,11,8,1,0
9,9,10,9,0,1


In [83]:
# Writing to parquet files from a dask_cudf.DataFrame using PyArrow under the hood.
# Note: the target 'example_files' is relative to the working directory,
# not inside example_output/ like the files written earlier.
ddf.to_parquet('example_files')

In [84]:
# Write the Dask-cuDF DataFrame to ORC; the displayed result lists one
# part file per partition under example_files_orc/.
ddf.to_orc('example_files_orc')

('/jupyterhub-homes/2044/example_files_orc/part.0.orc',
 '/jupyterhub-homes/2044/example_files_orc/part.1.orc')

In [85]:
# Read one of the ORC part files back with cuDF.
df2 = cudf.read_orc('example_files_orc/part.0.orc')
df2

Unnamed: 0,index,a,b,c,agg_col1,agg_col2
0,0,0,19,0,1,1
1,1,1,18,1,0,0
2,2,2,17,2,1,0
3,3,3,16,3,0,1
4,4,4,15,4,1,0
5,5,5,14,5,0,0
6,6,6,13,6,1,1
7,7,7,12,7,0,0
8,8,8,11,8,1,0
9,9,9,10,9,0,1
