In [1]:
import pandas as pd
import numpy as np

from numpy.random import random_integers

In [2]:
# Baseline problem size: largest user id, largest item id, number of records.
N    = 10000
K    =  1000
recs = 40000

def get_sample_df(N = N, K = K, recs = recs, seed = None):
    """Build a random table of (user, item) records.

    Parameters
    ----------
    N : int
        Largest user id; column ``u`` is drawn uniformly from ``0..N`` inclusive.
    K : int
        Largest item id; column ``i`` is drawn uniformly from ``0..K`` inclusive.
    recs : int
        Number of rows to generate.
    seed : int or None
        When given, a private ``RandomState(seed)`` is used so the same
        arguments always produce the same frame. When ``None`` (default) the
        global NumPy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        ``recs`` rows with integer columns ``u`` and ``i``.
    """
    # The defaults above are bound once, at definition time; re-assigning the
    # module-level N / K / recs later does not change them.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # randint's upper bound is exclusive, so "+ 1" keeps the inclusive range
    # that the deprecated numpy.random.random_integers provided.
    df = pd.DataFrame({
            'u': rng.randint(0, N + 1, size = recs),
            'i': rng.randint(0, K + 1, size = recs)
            })
    return df

def get_dummied(df, sparse = True):
    """One-hot encode column ``i`` of ``df`` and report the resulting shape.

    ``sparse`` is forwarded to ``pandas.get_dummies``. The shape of the
    encoded frame is printed before the frame is returned.
    """
    options = dict(columns = ["i"], sparse = sparse)
    encoded = pd.get_dummies(df, **options)
    print(encoded.shape)
    return encoded

In [3]:
# Baseline problem size for this sweep (largest user id, largest item id, rows).
N    = 10000
K    =  1000
recs = 40000

# Each entry is a triple of multipliers applied elementwise to [N, K, recs].
tests = [
    [1, 1, 1],
    [10, 1, 1],
    [1, 10, 1],
    [1, 1, 10],
    [10, 1, 10]
]

# Time get_dummied with its default sparse=True for every scaling.
for t in tests:
    print (t)
    # ndarray * list multiplies elementwise, giving the scaled (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    %time get_dummied(s)
    

[1, 1, 1]
(40000, 2)
(40000, 1002)
CPU times: user 3.07 s, sys: 17.8 ms, total: 3.08 s
Wall time: 3.08 s
[10, 1, 1]
(40000, 2)
(40000, 1002)
CPU times: user 3.12 s, sys: 5.36 ms, total: 3.13 s
Wall time: 3.13 s
[1, 10, 1]
(40000, 2)
(40000, 9803)
CPU times: user 4min 10s, sys: 1.94 s, total: 4min 12s
Wall time: 4min 12s
[1, 1, 10]
(400000, 2)
(400000, 1002)
CPU times: user 4.3 s, sys: 13.5 ms, total: 4.31 s
Wall time: 4.31 s
[10, 1, 10]
(400000, 2)
(400000, 1002)
CPU times: user 4.32 s, sys: 12.7 ms, total: 4.34 s
Wall time: 4.33 s


In [4]:
# Same sweep over `tests`, this time with dense (sparse=False) dummy columns.
# Relies on tests, N, K and recs already defined in the kernel.
for t in tests:
    print (t)
    # ndarray * list multiplies elementwise, giving the scaled (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    %time get_dummied(s, sparse=False)

[1, 1, 1]
(40000, 2)
(40000, 1002)
CPU times: user 269 ms, sys: 48.4 ms, total: 318 ms
Wall time: 318 ms
[10, 1, 1]
(40000, 2)
(40000, 1002)
CPU times: user 267 ms, sys: 47.9 ms, total: 315 ms
Wall time: 315 ms
[1, 10, 1]
(40000, 2)
(40000, 9831)
CPU times: user 2.7 s, sys: 1.51 s, total: 4.21 s
Wall time: 4.42 s
[1, 1, 10]
(400000, 2)
(400000, 1002)
CPU times: user 6.41 s, sys: 1.77 s, total: 8.18 s
Wall time: 9.86 s
[10, 1, 10]
(400000, 2)
(400000, 1002)
CPU times: user 6.39 s, sys: 1.45 s, total: 7.83 s
Wall time: 7.85 s


In [5]:
from scipy.sparse import csr_matrix

def get_csr(df, shape = None):
    """Build a user x item count matrix in CSR format from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have non-negative integer columns ``u`` (row index) and ``i``
        (column index). Each record contributes 1.0 at ``(u, i)``; repeated
        pairs are summed by ``csr_matrix``.
    shape : tuple of (int, int) or None
        Explicit ``(n_rows, n_cols)``. When ``None`` (default, the original
        behaviour) scipy infers the shape from the largest indices present,
        so the result can be smaller than the id space if the top ids were
        never drawn. Pass a shape to get a stable size, or to handle an empty
        ``df`` (scipy cannot infer dimensions from empty index arrays).

    Returns
    -------
    scipy.sparse.csr_matrix
        float64 matrix of counts; its shape is printed before returning.
    """
    # Hand scipy plain arrays so the pandas index plays no part in construction.
    rows = np.asarray(df.u)
    cols = np.asarray(df.i)
    d = csr_matrix((np.ones(len(df)), (rows, cols)), shape = shape)
    print (d.shape)
    return d

In [6]:
# Tiny sanity check, densified for display. Ids are inclusive of the upper
# bound, so N=4, K=3 can give up to 5 rows x 4 columns; repeated (u, i) pairs
# are summed, so entries may exceed 1.
get_csr(get_sample_df(4, 3, 10)).toarray()

(5, 4)


array([[ 1.,  0.,  0.,  0.],
       [ 0.,  2.,  0.,  0.],
       [ 1.,  2.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  1.,  2.,  1.]])

In [7]:
# Reset the baseline size, then time CSR construction over the same sweep.
N    = 10000
K    =  1000
recs = 40000

# Relies on `tests` already defined in the kernel.
for t in tests:
    print (t)
    # ndarray * list multiplies elementwise, giving the scaled (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    %time get_csr(s)

[1, 1, 1]
(40000, 2)
(10001, 1001)
CPU times: user 2.28 ms, sys: 13 µs, total: 2.29 ms
Wall time: 2.32 ms
[10, 1, 1]
(40000, 2)
(99997, 1001)
CPU times: user 3.58 ms, sys: 155 µs, total: 3.74 ms
Wall time: 3.58 ms
[1, 10, 1]
(40000, 2)
(10001, 10000)
CPU times: user 2.19 ms, sys: 2 µs, total: 2.19 ms
Wall time: 2.2 ms
[1, 1, 10]
(400000, 2)
(10001, 1001)
CPU times: user 22.7 ms, sys: 60 µs, total: 22.7 ms
Wall time: 22.8 ms
[10, 1, 10]
(400000, 2)
(100001, 1001)
CPU times: user 19.7 ms, sys: 50 µs, total: 19.7 ms
Wall time: 19.8 ms


In [8]:
# Much larger baseline for the CSR-only sweep. These assignments rebind the
# kernel-global N, K and recs that later cells read.
N    = 500000
K    =  10000
recs = 4000000

# Relies on `tests` already defined in the kernel.
for t in tests:
    print (t)
    # ndarray * list multiplies elementwise, giving the scaled (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    %time get_csr(s)

[1, 1, 1]
(4000000, 2)
(500001, 10001)
CPU times: user 214 ms, sys: 4.88 ms, total: 219 ms
Wall time: 219 ms
[10, 1, 1]
(4000000, 2)
(5000001, 10001)
CPU times: user 508 ms, sys: 5.98 ms, total: 513 ms
Wall time: 515 ms
[1, 10, 1]
(4000000, 2)
(500001, 100001)
CPU times: user 214 ms, sys: 2.04 ms, total: 216 ms
Wall time: 216 ms
[1, 1, 10]
(40000000, 2)
(500001, 10001)
CPU times: user 2.73 s, sys: 111 ms, total: 2.84 s
Wall time: 2.85 s
[10, 1, 10]
(40000000, 2)
(5000001, 10001)
CPU times: user 4.58 s, sys: 112 ms, total: 4.7 s
Wall time: 4.7 s


In [12]:
# Build one matrix from whatever N, K, recs currently hold in the kernel
# (NOTE(review): the execution counter jumps here, so confirm their values),
# then time elementwise sparse + sparse addition.
m = get_csr(get_sample_df(N, K, recs))
%time m+m

(500001, 10001)
CPU times: user 21.9 ms, sys: 51 µs, total: 21.9 ms
Wall time: 22 ms


<500001x10001 sparse matrix of type '<type 'numpy.float64'>'
	with 3998377 stored elements in Compressed Sparse Row format>

In [19]:
foo = np.array([random_integers(low=-1, high=1, size=N+1)])
print(foo.shape)
print(m.shape)
%time foo * m

(1, 500001)
(500001, 10001)
CPU times: user 8.36 ms, sys: 152 µs, total: 8.51 ms
Wall time: 8.23 ms


array([[  3., -11., -15., ..., -10.,   2., -17.]])

In [21]:
from numba import jit

# Wrap get_csr with numba's jit and repeat the large CSR sweep.
# NOTE(review): the recorded timings are about the same as the un-jitted runs,
# presumably because the work happens inside numpy/scipy calls; confirm.
get_csr2 = jit(get_csr)

N    = 500000
K    =  10000
recs = 4000000

# Relies on `tests` already defined in the kernel.
for t in tests:
    print (t)
    # ndarray * list multiplies elementwise, giving the scaled (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    # %timeit calls the function several times, so the shape printed inside
    # get_csr appears once per run in the output.
    %timeit get_csr2(s)

[1, 1, 1]
(4000000, 2)
(500001, 10001)

(500001, 10001)

(500001, 10001)

(500001, 10001)

1 loops, best of 3: 225 ms per loop
[10, 1, 1]
(4000000, 2)
(5000001, 10001)

(5000001, 10001)

(5000001, 10001)

(5000001, 10001)

1 loops, best of 3: 338 ms per loop
[1, 10, 1]
(4000000, 2)
(500001, 100001)

(500001, 100001)

(500001, 100001)

(500001, 100001)

1 loops, best of 3: 226 ms per loop
[1, 1, 10]
(40000000, 2)
(500001, 10001)

(500001, 10001)

(500001, 10001)

(500001, 10001)

1 loops, best of 3: 2.82 s per loop
[10, 1, 10]
(40000000, 2)
(5000001, 10001)

(5000001, 10001)

(5000001, 10001)

(5000001, 10001)

1 loops, best of 3: 3.4 s per loop


In [22]:
@jit
def test1():
    """Return ``foo * m`` using the kernel-global ``foo`` and ``m``."""
    # Both operands are globals, defined in earlier cells.
    return foo * m

# Compare against the plain `%time foo * m` measurement taken earlier.
%timeit test1()

The slowest run took 5.42 times longer than the fastest. This could mean that an intermediate result is being cached 
100 loops, best of 3: 8.09 ms per loop
