In [1]:
import pandas as pd
import numpy as np

from numpy.random import random_integers

In [2]:
N    = 10000
K    =  1000
recs = 40000

def get_sample_df(N = N, K = K, recs = recs, seed = None):
    """Build a random (user, item) interaction frame.

    Parameters
    ----------
    N : int
        Largest user id; column ``u`` is drawn uniformly from ``0..N`` inclusive.
    K : int
        Largest item id; column ``i`` is drawn uniformly from ``0..K`` inclusive.
    recs : int
        Number of rows to generate.
    seed : int or None, optional
        When given, a private ``RandomState(seed)`` is used so the frame is
        reproducible. When ``None`` (default) NumPy's global random state is
        used, as before.

    Returns
    -------
    pandas.DataFrame
        ``recs`` rows with integer columns ``u`` and ``i``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # random_integers is deprecated; randint is its documented replacement.
    # randint's upper bound is exclusive, hence the "+ 1" to keep N and K
    # themselves reachable.
    df = pd.DataFrame({
            'u': rng.randint(0, N + 1, size = recs),
            'i': rng.randint(0, K + 1, size = recs)
            })
    return df

def get_dummied(df, sparse = True):
    """One-hot encode column ``i`` of ``df`` via ``pandas.get_dummies``.

    The shape of the encoded frame is printed, then the frame is returned.
    ``sparse`` is forwarded unchanged to ``pandas.get_dummies``.
    """
    encoded = pd.get_dummies(df, sparse=sparse, columns=["i"])
    print(encoded.shape)
    return encoded

In [None]:
# Baseline problem size; every entry of `tests` scales these three values.
N    = 10000
K    =  1000
recs = 40000

# Each row is a triple of multipliers applied element-wise to [N, K, recs].
tests = [
    [1, 1, 1],
    [10, 1, 1],
    [1, 10, 1],
    [1, 1, 10],
    [10, 1, 10]
]

# NOTE(review): the recorded output stops after the second case and the cell
# has no execution count -- presumably the run was interrupted; confirm.
for t in tests:
    print (t)
    # Scale the baseline sizes and unpack them as (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    # Time the one-hot encoding with get_dummied's default sparse=True.
    %time get_dummied(s)
    

[1, 1, 1]
(40000, 2)
(40000, 1002)
CPU times: user 3.42 s, sys: 17.8 ms, total: 3.44 s
Wall time: 3.45 s
[10, 1, 1]
(40000, 2)


In [5]:
# Same sweep over `tests` as the sparse run, but with dense get_dummies output.
for t in tests:
    print (t)
    # Scale the baseline sizes and unpack them as (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    # Time the one-hot encoding with sparse=False.
    %time get_dummied(s, sparse=False)

[1, 1, 1]
(40000, 2)
(40000, 1002)
CPU times: user 265 ms, sys: 47.6 ms, total: 312 ms
Wall time: 312 ms
[10, 1, 1]
(40000, 2)
(40000, 1002)
CPU times: user 262 ms, sys: 47.3 ms, total: 310 ms
Wall time: 310 ms
[1, 10, 1]
(40000, 2)
(40000, 9806)
CPU times: user 2.93 s, sys: 2.16 s, total: 5.09 s
Wall time: 5.11 s
[1, 1, 10]
(400000, 2)
(400000, 1002)
CPU times: user 6.46 s, sys: 1.81 s, total: 8.27 s
Wall time: 8.3 s
[10, 1, 10]
(400000, 2)
(400000, 1002)
CPU times: user 6.47 s, sys: 1.94 s, total: 8.42 s
Wall time: 8.44 s


In [6]:
from scipy.sparse import csr_matrix

def get_csr(df, shape = None):
    """Build a sparse CSR count matrix from a (user, item) frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with integer columns ``u`` (row index) and ``i`` (column index).
    shape : tuple of (int, int) or None, optional
        Explicit ``(n_rows, n_cols)`` of the result. When ``None`` (default)
        the shape is inferred from the largest ``u`` and ``i`` present, so it
        can differ between samples drawn with the same id ranges, and an empty
        frame cannot be converted. Pass e.g. ``(N + 1, K + 1)`` to get a
        stable shape.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float matrix with one unit of weight per row of ``df``; repeated
        ``(u, i)`` pairs are summed.
    """
    rows = np.asarray(df.u)
    cols = np.asarray(df.i)
    # One 1.0 per record; the COO -> CSR conversion adds up duplicates.
    d = csr_matrix((np.ones(len(df)), (rows, cols)), shape = shape)
    print (d.shape)
    return d

In [7]:
# Tiny sanity check: 10 random records shown as a dense array, so that repeated
# (u, i) pairs are visible as entries greater than 1.
get_csr(get_sample_df(4, 3, 10)).toarray()

(5, 4)


array([[ 0.,  0.,  0.,  0.],
       [ 2.,  1.,  2.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  2.],
       [ 1.,  1.,  0.,  0.]])

In [8]:
# Same baseline sizes as the get_dummies benchmarks, for a like-for-like comparison.
N    = 10000
K    =  1000
recs = 40000

for t in tests:
    print (t)
    # Scale the baseline sizes and unpack them as (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    # Time building the CSR matrix; get_csr prints the inferred matrix shape.
    %time get_csr(s)

[1, 1, 1]
(40000, 2)
(10001, 1001)
CPU times: user 2.29 ms, sys: 2 µs, total: 2.29 ms
Wall time: 2.3 ms
[10, 1, 1]
(40000, 2)
(99999, 1001)
CPU times: user 3.38 ms, sys: 86 µs, total: 3.47 ms
Wall time: 3.37 ms
[1, 10, 1]
(40000, 2)
(10001, 10001)
CPU times: user 2.23 ms, sys: 6 µs, total: 2.23 ms
Wall time: 2.23 ms
[1, 1, 10]
(400000, 2)
(10001, 1001)
CPU times: user 20.8 ms, sys: 45 µs, total: 20.9 ms
Wall time: 20.9 ms
[10, 1, 10]
(400000, 2)
(100001, 1001)
CPU times: user 19.5 ms, sys: 42 µs, total: 19.6 ms
Wall time: 19.6 ms


In [9]:
# Larger baseline: 50x the users, 10x the items and 100x the records of the
# earlier runs. These values stay in the kernel for the cells that follow.
N    = 500000
K    =  10000
recs = 4000000

for t in tests:
    print (t)
    # Scale the baseline sizes and unpack them as (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    # Time building the CSR matrix; get_csr prints the inferred matrix shape.
    %time get_csr(s)

[1, 1, 1]
(4000000, 2)
(500001, 10001)
CPU times: user 247 ms, sys: 3.4 ms, total: 250 ms
Wall time: 250 ms
[10, 1, 1]
(4000000, 2)
(5000001, 10001)
CPU times: user 513 ms, sys: 5.39 ms, total: 519 ms
Wall time: 520 ms
[1, 10, 1]
(4000000, 2)
(500001, 100001)
CPU times: user 314 ms, sys: 1.48 ms, total: 316 ms
Wall time: 316 ms
[1, 1, 10]
(40000000, 2)
(500001, 10001)
CPU times: user 2.76 s, sys: 114 ms, total: 2.87 s
Wall time: 2.88 s
[10, 1, 10]
(40000000, 2)
(5000001, 10001)
CPU times: user 4.75 s, sys: 113 ms, total: 4.86 s
Wall time: 4.87 s


In [10]:
# Keep one CSR matrix at the current baseline size in `m`; later cells reuse it.
m = get_csr(get_sample_df(N, K, recs))
# Time element-wise addition of the sparse matrix with itself.
%time m+m

(500001, 10001)
CPU times: user 22.1 ms, sys: 1.78 ms, total: 23.9 ms
Wall time: 24 ms


<500001x10001 sparse matrix of type '<type 'numpy.float64'>'
	with 3998406 stored elements in Compressed Sparse Row format>

In [25]:
foo = np.array([random_integers(low=-1, high=1, size=N+1)])
print(foo.shape)
print(m.shape)
%timeit foo * m

(1, 500001)
(500001, 10001)
100 loops, best of 3: 8.2 ms per loop


In [12]:
from numba import jit

# Wrap get_csr with numba's jit to see whether it speeds up matrix construction.
get_csr2 = jit(get_csr)

N    = 500000
K    =  10000
recs = 4000000

for t in tests:
    print (t)
    # Scale the baseline sizes and unpack them as (N, K, recs).
    s = get_sample_df(*list(np.array([N, K, recs]) * t))
    print(s.shape)
    # %timeit calls the function several times, and get_csr prints the shape
    # on every call, hence the repeated shape lines in the output.
    # The recorded timings are close to the un-jitted get_csr runs at the same sizes.
    %timeit get_csr2(s)

[1, 1, 1]
(4000000, 2)
(500001, 10001)

(500001, 10001)

(500001, 10001)

(500001, 10001)

1 loops, best of 3: 243 ms per loop
[10, 1, 1]
(4000000, 2)
(5000001, 10001)

(5000001, 10001)

(5000001, 10001)

(5000001, 10001)

1 loops, best of 3: 579 ms per loop
[1, 10, 1]
(4000000, 2)
(500001, 100001)

(500001, 100001)

(500001, 100001)

(500001, 100001)

1 loops, best of 3: 281 ms per loop
[1, 1, 10]
(40000000, 2)
(500001, 10001)

(500001, 10001)

(500001, 10001)

(500001, 10001)

1 loops, best of 3: 2.83 s per loop
[10, 1, 10]
(40000000, 2)
(5000001, 10001)

(5000001, 10001)

(5000001, 10001)

(5000001, 10001)

1 loops, best of 3: 4.85 s per loop




In [26]:
@jit
def test1():
    """Return the product ``foo * m`` of the notebook globals ``foo`` and ``m``."""
    product = foo * m
    return product

# Time the jit-wrapped product, for comparison with the plain `foo * m` timing.
%timeit test1()

The slowest run took 4.29 times longer than the fastest. This could mean that an intermediate result is being cached 
100 loops, best of 3: 8.22 ms per loop


In [24]:
# Time converting the CSR matrix to CSC format.
%time m.tocsc()
# Convert again outside the timing so the result is kept in `n`.
n=m.tocsc()
# Show both reprs to confirm same shape and stored-element count, different format.
print(repr(m), repr(n))

CPU times: user 77.5 ms, sys: 606 µs, total: 78.1 ms
Wall time: 77.8 ms
("<500001x10001 sparse matrix of type '<type 'numpy.float64'>'\n\twith 3998406 stored elements in Compressed Sparse Row format>", "<500001x10001 sparse matrix of type '<type 'numpy.float64'>'\n\twith 3998406 stored elements in Compressed Sparse Column format>")
