In [1]:
import pandas as pd
import numpy as np
from itertools import product
import turicreate
from turicreate import SFrame, SArray
from sklearn.utils.extmath import cartesian

### **Task**: What is the fastest way to compute the cross (Cartesian) product of two arrays `x` and `y` and store it in a dataframe, with xsize=100 and ysize=100000 --> 10,000,000 (10 million) rows?

### **1. Using Pandas `Merge`**

In [2]:
def pandas_merge(xsize, ysize):
    """Build the cross product of ``range(xsize)`` and ``range(ysize)`` with ``pd.merge``.

    Both frames receive a constant ``temp`` key, so merging on it pairs every
    ``x`` row with every ``y`` row; the helper key is dropped afterwards.

    Parameters
    ----------
    xsize : int
        Number of ``x`` values (``0 .. xsize - 1``).
    ysize : int
        Number of ``y`` values (``0 .. ysize - 1``).

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` and ``y`` with ``xsize * ysize`` rows.
    """
    dfx = pd.DataFrame({'x': list(range(xsize))}).assign(temp=0)
    dfy = pd.DataFrame({'y': list(range(ysize))}).assign(temp=0)

    # Inner join, not outer: an outer join keeps unmatched rows, so an empty
    # side would yield NaN-padded rows instead of an empty cross product.
    df_final = pd.merge(dfx, dfy, on=['temp'], how='inner').drop('temp', axis=1)
    return df_final

### **2. Using Pandas `Stack`**

In [3]:
def pandas_stack(xsize, ysize):
    """Build the cross product of ``range(xsize)`` and ``range(ysize)`` via ``stack``.

    A one-row frame holding all ``x`` values is created against an index built
    from the ``y`` values, then stacked into long form.

    Returns a DataFrame whose columns are ``y`` then ``x`` (in that order).
    """
    x_frame = pd.DataFrame({'x': list(range(xsize))})
    y_frame = pd.DataFrame({'y': list(range(ysize))})

    # One row of x values, indexed by the y series.
    wide = pd.DataFrame([x_frame['x'].tolist()], index=[y_frame['y']])
    # Long form: one entry per (y, column position) pair holding the x value.
    stacked = wide.stack()
    # Drop the column-position level, leaving only the y level in the index.
    indexed_by_y = stacked.reset_index(level=1, drop=True)
    return indexed_by_y.reset_index(name='x')

### **3. Using Itertools `product`**

In [4]:
def itertools_product(xsize, ysize):
    """Build the cross product of ``range(xsize)`` and ``range(ysize)`` with ``itertools.product``.

    Every ``(x, y)`` tuple is materialised in a Python list before being handed
    to pandas. Returns a DataFrame with columns ``x`` and ``y``.
    """
    pairs = list(product(range(xsize), range(ysize)))
    return pd.DataFrame(pairs, columns=['x', 'y'])

### **4. Using Numpy `np.tile` and `np.repeat`**

In [5]:
def numpy_tile_repeat(xsize, ysize):
    """Build the cross product of ``range(xsize)`` and ``range(ysize)`` with ``np.tile`` / ``np.repeat``.

    The ``x`` values are tiled once per ``y`` value and each ``y`` value is
    repeated once per ``x`` value, so the two arrays line up pairwise.
    Returns a DataFrame with columns ``x`` and ``y``.
    """
    xs = list(range(xsize))
    ys = list(range(ysize))
    columns = {
        'x': np.tile(xs, len(ys)),    # x cycles fastest
        'y': np.repeat(ys, len(xs)),  # each y held for one full x cycle
    }
    return pd.DataFrame(columns)

### **5. Using Numpy `dstack` and `meshgrid`**

In [6]:
def numpy_dstack_meshgrid(xsize, ysize):
    """Build the cross product of ``range(xsize)`` and ``range(ysize)`` with ``np.meshgrid`` / ``np.dstack``.

    The two coordinate grids are stacked depth-wise and flattened into an
    array of ``(x, y)`` rows. Returns a DataFrame with columns ``x`` and ``y``.
    """
    xs = list(range(xsize))
    ys = list(range(ysize))
    grid_x, grid_y = np.meshgrid(xs, ys)
    # Stack the grids along a new last axis, then flatten to two columns.
    pairs = np.dstack((grid_x, grid_y)).reshape(-1, 2)
    return pd.DataFrame(pairs, columns=['x', 'y'])

### **6. Using Turicreate join**

In [7]:
def turicreate_join(xsize, ysize):
    """Build the cross product of ``range(xsize)`` and ``range(ysize)`` with an SFrame join.

    Both SFrames receive a constant ``temp`` column that serves as the join
    key; the key is removed from the joined result.

    Returns the joined SFrame as-is (it is not converted to a pandas DataFrame).
    """
    left = SFrame({'x': SArray(list(range(xsize)))}).add_column(0, 'temp')
    right = SFrame({'y': SArray(list(range(ysize)))}).add_column(0, 'temp')

    joined = left.join(right, on=['temp'], how='outer')
    return joined.remove_column('temp')

### **7. Using `sklearn.utils.extmath.cartesian`**

In [8]:
def sklearn_cartesian(xsize, ysize):
    """Build the cross product of ``range(xsize)`` and ``range(ysize)`` with sklearn's ``cartesian``.

    The array returned by ``cartesian`` is wrapped in a DataFrame with
    columns ``x`` and ``y``.
    """
    axes = (list(range(xsize)), list(range(ysize)))
    pairs = cartesian(axes)
    return pd.DataFrame(pairs, columns=['x', 'y'])

# Benchmarking the time taken

In [9]:
# Benchmark dimensions: each method produces xsize * ysize rows.
xsize = 100
ysize = 100_000

In [10]:
%%time
# Time method 1 (pandas merge on a constant key) at the benchmark size.
df = pandas_merge(xsize,ysize)

CPU times: user 459 ms, sys: 351 ms, total: 810 ms
Wall time: 817 ms


In [11]:
%%time
# Time method 2 (pandas stack) at the benchmark size.
df = pandas_stack(xsize,ysize)

CPU times: user 323 ms, sys: 236 ms, total: 559 ms
Wall time: 565 ms


In [12]:
%%time
# Time method 3 (itertools.product) at the benchmark size.
df = itertools_product(xsize,ysize)

CPU times: user 5.42 s, sys: 876 ms, total: 6.3 s
Wall time: 6.35 s


In [13]:
%%time
# Time method 4 (np.tile / np.repeat) at the benchmark size.
df = numpy_tile_repeat(xsize,ysize)

CPU times: user 82.9 ms, sys: 108 ms, total: 191 ms
Wall time: 191 ms


In [14]:
%%time
# Time method 5 (np.dstack / np.meshgrid) at the benchmark size.
df = numpy_dstack_meshgrid(xsize, ysize)

CPU times: user 105 ms, sys: 55.2 ms, total: 160 ms
Wall time: 165 ms


In [15]:
%%time
# Time method 7 (sklearn.utils.extmath.cartesian) at the benchmark size.
df = sklearn_cartesian(xsize, ysize)

CPU times: user 75.2 ms, sys: 116 ms, total: 191 ms
Wall time: 192 ms


# Results 
Machine details: 4 GB RAM, 1 core. The methods are listed below from fastest to slowest by wall time (`turicreate_join` is defined above but was not timed).
1. **`numpy_dstack_meshgrid` (rank1) (165ms)**
2. **`numpy_tile_repeat` (rank2) (191ms)**
3. **`sklearn_cartesian` (rank2) (192ms)**
4. **`pandas_stack` (rank3) (565ms)**
5. **`pandas_merge` (rank4) (817ms)**
6. **`itertools_product` (rank5) (6.35s)**