# simple pandas vs pandas: DataFrame

In [1]:
from spandas import DataFrame
import pandas as pd
import time
import numpy as np
import math

## Append

In [2]:
df = pd.DataFrame({'a': [], 'b': []})
df_append = pd.DataFrame({'a': [1], 'b': [1]})

In [3]:
start = time.time()
for i in range(1000):
    df = df.append(df_append)
end = time.time()
time_df = end - start

In [4]:
fdf = DataFrame({'a': [], 'b': []})
fdf_append = DataFrame({'a': [1], 'b': [1]})

In [5]:
start = time.time()
for i in range(1000):
    fdf = fdf.append(fdf_append)
end = time.time()
time_fdf = end - start

In [6]:
assert(len(fdf) == len(df))
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.54956, simple pandas time: 0.020974
boost times: 26.202


In [7]:
start = time.time()
df = df.append([df_append for _ in range(10000)])
end = time.time()
time_df = end - start

In [8]:
start = time.time()
fdf = fdf.append([fdf_append for _ in range(10000)])
end = time.time()
time_fdf = end - start

In [9]:
assert(len(fdf) == len(df))
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.6971, simple pandas time: 0.028922
boost times: 24.103


## iterrows

In [10]:
d = {'a': np.zeros(10000), 'b': np.zeros(10000), 'c': np.zeros(10000)}
df = pd.DataFrame(d)
fdf = DataFrame(d)

In [11]:
start = time.time()
for k, v in df.iterrows():
    pass
end = time.time()
time_df = end - start

In [12]:
start = time.time()
for k, v in fdf.iterrows():
    pass
end = time.time()
time_fdf = end - start

In [13]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.42995, simple pandas time: 0.011965
boost times: 35.935


## apply

In [14]:
d = {'math': np.random.randint(50, 100, size=100000), 
     'chinese': np.random.randint(50, 100, size=100000), 
     'english': np.random.randint(50, 100, size=100000)}
df = pd.DataFrame(d)
fdf = DataFrame(d)

### element-wise

In [15]:
start = time.time()
tmp = df.applymap(lambda x: math.sqrt(x)*10)
end = time.time()
time_df = end - start

In [16]:
start = time.time()
tmp = fdf.apply(lambda x: math.sqrt(x)*10)
end = time.time()
time_df = end - start

In [17]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.10568, simple pandas time: 0.011965
boost times: 8.8329


during the test, something insteresting happens:

In [18]:
start = time.time()
tmp = df.applymap(lambda x: np.sqrt(x)*10)
end = time.time()
time_df = end - start

In [19]:
start = time.time()
tmp = fdf.apply(lambda x: np.sqrt(x)*10)
end = time.time()
time_df = end - start

In [20]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.50967, simple pandas time: 0.011965
boost times: 42.597


for numpy function, pandas will take more time.

### row-wise

In [21]:
start = time.time()
df['sum'] = df.apply(lambda x: x['math'] +x['chinese'] + x['english'], axis=1)
end = time.time()
time_df = end - start

In [22]:
start = time.time()
fdf[['sum']] = fdf.apply(lambda x: x['math'] +x['chinese'] + x['english'], type='row')
end = time.time()
time_fdf = end - start

In [23]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 2.8633, simple pandas time: 0.15658
boost times: 18.286


### column-wise

In [24]:
start = time.time()
for i in range(100):
    tmp = df.apply(np.mean, axis=0)
end = time.time()
time_df = end - start
time_df

0.39696621894836426

In [25]:
start = time.time()
for i in range(100):
    tmp = fdf.apply(np.mean, type='column')
end = time.time()
time_fdf = end - start
time_fdf

0.05089926719665527

In [26]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.39697, simple pandas time: 0.050899
boost times: 7.7991


## Sample

In [27]:
d = {'math': np.random.randint(50, 100, size=1000000), 
     'chinese': np.random.randint(50, 100, size=1000000), 
     'english': np.random.randint(50, 100, size=1000000)}
df = pd.DataFrame(d)
fdf = DataFrame(d)

In [28]:
start = time.time()
tmp = df.sample(10000)
end = time.time()
time_df = end - start
time_df

0.0389399528503418

In [29]:
start = time.time()
tmp = fdf.sample(10000)
end = time.time()
time_fdf = end - start
time_fdf

0.010972023010253906

In [30]:
len(tmp)

10000

In [31]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.03894, simple pandas time: 0.010972
boost times: 3.549
