# simple pandas vs pandas: DataFrame

In [1]:
from spandas import DataFrame
import pandas as pd
import time
import numpy as np
import math

## Append

In [2]:
df = pd.DataFrame({'a': [], 'b': []})
df_append = pd.DataFrame({'a': [1], 'b': [1]})

In [3]:
start = time.time()
for i in range(1000):
    df = df.append(df_append)
end = time.time()
time_df = end - start

In [4]:
fdf = DataFrame({'a': [], 'b': []})
fdf_append = DataFrame({'a': [1], 'b': [1]})

In [5]:
start = time.time()
for i in range(1000):
    fdf = fdf.append(fdf_append)
end = time.time()
time_fdf = end - start

In [6]:
assert(len(fdf) == len(df))
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.47377, simple pandas time: 0.018957
boost times: 24.992


In [7]:
start = time.time()
df = df.append([df_append for _ in range(10000)])
end = time.time()
time_df = end - start

In [8]:
start = time.time()
fdf = fdf.append([fdf_append for _ in range(10000)])
end = time.time()
time_fdf = end - start

In [9]:
assert(len(fdf) == len(df))
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.59743, simple pandas time: 0.027949
boost times: 21.376


## iterrows

In [10]:
d = {'a': np.zeros(10000), 'b': np.zeros(10000), 'c': np.zeros(10000)}
df = pd.DataFrame(d)
fdf = DataFrame(d)

In [11]:
start = time.time()
for k, v in df.iterrows():
    pass
end = time.time()
time_df = end - start

In [12]:
start = time.time()
for k, v in fdf.iterrows():
    pass
end = time.time()
time_fdf = end - start

In [13]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.39298, simple pandas time: 0.010004
boost times: 39.281


## apply

In [14]:
d = {'math': np.random.randint(50, 100, size=100000), 
     'chinese': np.random.randint(50, 100, size=100000), 
     'english': np.random.randint(50, 100, size=100000)}
df = pd.DataFrame(d)
fdf = DataFrame(d)

### element-wise

In [15]:
start = time.time()
tmp = df.applymap(lambda x: math.sqrt(x)*10)
end = time.time()
time_df = end - start

In [16]:
start = time.time()
tmp = fdf.apply(lambda x: math.sqrt(x)*10)
end = time.time()
time_df = end - start

In [17]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.11073, simple pandas time: 0.010004
boost times: 11.069


during the test, something insteresting happens:

In [18]:
start = time.time()
tmp = df.applymap(lambda x: np.sqrt(x)*10)
end = time.time()
time_df = end - start

In [19]:
start = time.time()
tmp = fdf.apply(lambda x: np.sqrt(x)*10)
end = time.time()
time_df = end - start

In [20]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.44082, simple pandas time: 0.010004
boost times: 44.064


for numpy function, pandas will take more time.

### row-wise

In [21]:
start = time.time()
df['sum'] = df.apply(lambda x: x['math'] +x['chinese'] + x['english'], axis=1)
end = time.time()
time_df = end - start

In [22]:
start = time.time()
fdf[['sum']] = fdf.apply(lambda x: x['math'] +x['chinese'] + x['english'], type='row')
end = time.time()
time_fdf = end - start

In [23]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 2.4006, simple pandas time: 0.15357
boost times: 15.632


### column-wise

In [24]:
start = time.time()
for i in range(100):
    tmp = df.apply(np.mean, axis=0)
end = time.time()
time_df = end - start
time_df

0.3260931968688965

In [25]:
start = time.time()
for i in range(100):
    tmp = fdf.apply(np.mean, type='column')
end = time.time()
time_fdf = end - start
time_fdf

0.04790663719177246

In [26]:
print("pandas time: {:.5}, simple pandas time: {:.5}".format(time_df, time_fdf))
print("boost times: {:.5}".format(time_df / time_fdf))

pandas time: 0.32609, simple pandas time: 0.047907
boost times: 6.8068
