In [1]:
import numpy as np
import pandas as pd

from numba import njit, jit

# Create dataset

In [2]:
def prepare_dataset(size):
    
    # Generate raw data
    data = {
        "year": np.random.choice([str(x) for x in range(2000, 2020)], size),
        "month": np.random.choice([str(x).zfill(2) for x in range(1, 12)], size),
        "day": np.random.choice([str(x).zfill(2) for x in range(1, 29)], size),
        "hour": np.random.choice(range(24), size),
        "minute": np.random.choice(range(60), size),
        "second": np.random.choice(range(60), size),
        "cent": np.random.choice(range(100), size),
    }
    
    # Create a dataframe
    df = pd.DataFrame(data.values(), index=data.keys()).T
    
    # Create 'date' and 'time' columns
    df["date"] = df["year"] + "-" + df["month"] + "-" + df["day"]
    df["time"] = df["cent"] + 100*(df["second"] + 100*(df["minute"] + 100*df["hour"]))
    
    # drop used columns
    for col in data.keys():
        del df[col]
        
    return df

In [3]:
size = 100_000
df = prepare_dataset(size)

# Numba functions

In [4]:
def zfill(df):
    """
        1. Transform time to str
        2. zfill
        3. split time as string lists
        4. pd.to_datetime
    """

    aux = df["time"].apply(str).apply(lambda x: x.zfill(8)).str
    return pd.to_datetime(df["date"] + " " + aux[:2] + ":" + aux[2:4] + ":" + aux[4:6] + "." + aux[6:])

In [5]:
def fix_time_individual(df):
    """
        1. pandas.apply a jit function to add 0 to time
        2. concat date + time
        3. change to np.datetime64
    """
    
    @jit
    def _fix_time(x):
        aux = "0"*(8 - len(str(x))) + str(x)
        return aux[:2] + ":" + aux[2:4] + ":" + aux[4:6] + "." + aux[6:]
    
    return (df["date"] + " " + df["time"].apply(_fix_time)).astype(np.datetime64)

In [6]:
def fix_time_np_string(df):
    """
        1. Use a jit function to add 0 to each time
        2. concat date + time
        3. change to np.datetime64
    """

    @jit
    def _fix_time(mlist):
    
        out = np.empty(mlist.shape, dtype=np.object)

        for i in range(len(mlist)):

            elem = str(mlist[i])
            aux = "0"*(8 - len(elem)) + elem

            out[i] = aux[:2] + ":" + aux[2:4] + ":" + aux[4:6] + "." + aux[6:]

        return out
    
    return (df["date"].values + " " + _fix_time(df["time"].values)).astype(np.datetime64)

In [7]:
def fix_time_np_datetime(df):
    """
        1. Iterate time and date with jit function
        2. Transform each element to string and add 0s
        3. Split the string
        4. Cast each element to np.datetime64
    """
    
    @jit
    def _fix_date(mdate, mtime):

        out = np.empty(mtime.shape, dtype="datetime64[s]")

        for i in range(len(mtime)):

            elem = str(mtime[i])
            aux = "0"*(8 - len(elem)) + elem

            aux = mdate[i] + " " + aux[:2] + ":" + aux[2:4] + ":" + aux[4:6] + "." + aux[6:]

            out[i] = np.datetime64(aux)

        return out
    
    return _fix_date(df["date"].values, df["time"].values)

In [8]:
def np_divmod_jit(df):
    """
        1. Iterate time and date with jit function
        2. Use np.divmod to transfom HHMMSSCC to miliseconds integer
        3. Cast date as np.datetime and time to timedelta
        4. Sum date and time
    """
        
    @jit
    def _fix_date(mdate, mtime):

        time_out = np.empty(mtime.shape[0], dtype=np.int32)

        for i in range(mtime.shape[0]):
            aux, cent = np.divmod(mtime[i], 100)
            aux, seconds = np.divmod(aux, 100)
            hours, minutes = np.divmod(aux, 100)

            time_out[i] = 10*(cent + 100*(seconds + 60*(minutes + 60*hours)))

        return mdate.astype(np.datetime64) + time_out.astype("timedelta64[ms]")
    
    return _fix_date(df["date"].values, df["time"].values)

In [9]:
def divmod_njit(df):
    """
        1. Iterate time with njit function
        2. Use divmod to transfom HHMMSSCC to miliseconds integer
        3. Outside the njit function cast date as np.datetime and time to timedelta
        4. Sum date and time
    """

    @njit
    def _fix_time(mtime):

        time_out = np.empty(mtime.shape[0], dtype=np.int32)

        for i in range(mtime.shape[0]):
            aux, cent = divmod(mtime[i], 100)
            aux, seconds = divmod(aux, 100)
            hours, minutes = divmod(aux, 100)

            time_out[i] = 10*(cent + 100*(seconds + 60*(minutes + 60*hours)))

        return df["date"].values.astype(np.datetime64) +\
                 _fix_time(df["time"].values).astype("timedelta64[ms]")

# Tests 1e5 elements

In [10]:
%%timeit
zfill(df)

302 ms ± 1.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%timeit
fix_time_individual(df)

686 ms ± 15.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%%timeit
fix_time_np_string(df)

675 ms ± 21.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
fix_time_np_datetime(df)

756 ms ± 9.92 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%%timeit
np_divmod_jit(df)

1.34 s ± 27.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit
divmod_njit(df)

95.2 µs ± 544 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# Tests all elements

In [17]:
df = dfg.copy()

In [18]:
%%timeit
zfill(df)

31 s ± 587 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit
fix_time_individual(df)

41.9 s ± 292 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%%timeit
fix_time_np_string(df)

32.1 s ± 736 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%%timeit
fix_time_np_datetime(df)

32.7 s ± 180 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit
np_divmod_jit(df)

1min 40s ± 1.16 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%%timeit
divmod_njit(df)

2.25 s ± 180 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Create data

In [16]:
prepare_dataset(100_000)
print("done")

- prepare_dataset in 0.14 min
- prepare_dataset2 in 0.13 min
done


In [17]:
prepare_dataset(1_000_000)
print("done")

- prepare_dataset in 1.39 min
- prepare_dataset2 in 1.35 min
done
