In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time
import os

### Import Pandas

In [2]:
import pandas as pd

### Import Modin

In [3]:
#!conda install modin
import psutil
cores = psutil.cpu_count(logical=False)

os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask
os.environ["MODIN_CPUS"] = str(cores)

print(f"Found {cores} physical cores: setting done")

from distributed import Client
client = Client()

import modin.pandas as mpd

Found 80 physical cores: setting done


## Let's create a syntetic dataset

In [4]:
filename = "example.csv"

if not os.path.exists(filename):
    array = np.random.randint(low=100,high=10000,size=(2**18,2**8))
    np.savetxt('example.csv', array, delimiter=",")
    print("File saved")

else:
    print("File found")

File found


## Load the synthetic Dataset

#### Using Pandas

In [5]:
%%time
pandas_df = pd.read_csv(filename, names=["col{}".format(i) for i in range(256)])

CPU times: user 13.8 s, sys: 2.74 s, total: 16.6 s
Wall time: 15.8 s


In [6]:
%%time
modin_df = mpd.read_csv(filename, names=["col{}".format(i) for i in range(256)])

CPU times: user 2.07 s, sys: 923 ms, total: 3 s
Wall time: 3.33 s


## Data transformations

In [7]:
t0= time.time()

pandas_df.applymap(lambda x: x**5+ x**2)
pandas_time = time.time() - t0
print(f"Stock Pandas wall time for completion:{pandas_time}s")

Stock Pandas wall time for completion:16.50693702697754s


In [8]:
t0= time.time()

modin_df.applymap(lambda x: x**5+ x**2)
modin_time = time.time() - t0
print(f"Modin wall time for completion:{modin_time}s")



Modin wall time for completion:0.40700316429138184s


In [9]:
print(f"Modin was {round(pandas_time/modin_time)}X faster than Stock Pandas!!")

Modin was 41X faster than Stock Pandas!!


## Concatenating Datasets

In [10]:
t0 = time.time()

pd.concat([pandas_df, pandas_df, pandas_df, pandas_df], axis=0)
pandas_time = time.time() - t0
print(f"Stock Pandas wall time for completion:{pandas_time}s")

Stock Pandas wall time for completion:0.8266844749450684s


In [11]:
t0 = time.time()

mpd.concat([modin_df, modin_df, modin_df, modin_df], axis=0)
modin_time = time.time() - t0
print(f"Modin wall time for completion:{modin_time}s")

Modin wall time for completion:0.028738975524902344s


In [12]:
print(f"Modin was {round(pandas_time/modin_time)}X faster than Stock Pandas!!")

Modin was 29X faster than Stock Pandas!!


## Appending Datasets

In [13]:
t0 = time.time()

%time pandas_df.append(pandas_df)
pandas_time = time.time() - t0
print(f"Stock Pandas wall time for completion:{pandas_time}s")

CPU times: user 271 ms, sys: 267 ms, total: 538 ms
Wall time: 424 ms
Stock Pandas wall time for completion:0.4279768466949463s


In [14]:
t0 = time.time()

%time modin_df.append(modin_df)
modin_time = time.time() - t0
print(f"Modin wall time for completion:{modin_time}s")

CPU times: user 21.5 ms, sys: 5.93 ms, total: 27.5 ms
Wall time: 15.8 ms
Modin wall time for completion:0.0174252986907959s




In [15]:
print(f"Modin was {round(pandas_time/modin_time)}X faster than Stock Pandas!!")

Modin was 25X faster than Stock Pandas!!
