# TIMING PANDAS 2.0 AGAINST VECTORIZATION AND POLARS

In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import timeit
print(pd.__version__)

2.0.0


### Create data and write to a CSV

In [2]:
np.random.seed(123)
# create the prices of the first store with random integers between 2 and 200
col1 = np.random.randint(low=2, high=200, size=500000)
#create the random amount by which prices of Store2 are higher.
np.random.seed(341)
# create the second column with random integers between 2 and 20
var = np.random.randint(low=0, high=21, size=500000)
#create the prices of Store2
col2 = col1 + col1 * (var / 100)
np.random.seed(245)
# create the Discountability column with random values of 0 or 1
col3 = np.random.randint(low=0, high=2, size=500000)
df = pd.DataFrame({'Store1': col1, 'Store2': col2, 'Discountability': col3})
print(df.head(9))
df.to_csv('Storeprices.csv', index=False)


   Store1  Store2  Discountability
0     111  132.09                1
1     128  134.40                1
2      68   70.04                0
3     100  102.00                1
4      19   19.76                0
5      85   88.40                1
6     108  108.00                0
7     125  140.00                1
8      59   70.21                1


### Create Pandas 2.0 Dataframes from the .csv file with and without the help of pyarrow

In [3]:
startime=timeit.default_timer()
df_pa = pd.read_csv('Storeprices.csv', dtype_backend="pyarrow", engine="pyarrow")
endtime=timeit.default_timer()
diffread=endtime-startime
print(diffread)

startime=timeit.default_timer()
df2= pd.read_csv('Storeprices.csv')
endtime=timeit.default_timer()
diffread2=endtime-startime
print(diffread2)


0.044916999991983175
0.1962997000082396


### Dataframe Element-wise Operations With Pyarrow Table

In [4]:
#Create a pyarrow Table from a Pandas Dataframe
df_table = pa.Table.from_pandas(df_pa)
s1=timeit.default_timer()
#This function scales columns Store1 and Store2 by the discount factors, 0.2 and 0.3 respectively, 
# and also the Discoutability column. It returns the scaled columns.
def scale_columns(table, column1, column2, mask_column):
    # Get the columns from the PyArrow table
    store1_col = table[column1]
    store2_col = table[column2]
    mask_col = table[mask_column]

    # Scale 'column1' by 0.2 * mask_column using pa.compute.multiply()
    scaled_store1_col = pc.multiply(store1_col, pc.multiply(0.2, mask_col))

    # Scale 'column2' by 0.3 * mask_column using pa.compute.multiply()
    scaled_store2_col = pc.multiply(store2_col, pc.multiply(0.3, mask_col))

    return scaled_store1_col, scaled_store2_col


# Call the scale_columns function
scaled_columns = scale_columns(df_table, 'Store1', 'Store2', 'Discountability')

result_table = pa.Table.from_arrays([
   pc.subtract(df_table[0],scaled_columns[0]),
   pc.subtract(df_table[1],scaled_columns[1]),

], names=['DiscountedStore1', 'DiscountedStore2'])

dff=result_table.to_pandas()
s2=timeit.default_timer()
diff2=s2-s1
print(f"Time elapsed is: {diff2}")
print(dff.head(9))

Time elapsed is: 0.028865800006315112
   DiscountedStore1  DiscountedStore2
0              88.8            92.463
1             102.4            94.080
2              68.0            70.040
3              80.0            71.400
4              19.0            19.760
5              68.0            61.880
6             108.0           108.000
7             100.0            98.000
8              47.2            49.147


### Dataframe Row-wise Operations with Pandas 2.0 apply() Version 1.0

In [5]:


startime=timeit.default_timer()
#Function with a nested function inside. It is the nested function that performs the scaling.
def scale_columns(df, column1, column2, abletodiscount_column):
    # Create a new dataframe to store the scaled columns
    scaled_df = pd.DataFrame()

    def discount_store(row):
        # Get the values from the current row
        store1_value = row[column1]
        store2_value = row[column2]
        abletodiscount_value = row[abletodiscount_column]
        scaled_store1 = store1_value-store1_value * (0.2 * abletodiscount_value)
        scaled_store2 = store2_value-store2_value * (0.3 * abletodiscount_value)

        # Return the scaled values
        return pd.Series({column1: scaled_store1, column2: scaled_store2})

    # Apply the scaling function to each row of the dataframe
    scaled_df = df.apply(discount_store, axis=1)

    return scaled_df

discount_df = scale_columns(df2, 'Store1', 'Store2', 'Discountability')
discount_df.columns=["DiscountStore1","DiscountStore2"]

s2=timeit.default_timer()
diff2=s2-s1
print(f"Time elapsed is: {diff2}")
# Print the resulting scaled dataframe
print(discount_df.head(9))

Time elapsed is: 197.63155489997007
   DiscountStore1  DiscountStore2
0            88.8          92.463
1           102.4          94.080
2            68.0          70.040
3            80.0          71.400
4            19.0          19.760
5            68.0          61.880
6           108.0         108.000
7           100.0          98.000
8            47.2          49.147


### Dataframe Row-wise Operations with Pandas 2.0 apply() Version 2.0

In [6]:
startime=timeit.default_timer()

def scale_columns(df, column1, column2, abletodiscount_column):
    # Create new lists to store the scaled values
    scaled_store1 = []
    scaled_store2 = []

    def discount_store(row):
        # Get the values from the current row
        store1_value = row[column1]
        store2_value = row[column2]
        abletodiscount_value = row[abletodiscount_column]
        scaled_store1.append(store1_value - store1_value * (0.2 * abletodiscount_value))
        scaled_store2.append(store2_value - store2_value * (0.3 * abletodiscount_value))

    # Apply the scaling function to each row of the dataframe
    df.apply(discount_store, axis=1)

    # Create the resulting DataFrame
    scaled_df = pd.DataFrame({column1: scaled_store1, column2: scaled_store2})

    return scaled_df


discount_df = scale_columns(df2, 'Store1', 'Store2', 'Discountability')
discount_df.columns=["DiscountStore1","DiscountStore2"]

s2=timeit.default_timer()
diff2=s2-s1
print(f"Time elapsed is: {diff2}")
# Print the resulting scaled dataframe
print(discount_df.head(9))

Time elapsed is: 252.87545809999574
   DiscountStore1  DiscountStore2
0            88.8          92.463
1           102.4          94.080
2            68.0          70.040
3            80.0          71.400
4            19.0          19.760
5            68.0          61.880
6           108.0         108.000
7           100.0          98.000
8            47.2          49.147


### Row-wise Dataframe Operations with Pandas 2.0 itertuples()

In [7]:

# create an empty list to store the discounted prices
discounted_prices = []
startime=timeit.default_timer()

# iterate using itertuples
for row in df2.itertuples(index=False):
    
    discounted_price = [
        row.Store1-row.Store1 * 0.2*row.Discountability,
        row.Store2-row.Store2*0.3*row.Discountability
    ]
    # append the discounted values to the list
    discounted_prices.append(discounted_price)

# save the list of discounted values to a new DataFrame
discount_df = pd.DataFrame(discounted_prices, columns=['DiscountedStore1', 'DiscountedStore2'])


endtime=timeit.default_timer()
diff=endtime-startime
print(f"Time elapsed is: {diff}")
print(discount_df.head(9))


Time elapsed is: 1.0882243000087328
   DiscountedStore1  DiscountedStore2
0              88.8            92.463
1             102.4            94.080
2              68.0            70.040
3              80.0            71.400
4              19.0            19.760
5              68.0            61.880
6             108.0           108.000
7             100.0            98.000
8              47.2            49.147


### Vectorized Operations

In [8]:
startime=timeit.default_timer()

# scale the first column by 0.2 and the second by 0.3
discounted_prices = pd.DataFrame({
    'DiscountedStore1': df2['Store1'] - df2['Store1'] * 0.2 * df2['Discountability'],
    'DiscountedStore2': df2['Store2'] - df2['Store2'] * 0.3 * df2['Discountability']
})

endtime=timeit.default_timer()
diff=endtime-startime
print(f"Time elapsed is: {diff}")
# print the resulting dataframe
print(discounted_prices.head(9))


Time elapsed is: 0.0536642000079155
   DiscountedStore1  DiscountedStore2
0              88.8            92.463
1             102.4            94.080
2              68.0            70.040
3              80.0            71.400
4              19.0            19.760
5              68.0            61.880
6             108.0           108.000
7             100.0            98.000
8              47.2            49.147


### Dataframe Creation and Vectorized Operations With Polars

In [9]:
import polars as pl


# read a CSV file into a Polars DataFrame
start1=timeit.default_timer()
df22 = pl.read_csv('Storeprices.csv')
end1=timeit.default_timer()
dif=end1-start1

print(f"Time elapsed for reading is: {dif}")
def discounted_value(value, mask_value, factor):
    k = mask_value * factor
    return value -(value*k)

startime=timeit.default_timer()
scaled_df = pl.DataFrame({
    'DiscountedStore1': discounted_value(df22['Store1'], df22['Discountability'], 0.2),
    'DiscountedStore2': discounted_value(df22['Store2'], df22['Discountability'], 0.3)
})


endtime=timeit.default_timer()
diff=endtime-startime
print(f"Time elapsed is: {diff}")
print(scaled_df.head(9))

Time elapsed for reading is: 0.027874399966094643
Time elapsed is: 0.009557300014421344
shape: (9, 2)
┌──────────────────┬──────────────────┐
│ DiscountedStore1 ┆ DiscountedStore2 │
│ ---              ┆ ---              │
│ f64              ┆ f64              │
╞══════════════════╪══════════════════╡
│ 88.8             ┆ 92.463           │
│ 102.4            ┆ 94.08            │
│ 68.0             ┆ 70.04            │
│ 80.0             ┆ 71.4             │
│ 19.0             ┆ 19.76            │
│ 68.0             ┆ 61.88            │
│ 108.0            ┆ 108.0            │
│ 100.0            ┆ 98.0             │
│ 47.2             ┆ 49.147           │
└──────────────────┴──────────────────┘
