# TIMING PANDAS 2.0 AGAINST VECTORIZATION AND POLARS

In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import timeit
print(pd.__version__)

2.0.0


### Create data and write to a CSV

In [2]:
np.random.seed(123)
# create the first column with random integers between 2 and 500000
col1 = np.random.randint(low=2, high=200, size=500000)

np.random.seed(341)
# create the second column with random integers between 2 and 500000
col2 = np.random.randint(low=2, high=200, size=500000)

np.random.seed(245)
# create the third column with random integers either 0 or 1
col3 = np.random.randint(low=0, high=2, size=500000)

#The df Dataframe contains data about the inventory number of 500000 merchandize items
#in Store1 and in Store2. The third column contains the discount 'Mask', i.e., whether an item 
#is allowed to be discounted or not.

df = pd.DataFrame({'Store1': col1, 'Store2': col2, 'Discountability': col3})
print(df.head(9))
df.to_csv('Storeprices.csv', index=False)


   Store1  Store2  Discountability
0     111     117                1
1     128     103                1
2      68      37                0
3     100      25                1
4      19       4                0
5      85      28                1
6     108     134                0
7     125      70                1
8      59     183                1


### Create Pandas 2.0 Dataframes from the .csv file with and without the help of pyarrow

In [3]:
startime=timeit.default_timer()
df_pa = pd.read_csv('Storeprices.csv', dtype_backend="pyarrow", engine="pyarrow")
endtime=timeit.default_timer()
diffread=endtime-startime
print(diffread)

startime=timeit.default_timer()
df2= pd.read_csv('Storeprices.csv')
endtime=timeit.default_timer()
diffread2=endtime-startime
print(diffread2)


0.07353810002678074
0.1582617000094615


### Dataframe Element-wise Operations With Pyarrow table

In [55]:
#Create a pyarrow Table from a Pandas Dataframe
df_table = pa.Table.from_pandas(df_pa)

s1=timeit.default_timer()
#This function scales columns Store1 and Store2 by the discount factors, 0.2 and 0.3 respectively, 
# and also the Discoutability column. It returns the scaled columns.
def scale_columns(table, column1, column2, mask_column):
    # Get the columns from the PyArrow table
    store1_col = table[column1]
    store2_col = table[column2]
    mask_col = table[mask_column]

    # Scale 'column1' by 0.2 * mask_column using pa.compute.multiply()
    scaled_store1_col = pc.multiply(store1_col, pc.multiply(0.2, mask_col))

    # Scale 'column2' by 0.3 * mask_column using pa.compute.multiply()
    scaled_store2_col = pc.multiply(store2_col, pc.multiply(0.3, mask_col))

    return scaled_store1_col, scaled_store2_col


# Call the scale_columns function
scaled_columns = scale_columns(df_table, 'Store1', 'Store2', 'Discountability')

result_table = pa.Table.from_arrays([
   pc.subtract(df_table[0],scaled_columns[0]),
   pc.subtract(df_table[1],scaled_columns[1]),

], names=['DiscountedStore1', 'DiscountedStore2'])

dff=result_table.to_pandas()
s2=timeit.default_timer()
diff2=s2-s1
print(f"Time elapsed is: {diff2}")
print(dff.head(9))

Time elapsed is: 0.031649400014430285
   DiscountedStore1  DiscountedStore2
0              88.8              81.9
1             102.4              72.1
2              68.0              37.0
3              80.0              17.5
4              19.0               4.0
5              68.0              19.6
6             108.0             134.0
7             100.0              49.0
8              47.2             128.1


### Dataframe Row-wise Operations with Pandas 2.0 apply()

In [4]:


startime=timeit.default_timer()
#Function with a nested function inside. It is the nested function that performs the scaling.
def scale_columns(df, column1, column2, abletodiscount_column):
    # Create a new dataframe to store the scaled columns
    scaled_df = pd.DataFrame()

    def discount_store(row):
        # Get the values from the current row
        store1_value = row[column1]
        store2_value = row[column2]
        abletodiscount_value = row[abletodiscount_column]
        scaled_store1 = store1_value-store1_value * (0.2 * abletodiscount_value)
        scaled_store2 = store2_value-store2_value * (0.3 * abletodiscount_value)

        # Return the scaled values
        return pd.Series({column1: scaled_store1, column2: scaled_store2})

    # Apply the scaling function to each row of the dataframe
    scaled_df = df.apply(discount_store, axis=1)

    return scaled_df

discount_df = scale_columns(df, 'Store1', 'Store2', 'Discountability')
discount_df.columns=["DiscountStore1","DiscountStore2"]

s2=timeit.default_timer()
diff2=s2-s1
print(f"Time elapsed is: {diff2}")
# Print the resulting scaled dataframe
print(discount_df.head(9))

NameError: name 's1' is not defined

### Row-wise Dataframe Operations with Pandas 2.0 itertuples()

In [62]:

# create an empty list to store the discounted prices
discounted_prices = []
startime=timeit.default_timer()

# iterate using itertuples
for row in df2.itertuples(index=False):
    
    discounted_price = [
        row.Store1-row.Store1 * 0.2*row.Discountability,
        row.Store2-row.Store2*0.3*row.Discountability
    ]
    # append the discounted values to the list
    discounted_prices.append(discounted_price)

# save the list of discounted values to a new DataFrame
discount_df = pd.DataFrame(discounted_prices, columns=['DiscountedStore1', 'DiscountedStore2'])


endtime=timeit.default_timer()
diff=endtime-startime
print(f"Time elapsed is: {diff}")
print(discount_df.head(9))


Time elapsed is: 1.578000300010899
   DiscountedStore1  DiscountedStore2
0              88.8              81.9
1             102.4              72.1
2              68.0              37.0
3              80.0              17.5
4              19.0               4.0
5              68.0              19.6
6             108.0             134.0
7             100.0              49.0
8              47.2             128.1


### Vectorized Operations

In [60]:
startime=timeit.default_timer()

# scale the first column by 0.2 and the second by 0.3
discounted_prices = pd.DataFrame({
    'DiscountedStore1': df2['Store1'] - df2['Store1'] * 0.2 * df2['Discountability'],
    'DiscountedStore2': df2['Store2'] - df2['Store2'] * 0.3 * df2['Discountability']
})

endtime=timeit.default_timer()
diff=endtime-startime
print(f"Time elapsed is: {diff}")
# print the resulting dataframe
print(discounted_prices.head(9))


Time elapsed is: 0.01958970000850968
   DiscountedStore1  DiscountedStore2
0              88.8              81.9
1             102.4              72.1
2              68.0              37.0
3              80.0              17.5
4              19.0               4.0
5              68.0              19.6
6             108.0             134.0
7             100.0              49.0
8              47.2             128.1


### Dataframe Creation and Vectorized Operations With Polars

In [59]:
import polars as pl


# read a CSV file into a Polars DataFrame
start1=timeit.default_timer()
df22 = pl.read_csv('Storeprices.csv')
end1=timeit.default_timer()
dif=end1-start1

print(f"Time elapsed for reading is: {dif}")
def discounted_value(value, mask_value, factor):
    k = mask_value * factor
    return value -(value*k)

startime=timeit.default_timer()
scaled_df = pl.DataFrame({
    'DiscountedStore1': discounted_value(df22['Store1'], df22['Discountability'], 0.2),
    'DiscountedStore2': discounted_value(df22['Store2'], df22['Discountability'], 0.3)
})


endtime=timeit.default_timer()
diff=endtime-startime
print(f"Time elapsed is: {diff}")
print(scaled_df.head(9))

Time elapsed for reading is: 0.018297300004633144
Time elapsed is: 0.010131100018043071
shape: (9, 2)
┌──────────────────┬──────────────────┐
│ DiscountedStore1 ┆ DiscountedStore2 │
│ ---              ┆ ---              │
│ f64              ┆ f64              │
╞══════════════════╪══════════════════╡
│ 88.8             ┆ 81.9             │
│ 102.4            ┆ 72.1             │
│ 68.0             ┆ 37.0             │
│ 80.0             ┆ 17.5             │
│ 19.0             ┆ 4.0              │
│ 68.0             ┆ 19.6             │
│ 108.0            ┆ 134.0            │
│ 100.0            ┆ 49.0             │
│ 47.2             ┆ 128.1            │
└──────────────────┴──────────────────┘
