## Examples in Pandas



| operation |Pandas (s)| Polars (s)|
|-----------|----------|-----------|
| .read_csv  |  0.94415 | 0.25258  |
| .shape     |  0.0  | 0.0   |
| .columns     |  0.00052  | 0.00015 |
| .unique  | 0.00734 | 0.00337 |
| .value_counts | 0.00831 | 0.00407 |
| .filter | 0.01351 | 0.00442 |
| .groupby.count | 0.01325 | 0.00728 |
| .groupby.agg | 0.01039 | 0.00359 |
| write_csv | 0.04741 | 0.00919 |

see `compare_polars.ipynb` notebook for Polars

In [32]:
# import polars as pl
import pandas as pd
# to enrich the examples in this quickstart with dates
from datetime import datetime, timedelta 
from time import time
# to generate data for the examples
import numpy as np 

# pl.Config.set_fmt_str_lengths(100)

### read csv

In [3]:
file_csv = "../data/311-service-requests.csv"

In [4]:
ts1 = time()
df_pd = pd.read_csv(file_csv, encoding='utf-8', sep=',', 
                    dtype={"Incident Zip":str}, parse_dates=False)
ts2 = time()
delta = ts2-ts1
print(f"Pandas exec time = {delta} sec")

Pandas exec time = 0.9441518783569336 sec


In [20]:
df_pd.head(5)

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,...,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
0,26589651,10/31/2013 02:08:41 AM,,NYPD,New York City Police Department,Noise - Street/Sidewalk,Loud Talking,Street/Sidewalk,11432,90-03 169 STREET,...,,,,,,,,40.708275,-73.791604,"(40.70827532593202, -73.79160395779721)"
1,26593698,10/31/2013 02:01:04 AM,,NYPD,New York City Police Department,Illegal Parking,Commercial Overnight Parking,Street/Sidewalk,11378,58 AVENUE,...,,,,,,,,40.721041,-73.909453,"(40.721040535628305, -73.90945306791765)"
2,26594139,10/31/2013 02:00:24 AM,10/31/2013 02:40:32 AM,NYPD,New York City Police Department,Noise - Commercial,Loud Music/Party,Club/Bar/Restaurant,10032,4060 BROADWAY,...,,,,,,,,40.84333,-73.939144,"(40.84332975466513, -73.93914371913482)"
3,26595721,10/31/2013 01:56:23 AM,10/31/2013 02:21:48 AM,NYPD,New York City Police Department,Noise - Vehicle,Car/Truck Horn,Street/Sidewalk,10023,WEST 72 STREET,...,,,,,,,,40.778009,-73.980213,"(40.7780087446372, -73.98021349023975)"
4,26590930,10/31/2013 01:53:44 AM,,DOHMH,Department of Health and Mental Hygiene,Rodent,Condition Attracting Rodents,Vacant Lot,10027,WEST 124 STREET,...,,,,,,,,40.807691,-73.947387,"(40.80769092704951, -73.94738703491433)"


In [5]:
ts1 = time()
print(df_pd.shape)
ts2 = time()
delta = ts2-ts1
print(f"Pandas exec time = {delta} sec")

(111069, 52)
Pandas exec time = 0.0 sec


### show columns

In [6]:
ts1 = time()
print(df_pd.columns)
ts2 = time()
delta = ts2-ts1
print(f"Pandas exec time = {delta} sec")

Index(['Unique Key', 'Created Date', 'Closed Date', 'Agency', 'Agency Name',
       'Complaint Type', 'Descriptor', 'Location Type', 'Incident Zip',
       'Incident Address', 'Street Name', 'Cross Street 1', 'Cross Street 2',
       'Intersection Street 1', 'Intersection Street 2', 'Address Type',
       'City', 'Landmark', 'Facility Type', 'Status', 'Due Date',
       'Resolution Action Updated Date', 'Community Board', 'Borough',
       'X Coordinate (State Plane)', 'Y Coordinate (State Plane)',
       'Park Facility Name', 'Park Borough', 'School Name', 'School Number',
       'School Region', 'School Code', 'School Phone Number', 'School Address',
       'School City', 'School State', 'School Zip', 'School Not Found',
       'School or Citywide Complaint', 'Vehicle Type', 'Taxi Company Borough',
       'Taxi Pick Up Location', 'Bridge Highway Name',
       'Bridge Highway Direction', 'Road Ramp', 'Bridge Highway Segment',
       'Garage Lot Name', 'Ferry Direction', 'Ferry Termina

### groupby count

In [7]:
ts1 = time()
print(df_pd.groupby(['Complaint Type'])['Complaint Type'].count())
ts2 = time()
delta = ts2-ts1
print(f"Pandas exec time = {delta} sec")

Complaint Type
APPLIANCE                   445
Adopt-A-Basket                3
Agency Issues               174
Air Quality                 459
Animal Abuse                164
                           ... 
Water Conservation          130
Water Quality                89
Water System               2636
Window Guard                  2
X-Ray Machine/Equipment       1
Name: Complaint Type, Length: 165, dtype: int64
Pandas exec time = 0.013255834579467773 sec


### unique

In [21]:
ts1 = time()
temp = df_pd['Complaint Type'].unique()
ts2 = time()
delta = ts2-ts1
print(f"Pandas exec time = {delta} sec")

Pandas exec time = 0.0073468685150146484 sec


In [22]:
temp

array(['Noise - Street/Sidewalk', 'Illegal Parking', 'Noise - Commercial',
       'Noise - Vehicle', 'Rodent', 'Blocked Driveway',
       'Noise - House of Worship', 'Street Light Condition',
       'Harboring Bees/Wasps', 'Taxi Complaint', 'Homeless Encampment',
       'Traffic Signal Condition', 'Food Establishment', 'Noise - Park',
       'Broken Muni Meter', 'Benefit Card Replacement',
       'Sanitation Condition', 'ELECTRIC', 'PLUMBING', 'HEATING',
       'GENERAL CONSTRUCTION', 'Street Condition', 'Consumer Complaint',
       'Derelict Vehicles', 'Noise', 'Drinking', 'Indoor Air Quality',
       'Panhandling', 'Derelict Vehicle', 'Lead', 'Water System',
       'Noise - Helicopter', 'Homeless Person Assistance',
       'Root/Sewer/Sidewalk Condition', 'Sidewalk Condition', 'Graffiti',
       'DOF Literature Request', 'Animal in a Park',
       'Overgrown Tree/Branches', 'Air Quality', 'Dirty Conditions',
       'Water Quality', 'Other Enforcement', 'Collection Truck Noise',
     

### value_count

In [23]:
ts1 = time()
temp_pd = df_pd['Complaint Type'].value_counts()
ts2 = time()
delta = ts2-ts1
print(f"Pandas exec time = {delta} sec")

Pandas exec time = 0.008317947387695312 sec


In [24]:
temp_pd

HEATING                           14200
GENERAL CONSTRUCTION               7471
Street Light Condition             7117
DOF Literature Request             5797
PLUMBING                           5373
                                  ...  
Municipal Parking Facility            1
Tunnel Condition                      1
DHS Income Savings Requirement        1
Stalled Sites                         1
X-Ray Machine/Equipment               1
Name: Complaint Type, Length: 165, dtype: int64

### Filter

In [25]:
ts1 = time()
df_pd_water = df_pd[df_pd["Complaint Type"] == "Water System"]
ts2 = time()
delta = ts2-ts1
print(f"Polars exec time = {delta} sec")

Polars exec time = 0.01351475715637207 sec


In [26]:
df_pd_water

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,...,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
120,26590780,10/30/2013 11:18:00 PM,,DEP,Department of Environmental Protection,Water System,Hydrant Leaking (WC1),,11236,,...,,,,,,,,40.633285,-73.890499,"(40.63328455805577, -73.89049865475536)"
127,26595918,10/30/2013 11:15:00 PM,,DEP,Department of Environmental Protection,Water System,Possible Water Main Break (Use Comments) (WA1),,11432,82-11 172 STREET,...,,,,,,,,40.719504,-73.794408,"(40.71950372818996, -73.79440796487808)"
154,26595898,10/30/2013 10:49:00 PM,,DEP,Department of Environmental Protection,Water System,No Water/Low Pressure (WA5),,10001,17 WEST 32 STREET,...,,,,,,,,40.747461,-73.986199,"(40.747460631838564, -73.9861991025855)"
157,26591246,10/30/2013 10:48:00 PM,,DEP,Department of Environmental Protection,Water System,No Water/Low Pressure (WA5),,10001,10 WEST 31 STREET,...,,,,,,,,40.746805,-73.986611,"(40.74680468544432, -73.98661066284879)"
159,26594036,10/30/2013 10:47:00 PM,,DEP,Department of Environmental Protection,Water System,No Water/Low Pressure (WA5),,10001,9 WEST 32 STREET,...,,,,,,,,40.747395,-73.986044,"(40.74739473917165, -73.98604392847245)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111032,26437038,10/04/2013 12:24:00 AM,10/04/2013 12:29:00 AM,DEP,Department of Environmental Protection,Water System,Dirty Water (WE),,10022,350 EAST 52 STREET,...,,,,,,,,40.755611,-73.966392,"(40.75561119970559, -73.96639229035979)"
111045,26437764,10/04/2013 12:14:00 AM,10/04/2013 12:14:00 AM,DEP,Department of Environmental Protection,Water System,Dirty Water (WE),,10022,251 EAST 51 STREET,...,,,,,,,,40.755980,-73.969171,"(40.75597977288785, -73.96917140638074)"
111052,26437035,10/04/2013 12:08:00 AM,10/04/2013 12:13:00 AM,DEP,Department of Environmental Protection,Water System,Dirty Water (WE),,10022,325 EAST 54 STREET,...,,,,,,,,40.757069,-73.965933,"(40.75706852462872, -73.96593314322774)"
111062,26439710,10/04/2013 12:03:00 AM,10/04/2013 12:03:00 AM,DEP,Department of Environmental Protection,Water System,Dirty Water (WE),,10022,325 EAST 54 STREET,...,,,,,,,,40.757069,-73.965933,"(40.75706852462872, -73.96593314322774)"


### write_csv

In [29]:
ts1 = time()
df_pd_water.to_csv("df_pd_water.csv", index=False)
ts2 = time()
delta = ts2-ts1
print(f"Pandas exec time = {delta} sec")

Pandas exec time = 0.0474095344543457 sec


### groupby / agg

In [30]:
ts1 = time()
temp = df_pd.groupby(by="Complaint Type").agg({"Latitude":np.sum})
ts2 = time()
delta = ts2-ts1
print(f"Pandas exec time = {delta} sec")

Pandas exec time = 0.010399818420410156 sec


In [31]:
temp

Unnamed: 0_level_0,Latitude
Complaint Type,Unnamed: 1_level_1
APPLIANCE,18131.011875
Adopt-A-Basket,122.279695
Agency Issues,0.000000
Air Quality,18576.403735
Animal Abuse,6657.347589
...,...
Water Conservation,5250.925160
Water Quality,3623.623448
Water System,106229.622722
Window Guard,81.368559
