# Polars


Alternatives to pandas

In [1]:
import polars as pl

In [2]:
dir(pl)

['Any',
 'Array',
 'Binary',
 'Boolean',
 'Catalog',
 'Categorical',
 'CompatLevel',
 'Config',
 'CredentialProvider',
 'CredentialProviderAWS',
 'CredentialProviderAzure',
 'CredentialProviderFunction',
 'CredentialProviderFunctionReturn',
 'CredentialProviderGCP',
 'DataFrame',
 'DataType',
 'Date',
 'Datetime',
 'Decimal',
 'Duration',
 'Enum',
 'Expr',
 'Field',
 'Float32',
 'Float64',
 'GPUEngine',
 'Int128',
 'Int16',
 'Int32',
 'Int64',
 'Int8',
 'LazyFrame',
 'List',
 'Null',
 'Object',
 'SQLContext',
 'Schema',
 'Series',
 'String',
 'StringCache',
 'Struct',
 'Time',
 'UInt16',
 'UInt32',
 'UInt64',
 'UInt8',
 'Unknown',
 'Utf8',
 '__all__',
 '__annotations__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__register_startup_deps',
 '__spec__',
 '__version__',
 '_cpu_check',
 '_reexport',
 '_typing',
 '_utils',
 'align_frames',
 'all',
 'all_horizontal',
 'any',
 'any_horizontal',
 'api',
 '

In [3]:
df = pl.DataFrame()

df

In [4]:
df = pl.DataFrame([1])

df


column_0
i64
1


In [5]:
df = pl.DataFrame([1, 2, 3])

df


column_0
i64
1
2
3


In [6]:
df = pl.DataFrame((1, 2, 3))

df


column_0
i64
1
2
3


In [7]:
# A set is not an accepted type for the `data` parameter, so this call raises
# TypeError. Catch it and print the message so the demonstration is still
# visible and the notebook keeps running under "Restart & Run All".
# On failure `df` keeps the value assigned by the previous cell.
try:
    df = pl.DataFrame({1, 2, 3})
except TypeError as exc:
    print(f"TypeError: {exc}")


TypeError: DataFrame constructor called with unsupported type 'set' for the `data` parameter

In [8]:
df = pl.DataFrame({'a': 1, 'b': 2, 'c': 3})

df


a,b,c
i64,i64,i64
1,2,3


In [9]:
df = pl.DataFrame({'colA': 1, 'colB': 2, 'colC': 3})

df


colA,colB,colC
i64,i64,i64
1,2,3


In [11]:
df = pl.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "city": ["New York", "Los Angeles", "Chicago"]
})

df

name,age,city
str,i64,str
"""Alice""",25,"""New York"""
"""Bob""",30,"""Los Angeles"""
"""Charlie""",35,"""Chicago"""


In [12]:
print(df)

shape: (3, 3)
┌─────────┬─────┬─────────────┐
│ name    ┆ age ┆ city        │
│ ---     ┆ --- ┆ ---         │
│ str     ┆ i64 ┆ str         │
╞═════════╪═════╪═════════════╡
│ Alice   ┆ 25  ┆ New York    │
│ Bob     ┆ 30  ┆ Los Angeles │
│ Charlie ┆ 35  ┆ Chicago     │
└─────────┴─────┴─────────────┘


In [13]:
# Selecting a Column
age_column = df["age"]
age_column

age
i64
25
30
35


In [14]:
# Selecting Multiple Columns
# `select` returns a new DataFrame holding only the named columns.
subset = df.select("name", "city")

subset

name,city
str,str
"""Alice""","""New York"""
"""Bob""","""Los Angeles"""
"""Charlie""","""Chicago"""


In [15]:
# Filtering Rows
filtered_df = df.filter(pl.col("age") > 28)

filtered_df

name,age,city
str,i64,str
"""Bob""",30,"""Los Angeles"""
"""Charlie""",35,"""Chicago"""


In [16]:
# Adding a New Column
df = df.with_columns((pl.col("age") * 2).alias("double_age"))

df

name,age,city,double_age
str,i64,str,i64
"""Alice""",25,"""New York""",50
"""Bob""",30,"""Los Angeles""",60
"""Charlie""",35,"""Chicago""",70


In [17]:
# Renaming a Column
df = df.rename({"double_age": "twice_age"})

df


name,age,city,twice_age
str,i64,str,i64
"""Alice""",25,"""New York""",50
"""Bob""",30,"""Los Angeles""",60
"""Charlie""",35,"""Chicago""",70


In [18]:
#  Dropping a Column
df = df.drop("twice_age")
df

name,age,city
str,i64,str
"""Alice""",25,"""New York"""
"""Bob""",30,"""Los Angeles"""
"""Charlie""",35,"""Chicago"""


In [20]:
# Sorting a DataFrame
# `sort` returns a DataFrame, which is bound to `sorted_df` here; display that
# result rather than the original `df`.

sorted_df = df.sort("age")

sorted_df

name,age,city
str,i64,str
"""Alice""",25,"""New York"""
"""Bob""",30,"""Los Angeles"""
"""Charlie""",35,"""Chicago"""


In [21]:
df.sort?


Signature:
df.sort(
    by: 'IntoExpr | Iterable[IntoExpr]',
    *more_by: 'IntoExpr',
    descending: 'bool | Sequence[bool]' = False,
    nulls_last: 'bool | Sequence[bool]' = False,
    multithreaded: 'bool' = True,
    maintain_order: 'bool' = False,
) -> 'DataFrame'
Docstring:
Sort the dataframe by the given columns.

Parameters
----------
by
    Column(s) to sort by. Accept

In [23]:
sorted_df = df.sort("age", descending=True)
sorted_df

name,age,city
str,i64,str
"""Charlie""",35,"""Chicago"""
"""Bob""",30,"""Los Angeles"""
"""Alice""",25,"""New York"""


In [25]:
# Grouping and Aggregating
# Mean age per city. NOTE(review): the displayed rows are not in input order;
# group_by appears not to guarantee row order by default — confirm against the
# Polars docs (maintain_order) if a stable order is needed.

grouped_df = (
    df
    .group_by("city")
    .agg(pl.col("age").mean())
)

grouped_df

city,age
str,f64
"""Chicago""",35.0
"""New York""",25.0
"""Los Angeles""",30.0


In [26]:
# Concatenating DataFrames


df2 = pl.DataFrame({
    "name": ["Dave", "Eva"],
    "age": [40, 45],
    "city": ["Miami", "Seattle"]
})

concat_df = pl.concat([df, df2])

concat_df

name,age,city
str,i64,str
"""Alice""",25,"""New York"""
"""Bob""",30,"""Los Angeles"""
"""Charlie""",35,"""Chicago"""
"""Dave""",40,"""Miami"""
"""Eva""",45,"""Seattle"""


In [27]:
# Reading a CSV File

df_csv = pl.read_csv("data.csv")

df_csv

name,age,salary
str,i64,i64
"""ramesh""",12,242343
"""ganesh""",34,3243
"""suresh""",78,34234


In [29]:
#  Writing to a CSV File

df.write_csv("output.csv")


In [30]:
#  Handling Missing Values

df = df.fill_null("Unknown")

df

name,age,city
str,i64,str
"""Alice""",25,"""New York"""
"""Bob""",30,"""Los Angeles"""
"""Charlie""",35,"""Chicago"""


In [31]:
# Casting Data Types

df

name,age,city
str,i64,str
"""Alice""",25,"""New York"""
"""Bob""",30,"""Los Angeles"""
"""Charlie""",35,"""Chicago"""


In [32]:
df = df.with_columns(pl.col("age").cast(pl.Int32))
df

name,age,city
str,i32,str
"""Alice""",25,"""New York"""
"""Bob""",30,"""Los Angeles"""
"""Charlie""",35,"""Chicago"""


In [34]:
# Creating a LazyFrame

df.lazy()

In [35]:
lazy_df = df.lazy()
result = lazy_df.filter(pl.col("age") > 30).collect()

result

name,age,city
str,i32,str
"""Charlie""",35,"""Chicago"""


In [40]:
# Applying a Custom Function

def custom_function(x):
    """Return ``x`` increased by 10."""
    increment = 10
    return x + increment

# Apply the Python function to every value of "age". `return_dtype` states the
# output column type explicitly instead of leaving Polars to infer it from the
# returned values. For simple arithmetic like this, the native expression
# `pl.col("age") + 10` is the faster choice; map_elements is kept here only to
# demonstrate custom functions.
df = df.with_columns(
    pl.col("age")
    .map_elements(custom_function, return_dtype=pl.Int64)
    .alias("age_plus_10")
)

df

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("age").map_elements(custom_function)
with this one instead:
  + pl.col("age") + 10

  df = df.with_columns(pl.col("age").map_elements(custom_function).alias("age_plus_10"))
  df = df.with_columns(pl.col("age").map_elements(custom_function).alias("age_plus_10"))


name,age,city,age_plus_10
str,i32,str,i64
"""Alice""",25,"""New York""",35
"""Bob""",30,"""Los Angeles""",40
"""Charlie""",35,"""Chicago""",45


In [43]:
help(pl.col("age"))

Help on Expr in module polars.expr.expr object:

class Expr(builtins.object)
 |  Expressions that can be used in various contexts.
 |
 |  Methods defined here:
 |
 |  __abs__(self) -> 'Expr'
 |
 |  __add__(self, other: 'IntoExpr') -> 'Expr'
 |      # operators
 |
 |  __and__(self, other: 'IntoExprColumn | int | bool') -> 'Expr'
 |
 |  __array_ufunc__(self, ufunc: 'Callable[..., Any]', method: 'str', *inputs: 'Any', **kwargs: 'Any') -> 'Expr'
 |      Numpy universal functions.
 |
 |  __bool__(self) -> 'NoReturn'
 |
 |  __eq__(self, other: 'IntoExpr') -> 'Expr'
 |      Return self==value.
 |
 |  __floordiv__(self, other: 'IntoExpr') -> 'Expr'
 |
 |  __ge__(self, other: 'IntoExpr') -> 'Expr'
 |      Return self>=value.
 |
 |  __getstate__(self) -> 'bytes'
 |      Helper for pickle.
 |
 |  __gt__(self, other: 'IntoExpr') -> 'Expr'
 |      Return self>value.
 |
 |  __invert__(self) -> 'Expr'
 |
 |  __le__(self, other: 'IntoExpr') -> 'Expr'
 |      Return self<=value.
 |
 |  __lt__(self, oth