## Importing Data Into Polars

In this section, we'll learn how to import CSV and Excel files (including multiple files and sheets) into `Polars`.

We'll focus on `Polars` because of its speed and elegant syntax.

## Load required libraries

In [2]:
# Import packages
import polars as pl
import polars.selectors as cs
import os 
import sys 
import glob

# Display the Python and Polars versions this notebook was run with
print(f'My Python version is {sys.version};\nPolars version is {pl.__version__}')

My Python version is 3.12.4 (main, Jul  1 2024, 00:48:18) [Clang 15.0.0 (clang-1500.3.9.4)];
Polars version is 1.8.2
My Python version is 3.12.4 (main, Jul  1 2024, 00:48:18) [Clang 15.0.0 (clang-1500.3.9.4)];
Polars version is 1.8.2


## Importing CSV file

In [27]:
# Location of the South Sudan 2008 census CSV
file_path = '../00-data/ss_2008_census_data_raw.csv'

# First look at the file. Per the Polars docs, ignore_errors=True keeps the
# reader going when some lines yield errors; note the empty `2008` cells in
# the trailing footer rows of the output.
pl.read_csv(file_path, ignore_errors=True)

Region,Region Name,Region - RegionId,Variable,Variable Name,Age,Age Name,Scale,Units,2008
str,str,str,str,str,str,str,str,str,i64
"""KN.A2""","""Upper Nile""","""SS-NU""","""KN.B2""","""Population, Total (Number)""","""KN.C1""","""Total""","""units""","""Persons""",964353
"""KN.A2""","""Upper Nile""","""SS-NU""","""KN.B2""","""Population, Total (Number)""","""KN.C2""","""0 to 4""","""units""","""Persons""",150872
"""KN.A2""","""Upper Nile""","""SS-NU""","""KN.B2""","""Population, Total (Number)""","""KN.C3""","""5 to 9""","""units""","""Persons""",151467
"""KN.A2""","""Upper Nile""","""SS-NU""","""KN.B2""","""Population, Total (Number)""","""KN.C4""","""10 to 14""","""units""","""Persons""",126140
"""KN.A2""","""Upper Nile""","""SS-NU""","""KN.B2""","""Population, Total (Number)""","""KN.C5""","""15 to 19""","""units""","""Persons""",103804
…,…,…,…,…,…,…,…,…,…
"""KN.A11""","""Eastern Equatoria""","""SS-EE""","""KN.B8""","""Population, Female (Number)""","""KN.C14""","""60 to 64""","""units""","""Persons""",5274
"""KN.A11""","""Eastern Equatoria""","""SS-EE""","""KN.B8""","""Population, Female (Number)""","""KN.C22""","""65+""","""units""","""Persons""",8637
"""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",
"""Source:""","""National Bureau of Statistics,…","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""","""NA""",


In [28]:
# Load the South Sudan 2008 census CSV, treating the literal string 'NA'
# as a missing value instead of suppressing parse errors
census_raw = pl.read_csv(
    '../00-data/ss_2008_census_data_raw.csv',
    null_values='NA',
)

# Inspect the loaded frame
print(census_raw)

shape: (453, 10)
┌─────────┬─────────┬─────────┬─────────┬───┬────────┬───────┬────────┬────────┐
│ Region  ┆ Region  ┆ Region  ┆ Variabl ┆ … ┆ Age    ┆ Scale ┆ Units  ┆ 2008   │
│ ---     ┆ Name    ┆ - Regio ┆ e       ┆   ┆ Name   ┆ ---   ┆ ---    ┆ ---    │
│ str     ┆ ---     ┆ nId     ┆ ---     ┆   ┆ ---    ┆ str   ┆ str    ┆ i64    │
│         ┆ str     ┆ ---     ┆ str     ┆   ┆ str    ┆       ┆        ┆        │
│         ┆         ┆ str     ┆         ┆   ┆        ┆       ┆        ┆        │
╞═════════╪═════════╪═════════╪═════════╪═══╪════════╪═══════╪════════╪════════╡
│ KN.A2   ┆ Upper   ┆ SS-NU   ┆ KN.B2   ┆ … ┆ Total  ┆ units ┆ Person ┆ 964353 │
│         ┆ Nile    ┆         ┆         ┆   ┆        ┆       ┆ s      ┆        │
│ KN.A2   ┆ Upper   ┆ SS-NU   ┆ KN.B2   ┆ … ┆ 0 to 4 ┆ units ┆ Person ┆ 150872 │
│         ┆ Nile    ┆         ┆         ┆   ┆        ┆       ┆ s      ┆        │
│ KN.A2   ┆ Upper   ┆ SS-NU   ┆ KN.B2   ┆ … ┆ 5 to 9 ┆ units ┆ Person ┆ 151467 │
│         ┆

In [29]:
# Pull out the first 7 rows
print(census_raw.head(7))

shape: (7, 10)
┌────────┬─────────┬─────────┬─────────┬───┬─────────┬───────┬────────┬────────┐
│ Region ┆ Region  ┆ Region  ┆ Variabl ┆ … ┆ Age     ┆ Scale ┆ Units  ┆ 2008   │
│ ---    ┆ Name    ┆ - Regio ┆ e       ┆   ┆ Name    ┆ ---   ┆ ---    ┆ ---    │
│ str    ┆ ---     ┆ nId     ┆ ---     ┆   ┆ ---     ┆ str   ┆ str    ┆ i64    │
│        ┆ str     ┆ ---     ┆ str     ┆   ┆ str     ┆       ┆        ┆        │
│        ┆         ┆ str     ┆         ┆   ┆         ┆       ┆        ┆        │
╞════════╪═════════╪═════════╪═════════╪═══╪═════════╪═══════╪════════╪════════╡
│ KN.A2  ┆ Upper   ┆ SS-NU   ┆ KN.B2   ┆ … ┆ Total   ┆ units ┆ Person ┆ 964353 │
│        ┆ Nile    ┆         ┆         ┆   ┆         ┆       ┆ s      ┆        │
│ KN.A2  ┆ Upper   ┆ SS-NU   ┆ KN.B2   ┆ … ┆ 0 to 4  ┆ units ┆ Person ┆ 150872 │
│        ┆ Nile    ┆         ┆         ┆   ┆         ┆       ┆ s      ┆        │
│ KN.A2  ┆ Upper   ┆ SS-NU   ┆ KN.B2   ┆ … ┆ 5 to 9  ┆ units ┆ Person ┆ 151467 │
│        ┆ Ni

In [30]:
# Pull out the last 5 rows (tail() is called without an argument; the output shows 5 rows)
print(census_raw.tail())

shape: (5, 10)
┌─────────┬─────────┬─────────┬─────────┬───┬─────────┬───────┬─────────┬──────┐
│ Region  ┆ Region  ┆ Region  ┆ Variabl ┆ … ┆ Age     ┆ Scale ┆ Units   ┆ 2008 │
│ ---     ┆ Name    ┆ - Regio ┆ e       ┆   ┆ Name    ┆ ---   ┆ ---     ┆ ---  │
│ str     ┆ ---     ┆ nId     ┆ ---     ┆   ┆ ---     ┆ str   ┆ str     ┆ i64  │
│         ┆ str     ┆ ---     ┆ str     ┆   ┆ str     ┆       ┆         ┆      │
│         ┆         ┆ str     ┆         ┆   ┆         ┆       ┆         ┆      │
╞═════════╪═════════╪═════════╪═════════╪═══╪═════════╪═══════╪═════════╪══════╡
│ KN.A11  ┆ Eastern ┆ SS-EE   ┆ KN.B8   ┆ … ┆ 60 to   ┆ units ┆ Persons ┆ 5274 │
│         ┆ Equator ┆         ┆         ┆   ┆ 64      ┆       ┆         ┆      │
│         ┆ ia      ┆         ┆         ┆   ┆         ┆       ┆         ┆      │
│ KN.A11  ┆ Eastern ┆ SS-EE   ┆ KN.B8   ┆ … ┆ 65+     ┆ units ┆ Persons ┆ 8637 │
│         ┆ Equator ┆         ┆         ┆   ┆         ┆       ┆         ┆      │
│         ┆ i

In [31]:
# Pull out 7 randomly sampled rows; the fixed seed makes the same rows
# come back on every run, so the cell output is reproducible
print(census_raw.sample(7, seed=42))

shape: (7, 10)
┌────────┬─────────┬─────────┬─────────┬───┬─────────┬───────┬─────────┬───────┐
│ Region ┆ Region  ┆ Region  ┆ Variabl ┆ … ┆ Age     ┆ Scale ┆ Units   ┆ 2008  │
│ ---    ┆ Name    ┆ - Regio ┆ e       ┆   ┆ Name    ┆ ---   ┆ ---     ┆ ---   │
│ str    ┆ ---     ┆ nId     ┆ ---     ┆   ┆ ---     ┆ str   ┆ str     ┆ i64   │
│        ┆ str     ┆ ---     ┆ str     ┆   ┆ str     ┆       ┆         ┆       │
│        ┆         ┆ str     ┆         ┆   ┆         ┆       ┆         ┆       │
╞════════╪═════════╪═════════╪═════════╪═══╪═════════╪═══════╪═════════╪═══════╡
│ KN.A11 ┆ Eastern ┆ SS-EE   ┆ KN.B8   ┆ … ┆ 10 to   ┆ units ┆ Persons ┆ 61423 │
│        ┆ Equator ┆         ┆         ┆   ┆ 14      ┆       ┆         ┆       │
│        ┆ ia      ┆         ┆         ┆   ┆         ┆       ┆         ┆       │
│ KN.A4  ┆ Unity   ┆ SS-UY   ┆ KN.B5   ┆ … ┆ 55 to   ┆ units ┆ Persons ┆ 4497  │
│        ┆         ┆         ┆         ┆   ┆ 59      ┆       ┆         ┆       │
│ KN.A4  ┆ Un

In [17]:
# Print a compact overview: one line per column with its data type and first few values
census_raw.glimpse()

Rows: 453
Columns: 10
$ Region            <str> 'KN.A2', 'KN.A2', 'KN.A2', 'KN.A2', 'KN.A2', 'KN.A2', 'KN.A2', 'KN.A2', 'KN.A2', 'KN.A2'
$ Region Name       <str> 'Upper Nile', 'Upper Nile', 'Upper Nile', 'Upper Nile', 'Upper Nile', 'Upper Nile', 'Upper Nile', 'Upper Nile', 'Upper Nile', 'Upper Nile'
$ Region - RegionId <str> 'SS-NU', 'SS-NU', 'SS-NU', 'SS-NU', 'SS-NU', 'SS-NU', 'SS-NU', 'SS-NU', 'SS-NU', 'SS-NU'
$ Variable          <str> 'KN.B2', 'KN.B2', 'KN.B2', 'KN.B2', 'KN.B2', 'KN.B2', 'KN.B2', 'KN.B2', 'KN.B2', 'KN.B2'
$ Variable Name     <str> 'Population, Total (Number)', 'Population, Total (Number)', 'Population, Total (Number)', 'Population, Total (Number)', 'Population, Total (Number)', 'Population, Total (Number)', 'Population, Total (Number)', 'Population, Total (Number)', 'Population, Total (Number)', 'Population, Total (Number)'
$ Age               <str> 'KN.C1', 'KN.C2', 'KN.C3', 'KN.C4', 'KN.C5', 'KN.C6', 'KN.C7', 'KN.C8', 'KN.C9', 'KN.C10'
$ Age Name          <str> '

In [32]:
# List the column names of the raw census frame
census_raw.columns

['Region',
 'Region Name',
 'Region - RegionId',
 'Variable',
 'Variable Name',
 'Age',
 'Age Name',
 'Scale',
 'Units',
 '2008']

In [21]:
# Import only the columns of interest.
# Reads from `file_path`, which is defined in the first CSV import cell; the
# previously used name `path` is not defined by any earlier cell, so this
# cell raised a NameError on a fresh kernel.
census_raw2 = (
    pl.read_csv(
        file_path,
        null_values='NA',
        columns=['Region Name', 'Variable Name', 'Age Name', '2008']
    )
)

# Random peek at 15 rows; the fixed seed keeps the sample reproducible
print(census_raw2.sample(15, seed=42))

shape: (15, 4)
┌─────────────────────────┬─────────────────────────────┬──────────┬────────┐
│ Region Name             ┆ Variable Name               ┆ Age Name ┆ 2008   │
│ ---                     ┆ ---                         ┆ ---      ┆ ---    │
│ str                     ┆ str                         ┆ str      ┆ i64    │
╞═════════════════════════╪═════════════════════════════╪══════════╪════════╡
│ Northern Bahr el Ghazal ┆ Population, Total (Number)  ┆ 60 to 64 ┆ 11636  │
│ Upper Nile              ┆ Population, Male (Number)   ┆ 40 to 44 ┆ 22290  │
│ Unity                   ┆ Population, Female (Number) ┆ 20 to 24 ┆ 25687  │
│ Central Equatoria       ┆ Population, Female (Number) ┆ 50 to 54 ┆ 11201  │
│ Warrap                  ┆ Population, Female (Number) ┆ 55 to 59 ┆ 7015   │
│ …                       ┆ …                           ┆ …        ┆ …      │
│ Eastern Equatoria       ┆ Population, Female (Number) ┆ 30 to 34 ┆ 30734  │
│ Northern Bahr el Ghazal ┆ Population, Female (N

In [35]:
# Keep every column whose name ends with 'Name', plus the '2008' column
print(census_raw.select(cs.ends_with('Name'), '2008'))

shape: (453, 4)
┌─────────────────────────────┬────────────────────────────┬──────────┬────────┐
│ Region Name                 ┆ Variable Name              ┆ Age Name ┆ 2008   │
│ ---                         ┆ ---                        ┆ ---      ┆ ---    │
│ str                         ┆ str                        ┆ str      ┆ i64    │
╞═════════════════════════════╪════════════════════════════╪══════════╪════════╡
│ Upper Nile                  ┆ Population, Total (Number) ┆ Total    ┆ 964353 │
│ Upper Nile                  ┆ Population, Total (Number) ┆ 0 to 4   ┆ 150872 │
│ Upper Nile                  ┆ Population, Total (Number) ┆ 5 to 9   ┆ 151467 │
│ Upper Nile                  ┆ Population, Total (Number) ┆ 10 to 14 ┆ 126140 │
│ Upper Nile                  ┆ Population, Total (Number) ┆ 15 to 19 ┆ 103804 │
│ …                           ┆ …                          ┆ …        ┆ …      │
│ Eastern Equatoria           ┆ Population, Female         ┆ 60 to 64 ┆ 5274   │
│           