# 非常に高速なDataFrameライブラリー：Polars
Polarsのホームページ：https://www.pola.rs/

## 1.パッケージのインストール
pip installでpolarsをインストールします。

In [1]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 2.基本的なDataFrameの作成：pl.DataFrame
まずは基本的なDataFrameの作成を行います。以下ではnumpyで行列データを生成して、それを用いてDataFrameを作成しています。

In [2]:
import polars as pl
import numpy as np

# Build a 6x5 integer matrix holding 0..29.
data = np.arange(30).reshape((6, 5))

# Name the five columns "1".."5". The `columns=` keyword raised a warning in
# the recorded run (it is deprecated); `schema=` is its replacement.
df = pl.DataFrame(data, schema=list("12345"))
df

  df = pl.DataFrame(data, columns = "12345")


1,2,3,4,5
i64,i64,i64,i64,i64
0,1,2,3,4
5,6,7,8,9
10,11,12,13,14
15,16,17,18,19
20,21,22,23,24
25,26,27,28,29


以下の例では辞書型のデータを作成して、そのデータからDataFrameを作成しています。

In [3]:
# Column name -> column values; each key becomes one DataFrame column.
data = {
    "index": list(range(1, 6)),
    "data": ['one', 'two', 'three', 'four', 'five'],
    "data1": list(map(float, np.arange(1, 6))),
}

# Construct the frame from the dict and display it.
df = pl.DataFrame(data)
df

index,data,data1
i64,str,f64
1,"""one""",1.0
2,"""two""",2.0
3,"""three""",3.0
4,"""four""",4.0
5,"""five""",5.0


## 3.DataFrameのcsv出力・読込

### csvファイル出力：write_csv()
write_csv()でDataFrameをcsv形式のデータとして出力します。

In [4]:
# Write the current DataFrame to the file "output_data.csv".
df.write_csv("output_data.csv")

### csvファイルを読み込んでDataFrameに格納：read_csv()
read_csv()でcsv形式のデータを読み込んでDataFrameに格納します。

In [5]:
# Load "output_data.csv" into a DataFrame and display it.
df = pl.read_csv("output_data.csv")
df

index,data,data1
i64,str,f64
1,"""one""",1.0
2,"""two""",2.0
3,"""three""",3.0
4,"""four""",4.0
5,"""five""",5.0


## 4.DataFrameの一部を表示：head()、tail()、glimpse()
head()で先頭の5行、tail()で最後尾の5行をDataFrame形式で表示します。

glimpse()で各列の先頭10個の要素を文字列として表示します。

In [102]:
# 300 consecutive integers arranged as 100 rows x 3 columns named "1", "2", "3".
data = np.arange(300).reshape((100, 3))
df = pl.DataFrame(data, "123")

# Display the leading five rows (the same length head() uses by default).
df.head(5)

1,2,3
i64,i64,i64
0,1,2
3,4,5
6,7,8
9,10,11
12,13,14


In [7]:
# Display the trailing five rows (the same length tail() uses by default).
df.tail(5)

1,2,3
i64,i64,i64
285,286,287
288,289,290
291,292,293
294,295,296
297,298,299


In [101]:
# Print a text summary listing the first 10 values of every column.
print(df.glimpse())
# In the recorded run glimpse() returned a str.
# NOTE(review): newer Polars releases may print directly and return None
# unless return_as_string=True is passed — confirm against the installed version.
print('type:', type(df.glimpse()))

Rows: 100
Columns: 3
$ 1 <i64> 0, 3, 6, 9, 12, 15, 18, 21, 24, 27
$ 2 <i64> 1, 4, 7, 10, 13, 16, 19, 22, 25, 28
$ 3 <i64> 2, 5, 8, 11, 14, 17, 20, 23, 26, 29

type: <class 'str'>


## 5.列名の修正：alias()、prefix()、suffix()
列名に新しく別名を付ける際には、alias()を使用します。

列名の前後に任意の文字を付与したい場合には、.prefix()と.suffix()を使用します。

In [246]:
df = pl.DataFrame({
    "index": [*range(1, 6)],
    "data": ['one', 'two', 'three', 'four', 'five'],
    "data1": [float(i) for i in range(1, 6)],
})

# Keep every original column, then append three columns derived from data1
# that differ only in how the output name is produced.
df.select([
    pl.all(),
    (2 * pl.col('data1')).alias('data2'),  # explicit new column name
    pl.col('data1').prefix('Prefix_'),     # text prepended to the original name
    pl.col('data1').suffix('_Suffix'),     # text appended to the original name
])

index,data,data1,data2,Prefix_data1,data1_Suffix
i64,str,f64,f64,f64,f64
1,"""one""",1.0,2.0,1.0,1.0
2,"""two""",2.0,4.0,2.0,2.0
3,"""three""",3.0,6.0,3.0,3.0
4,"""four""",4.0,8.0,4.0,4.0
5,"""five""",5.0,10.0,5.0,5.0


## 6.要素の抽出

### 行を選択して抽出

In [10]:
# Rebuild the 100x3 integer frame, then pick out row 0
# (a one-row DataFrame in the recorded output).
data = np.arange(0, 300).reshape((-1, 3))
df = pl.DataFrame(data, "123")
df[0]

1,2,3
i64,i64,i64
0,1,2


In [11]:
# Extract rows 0 through 4 (five rows; the slice end 5 is exclusive).
df[0:5]

1,2,3
i64,i64,i64
0,1,2
3,4,5
6,7,8
9,10,11
12,13,14


### 列を選択して抽出：get_column()、select()

In [248]:
# Rebuild the 100x3 integer frame.
data = np.reshape(np.arange(300), (100, 3))
df = pl.DataFrame(data, "123")

# All rows of the column at position 0.
df[:, 0]

1
i64
0
3
6
9
12
15
18
21
24
27


In [250]:
# Extract a single column by its name.
df.get_column('1')

1
i64
0
3
6
9
12
15
18
21
24
27


In [127]:
# Small three-column sample frame (int, str, float).
data = {
    "index": np.arange(1, 6).tolist(),
    "data": ['one', 'two', 'three', 'four', 'five'],
    "data1": np.arange(1, 6, dtype=float).tolist(),
}
df = pl.DataFrame(data)

# Keep only the 'index' column; the result is still a DataFrame.
df.select('index')

index
i64
1
2
3
4
5


In [14]:
# Extract several columns at once by passing a list of names.
df.select(['index', 'data1'])

index,data1
i64,f64
1,1.0
2,2.0
3,3.0
4,4.0
5,5.0


In [121]:
# Extract the 'index' column using a pl.col() expression.
df.select(pl.col('index'))

index
i64
1
2
3
4
5


In [105]:
# Select 'index' twice: once as-is and once renamed to 'index_modify'.
# Use .alias() whenever an expression needs a new column name.
df.select([
    pl.col('index'),
    pl.col('index').alias('index_modify')
])

index,index_modify
i64,i64
1,1
2,2
3,3
4,4
5,5


In [15]:
# Select every column except the excluded one; here 'index' is dropped.
df.select([
    pl.exclude('index')
])

data,data1
str,f64
"""one""",1.0
"""two""",2.0
"""three""",3.0
"""four""",4.0
"""five""",5.0


### 行に対するフィルタリング：filter()

In [16]:
# Sample frame used by the filter() examples that follow.
data = {
    "index": list(range(1, 6)),
    "data": ['one', 'two', 'three', 'four', 'five'],
    "data1": np.arange(1.0, 6.0).tolist(),
}

df = pl.DataFrame(data)

In [17]:
# Keep only the rows whose 'index' value is less than 3.
df.filter(pl.col('index') < 3)

index,data,data1
i64,str,f64
1,"""one""",1.0
2,"""two""",2.0


In [18]:
# Combine several conditions with &; each condition needs its own parentheses.
df.filter((pl.col('index') < 3) & (pl.col('data') == 'one'))

index,data,data1
i64,str,f64
1,"""one""",1.0


In [19]:
# Apply the combined row filter, then keep only the 'data1' column.
df.filter((pl.col('index') < 3) & (pl.col('data') == 'one')).select("data1")

data1
f64
1.0


## 7.条件分岐：pl.when()

In [130]:
# Sample frame used by the pl.when() example that follows.
data = {
    "index": [*range(1, 6)],
    "data": ['one', 'two', 'three', 'four', 'five'],
    "data1": list(map(float, range(1, 6))),
}

df = pl.DataFrame(data)

In [129]:
# Branch per row with pl.when(): rows where data1 > 3 get the then() value,
# all other rows get the otherwise() value.
# NOTE(review): the threshold is 3 while the labels and alias say "2", and
# ' under 2' carries a leading space; left unchanged so the cell still matches
# the recorded output below.
df.select(pl.when(pl.col('data1') > 3).then('over 2').otherwise(' under 2').alias('over2 or under2'))

over2 or under2
str
""" under 2"""
""" under 2"""
""" under 2"""
"""over 2"""
"""over 2"""


## 8.データの統計値を表示



### DataFrame全体の統計値を算出：describe()
describe()は各列について以下の統計値を表示します。

*   データ数（count）
*   欠損値の数（null_count）
*   平均値（mean）
*   標準偏差（std）
*   最小値（min）
*   最大値（max）
*   中央値（median）

In [124]:
# 100x3 frame of the integers 0..299 used by the statistics examples.
data = np.arange(300).reshape((-1, 3))
df = pl.DataFrame(data, "123")
df

1,2,3
i64,i64,i64
0,1,2
3,4,5
6,7,8
9,10,11
12,13,14
15,16,17
18,19,20
21,22,23
24,25,26
27,28,29


In [125]:
# Compute the sum of every column.
df.sum()

1,2,3
i64,i64,i64
14850,14950,15050


In [126]:
# Compute the mean of every column.
df.mean()

1,2,3
f64,f64,f64
148.5,149.5,150.5


In [100]:
# Summary statistics (count, null_count, mean, std, min, max, median) for every column.
df.describe()

describe,1,2,3
str,f64,f64,f64
"""count""",100.0,100.0,100.0
"""null_count""",0.0,0.0,0.0
"""mean""",148.5,149.5,150.5
"""std""",87.034476,87.034476,87.034476
"""min""",0.0,1.0,2.0
"""max""",297.0,298.0,299.0
"""median""",148.5,149.5,150.5


### 任意の列での統計値の算出：polars.sum()、polars.min()、polars.max()、polars.mean()
ある1列の統計値を算出

*   要素数：pl.col('{列名}').count()
*   合計値：pl.sum('{列名}')
*   最小値：pl.min('{列名}')
*   最大値：pl.max('{列名}') または pl.col('{列名}').max()
*   平均値：pl.mean('{列名}')
*   標準偏差：pl.std('{列名}')
*   分散：pl.var('{列名}')

In [112]:
# Compute several statistics for the single column '1'.
# select() itself is introduced in section 6 (element extraction).
df.select([
    pl.col('1').count().alias('count'),
    pl.sum('1').alias('sum'),
    pl.min('1').alias('min'),
    pl.max('1').alias('max'),
    pl.col('1').max().alias('other_max'),  # expression form; same value as pl.max('1') in the recorded output
    pl.mean('1').alias('mean'),
    pl.std('1').alias('std'),
    pl.var('1').alias('var'),
])

count,sum,min,max,other_max,mean,std,var
u32,i64,i64,i64,i64,f64,f64,f64
100,14850,0,297,297,148.5,87.034476,7575.0


## 9.列の追加：with_columns

In [90]:
# Sample frame used by the with_columns() example that follows.
data = {
    "index": list(range(1, 6)),
    "data": ['one', 'two', 'three', 'four', 'five'],
    "data1": [float(i) for i in range(1, 6)],
}

df = pl.DataFrame(data)
df

index,data,data1
i64,str,f64
1,"""one""",1.0
2,"""two""",2.0
3,"""three""",3.0
4,"""four""",4.0
5,"""five""",5.0


In [92]:
# Append derived columns with with_columns(); the existing columns are kept.
df.with_columns([
    (pl.col('index') * 2).alias('index*2'),  # stays integer (i64 in the recorded output)
    (pl.col('index') / 2).alias('index/2')   # division yields f64 in the recorded output
    ])

index,data,data1,index*2,index/2
i64,str,f64,i64,f64
1,"""one""",1.0,2,0.5
2,"""two""",2.0,4,1.0
3,"""three""",3.0,6,1.5
4,"""four""",4.0,8,2.0
5,"""five""",5.0,10,2.5


## 10.グループごとの抽出、計算：groupby()

In [113]:
# Ten rows: a group label (A/B/C) paired with the numbers 1..10.
data = {
    "Group": list("ACBBACABCA"),
    "Num": list(range(1, 11)),
}
df = pl.DataFrame(data)
df

Group,Num
str,i64
"""A""",1
"""C""",2
"""B""",3
"""B""",4
"""A""",5
"""C""",6
"""A""",7
"""B""",8
"""C""",9
"""A""",10


In [83]:
# Count how many rows belong to each distinct value of 'Group'.
df.groupby('Group').count()

Group,count
str,u32
"""A""",4
"""C""",3
"""B""",3


In [82]:
# Sum the 'Num' values within each 'Group'.
df.groupby('Group').sum()

Group,Num
str,i64
"""A""",23
"""C""",17
"""B""",15


In [84]:
# For each 'Group', compute the row count and the sum of 'Num' in one agg() call.
# NOTE(review): the recorded outputs show differing row orders for the same
# grouping, so do not rely on the order here — sort explicitly if it matters.
df.groupby('Group').agg([
    pl.col('*').count().alias('count'),
    pl.col('*').sum().alias('sum')
])

Group,count,sum
str,u32,i64
"""A""",4,23
"""B""",3,15
"""C""",3,17


In [85]:
# Same aggregation, followed by .sort() to order the result by 'Group'.

df.groupby('Group').agg([
    pl.col('*').count().alias('count'),
    pl.col('*').sum().alias('sum')
]).sort('Group')

Group,count,sum
str,u32,i64
"""A""",4,23
"""B""",3,15
"""C""",3,17


## 11.日付と時刻のデータ分析

### 時間データの生成：polars.date_range()

In [227]:
# `datetime` is otherwise first imported further down the notebook; import it
# here so this cell also works when the notebook is run from top to bottom.
from datetime import datetime

# Generate dates at 1-day intervals from 2023-01-01 through 2023-12-31.
df = pl.date_range(low=datetime(2023, 1, 1), high=datetime(2023, 12, 31), interval="1d", name="time")
df

time
datetime[μs]
2023-01-01 00:00:00
2023-01-02 00:00:00
2023-01-03 00:00:00
2023-01-04 00:00:00
2023-01-05 00:00:00
2023-01-06 00:00:00
2023-01-07 00:00:00
2023-01-08 00:00:00
2023-01-09 00:00:00
2023-01-10 00:00:00


In [229]:
# Generate dates at 1-week intervals across 2023.
df = pl.date_range(low=datetime(2023, 1, 1), high=datetime(2023, 12, 31), interval="1w", name="time")
df

time
datetime[μs]
2023-01-01 00:00:00
2023-01-08 00:00:00
2023-01-15 00:00:00
2023-01-22 00:00:00
2023-01-29 00:00:00
2023-02-05 00:00:00
2023-02-12 00:00:00
2023-02-19 00:00:00
2023-02-26 00:00:00
2023-03-05 00:00:00


In [232]:
# Generate dates at 1-month intervals across 2023.
df = pl.date_range(low=datetime(2023, 1, 1), high=datetime(2023, 12, 31), interval="1mo", name="time")
df

time
datetime[μs]
2023-01-01 00:00:00
2023-02-01 00:00:00
2023-03-01 00:00:00
2023-04-01 00:00:00
2023-05-01 00:00:00
2023-06-01 00:00:00
2023-07-01 00:00:00
2023-08-01 00:00:00
2023-09-01 00:00:00
2023-10-01 00:00:00


### 文字列から日付・時刻データへの変換：str.strptime()

In [150]:
# `timedelta` (and `datetime`) are otherwise first imported further down the
# notebook; import them here so the cell works when run from top to bottom.
from datetime import datetime, timedelta

# Ten consecutive days starting at 2023-01-01, stored as a datetime column.
df = pl.DataFrame({
    "index": np.arange(10),
    "Timestamp": [datetime(2023, 1, 1) + timedelta(days=idx) for idx in range(10)]
})
df

index,Timestamp
i64,datetime[μs]
0,2023-01-01 00:00:00
1,2023-01-02 00:00:00
2,2023-01-03 00:00:00
3,2023-01-04 00:00:00
4,2023-01-05 00:00:00
5,2023-01-06 00:00:00
6,2023-01-07 00:00:00
7,2023-01-08 00:00:00
8,2023-01-09 00:00:00
9,2023-01-10 00:00:00


In [160]:
# Dates kept as plain text ("2023-1-1" .. "2023-1-10"), without zero padding.
df = pl.DataFrame({
    "index": np.arange(10),
    "Timestamp": [f"2023-1-{day}" for day in range(1, 11)],
})
df

index,Timestamp
i64,str
0,"""2023-1-1"""
1,"""2023-1-2"""
2,"""2023-1-3"""
3,"""2023-1-4"""
4,"""2023-1-5"""
5,"""2023-1-6"""
6,"""2023-1-7"""
7,"""2023-1-8"""
8,"""2023-1-9"""
9,"""2023-1-10"""


In [162]:
# Keep all columns and add 'Date': the text in 'Timestamp' parsed as a
# pl.Date using the year-month-day format.
df.select([
    pl.col('*'),
    pl.col('Timestamp').str.strptime(pl.Date, "%Y-%m-%d").alias('Date')
])

index,Timestamp,Date
i64,str,date
0,"""2023-1-1""",2023-01-01
1,"""2023-1-2""",2023-01-02
2,"""2023-1-3""",2023-01-03
3,"""2023-1-4""",2023-01-04
4,"""2023-1-5""",2023-01-05
5,"""2023-1-6""",2023-01-06
6,"""2023-1-7""",2023-01-07
7,"""2023-1-8""",2023-01-08
8,"""2023-1-9""",2023-01-09
9,"""2023-1-10""",2023-01-10


In [164]:
# Timestamps kept as plain text, one per day at 09:00:00.
df = pl.DataFrame({
    "index": np.arange(4),
    "Timestamp": [f"2023-1-{day} 09:00:00" for day in range(1, 5)],
})
df

index,Timestamp
i64,str
0,"""2023-1-1 09:00..."
1,"""2023-1-2 09:00..."
2,"""2023-1-3 09:00..."
3,"""2023-1-4 09:00..."


In [165]:
# Keep all columns and add 'Datetime': the text in 'Timestamp' parsed as a
# pl.Datetime using a date-plus-time format.
df.select([
    pl.col('*'),
    pl.col('Timestamp').str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias('Datetime')
])

index,Timestamp,Datetime
i64,str,datetime[μs]
0,"""2023-1-1 09:00...",2023-01-01 09:00:00
1,"""2023-1-2 09:00...",2023-01-02 09:00:00
2,"""2023-1-3 09:00...",2023-01-03 09:00:00
3,"""2023-1-4 09:00...",2023-01-04 09:00:00


### 年月日、時間、四半期の要素を抽出：dt.year()、dt.month()、dt.day()
各要素の抽出方法の詳細は以下のリンクを確認

https://pola-rs.github.io/polars/py-polars/html/reference/series/timeseries.html

In [215]:
# Four text timestamps, one per day at 09:00:00.
df = pl.DataFrame({
    "index": np.arange(4),
    "Timestamp": [f"2023-1-{day} 09:00:00" for day in range(1, 5)],
})

# Parse the text into a real Datetime column named 'Datetime'.
df_date = df.select([
    pl.all(),
    pl.col('Timestamp').str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias('Datetime'),
])

# Split the parsed timestamps into calendar and clock components via the .dt namespace.
df_date.select([
    pl.all(),
    pl.col('Datetime').dt.year().alias('Year'),
    pl.col('Datetime').dt.month().alias('Month'),
    pl.col('Datetime').dt.day().alias('Day'),
    pl.col('Datetime').dt.hour().alias('Hour'),
    pl.col('Datetime').dt.minute().alias('Minute'),
    pl.col('Datetime').dt.microsecond().alias('Microsecond'),
    pl.col('Datetime').dt.quarter().alias('Quarter'),
])

index,Timestamp,Datetime,Year,Month,Day,Hour,Minute,Microsecond,Quarter
i64,str,datetime[μs],i32,u32,u32,u32,u32,u32,u32
0,"""2023-1-1 09:00...",2023-01-01 09:00:00,2023,1,1,9,0,0,1
1,"""2023-1-2 09:00...",2023-01-02 09:00:00,2023,1,2,9,0,0,1
2,"""2023-1-3 09:00...",2023-01-03 09:00:00,2023,1,3,9,0,0,1
3,"""2023-1-4 09:00...",2023-01-04 09:00:00,2023,1,4,9,0,0,1


### 日付列のフィルタリング：filter()

In [220]:
from datetime import datetime

# Four text dates, parsed below into a pl.Date column for the filter examples.
df = pl.DataFrame({
    "index": np.arange(4),
    "Timestamp": [f"2023-1-{day}" for day in range(1, 5)],
})

df_date = df.select([
    pl.all(),
    pl.col('Timestamp').str.strptime(pl.Date, "%Y-%m-%d").alias('Date'),
])
df_date

index,Timestamp,Date
i64,str,date
0,"""2023-1-1""",2023-01-01
1,"""2023-1-2""",2023-01-02
2,"""2023-1-3""",2023-01-03
3,"""2023-1-4""",2023-01-04


In [223]:
# Keep only the rows whose 'Date' equals the given date.

df_date.filter(pl.col('Date') == datetime(2023,1,3))

index,Timestamp,Date
i64,str,date
2,"""2023-1-3""",2023-01-03


In [225]:
# Keep only the rows whose 'Date' lies in a given range, using is_between().
# Here: Jan 2 through Jan 3; both end dates appear in the recorded output.

df_date.filter(pl.col('Date').is_between(datetime(2023,1,2), datetime(2023,1,3)))

index,Timestamp,Date
i64,str,date
1,"""2023-1-2""",2023-01-02
2,"""2023-1-3""",2023-01-03


## 12.欠損値の処理

### 欠損値の有無確認：null_count()、is_null()

In [167]:
# Sample frame whose 'data' column has two missing values (None).
df = pl.DataFrame({
    "index": np.arange(6),
    "data": [0, None, 1, 2, None, 3]
})
df

index,data
i64,i64
0,0.0
1,
2,1.0
3,2.0
4,
5,3.0


In [168]:
# Count the missing values in each column with null_count().
df.null_count()

index,data
u32,u32
0,2


In [172]:
# Flag, row by row, whether the 'data' column is missing, using is_null().
df.select([
    pl.col('*'),
    pl.col('data').is_null().alias('is_null')
])

index,data,is_null
i64,i64,bool
0,0.0,False
1,,True
2,1.0,False
3,2.0,False
4,,True
5,3.0,False


### 欠損箇所を埋める：fill_null()

In [182]:
# Fill missing values with a fixed value: fill_null(pl.lit(...)).
# Here the missing values are replaced by 2.
df.select([
    pl.col('*'),
    pl.col('data').fill_null(pl.lit(2)).alias('fill_null') # column with missing values filled by 2
])

index,data,fill_null
i64,i64,i64
0,0.0,0
1,,2
2,1.0,1
3,2.0,2
4,,2
5,3.0,3


In [183]:
# Fill missing values with a statistic of the column.
# Here the missing values are replaced by the column mean.
df.select([
    pl.col('*'),
    pl.col('data').fill_null(pl.mean('data')).alias('fill_null(mean)') # column with missing values filled by the column mean
])

index,data,fill_null(mean)
i64,i64,f64
0,0.0,0.0
1,,1.5
2,1.0,1.0
3,2.0,2.0
4,,1.5
5,3.0,3.0


In [185]:
# Fill missing values by interpolation.
# NOTE(review): in the recorded output the result stays i64, so the filled
# values appear as 0 and 2 rather than 0.5 and 2.5; presumably casting 'data'
# to a float dtype first would keep the fractions — confirm.
df.select([
    pl.col('*'),
    pl.col('data').interpolate().alias('interpolate') # column with missing values filled by interpolation
])

index,data,interpolate
i64,i64,i64
0,0.0,0
1,,0
2,1.0,1
3,2.0,2
4,,2
5,3.0,3


## 13.DataFrameの結合

### join()での結合

In [73]:
from datetime import datetime, timedelta

# Left table: one group label for each Index 1..10.
df1 = pl.DataFrame({
    "Index": np.arange(1, 11),
    "Group": ['A', 'C', 'B', 'B', 'A', 'C', 'A', 'B', 'C', 'A']
    })

# Right table: the same Index keys plus a date, a random float and a column with NaN gaps.
df2 = pl.DataFrame({
    "Index": np.arange(1, 11),
    "a": [datetime(2023, 1, 1) + timedelta(days=idx) for idx in range(10)],
    "b": np.random.rand(10),
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    "c": [np.nan if i % 3 == 0 else i*2 for i in range(10)]
    })

In [64]:
# Display the left table.
df1

Index,Group
i64,str
1,"""A"""
2,"""C"""
3,"""B"""
4,"""B"""
5,"""A"""
6,"""C"""
7,"""A"""
8,"""B"""
9,"""C"""
10,"""A"""


In [65]:
# Display the right table.
df2

Index,a,b,c
i64,datetime[μs],f64,f64
1,2023-01-01 00:00:00,0.994982,
2,2023-01-02 00:00:00,0.079731,2.0
3,2023-01-03 00:00:00,0.748136,4.0
4,2023-01-04 00:00:00,0.100334,
5,2023-01-05 00:00:00,0.742517,8.0
6,2023-01-06 00:00:00,0.117552,10.0
7,2023-01-07 00:00:00,0.462989,
8,2023-01-08 00:00:00,0.279232,14.0
9,2023-01-09 00:00:00,0.282169,16.0
10,2023-01-10 00:00:00,0.290899,


In [63]:
# Join df1 and df2 on the 'Index' column.
# The default is an inner join (how='inner'): only keys present in both frames are kept.
df1.join(df2, on=["Index"])

Index,Group,a,b,c
i64,str,datetime[μs],f64,f64
1,"""A""",2023-01-01 00:00:00,0.994982,
2,"""C""",2023-01-02 00:00:00,0.079731,2.0
3,"""B""",2023-01-03 00:00:00,0.748136,4.0
4,"""B""",2023-01-04 00:00:00,0.100334,
5,"""A""",2023-01-05 00:00:00,0.742517,8.0
6,"""C""",2023-01-06 00:00:00,0.117552,10.0
7,"""A""",2023-01-07 00:00:00,0.462989,
8,"""B""",2023-01-08 00:00:00,0.279232,14.0
9,"""C""",2023-01-09 00:00:00,0.282169,16.0
10,"""A""",2023-01-10 00:00:00,0.290899,


In [74]:
# Left table: one group label for each Index 1..10.
df1 = pl.DataFrame({
    "Index": np.arange(1, 11),
    "Group": ['A', 'C', 'B', 'B', 'A', 'C', 'A', 'B', 'C', 'A']
    })

# Right table: only the odd Index keys (1, 3, 5, 7, 9), so it covers half of df1's keys.
df2 = pl.DataFrame({
    "Index": np.arange(1, 11, 2),
    "a": [datetime(2023, 1, 1) + timedelta(days=idx) for idx in range(5)],
    "b": np.random.rand(5),
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    "c": [np.nan if i % 3 == 0 else i*2 for i in range(5)]
    })

In [75]:
# Join df1 and df2 on the 'Index' column.
# The default is an inner join (how='inner'): only keys present in both frames are kept.
df1.join(df2, on=["Index"])

Index,Group,a,b,c
i64,str,datetime[μs],f64,f64
1,"""A""",2023-01-01 00:00:00,0.885355,
3,"""B""",2023-01-02 00:00:00,0.842026,2.0
5,"""A""",2023-01-03 00:00:00,0.050906,4.0
7,"""A""",2023-01-04 00:00:00,0.888924,
9,"""C""",2023-01-05 00:00:00,0.929552,8.0


In [77]:
# Join df1 and df2 on the 'Index' column.
# Left outer join (how='left'): every row of df1 is kept; unmatched rows have empty df2 columns.
df1.join(df2, on=["Index"], how = 'left')

Index,Group,a,b,c
i64,str,datetime[μs],f64,f64
1,"""A""",2023-01-01 00:00:00,0.885355,
2,"""C""",,,
3,"""B""",2023-01-02 00:00:00,0.842026,2.0
4,"""B""",,,
5,"""A""",2023-01-03 00:00:00,0.050906,4.0
6,"""C""",,,
7,"""A""",2023-01-04 00:00:00,0.888924,
8,"""B""",,,
9,"""C""",2023-01-05 00:00:00,0.929552,8.0
10,"""A""",,,


In [76]:
# Join df1 and df2 on the 'Index' column.
# Full outer join (how='outer'). Every df2 key also exists in df1 here, so the
# recorded output is identical to the left join.
df1.join(df2, on=["Index"], how = 'outer')

Index,Group,a,b,c
i64,str,datetime[μs],f64,f64
1,"""A""",2023-01-01 00:00:00,0.885355,
2,"""C""",,,
3,"""B""",2023-01-02 00:00:00,0.842026,2.0
4,"""B""",,,
5,"""A""",2023-01-03 00:00:00,0.050906,4.0
6,"""C""",,,
7,"""A""",2023-01-04 00:00:00,0.888924,
8,"""B""",,,
9,"""C""",2023-01-05 00:00:00,0.929552,8.0
10,"""A""",,,


### polars.concatでの結合

In [244]:
# Column-wise concatenation (how='horizontal'): df2's columns are placed to the right of df1's.
df1 = pl.DataFrame({
    '1': [1, 2],
    '2': [3, 4]
    })

df2 = pl.DataFrame({
    '3': [5, 6],
    '4': [7, 8]
    })

pl.concat([df1, df2], how='horizontal')

1,2,3,4
i64,i64,i64,i64
1,3,5,7
2,4,6,8


In [245]:
# Column-wise concatenation of two 10-row frames. Rows are paired by position;
# no key matching takes place, so both X1 and X2 remain in the result.
df1 = pl.DataFrame({
    "X1": np.arange(1, 11),
    "Group": ['A', 'C', 'B', 'B', 'A', 'C', 'A', 'B', 'C', 'A']
    })

df2 = pl.DataFrame({
    "X2": np.arange(1, 11),
    "a": [datetime(2023, 1, 1) + timedelta(days=idx) for idx in range(10)],
    "b": np.random.rand(10),
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    "c": [np.nan if i % 3 == 0 else i*2 for i in range(10)]
    })

pl.concat([df1, df2], how='horizontal')

X1,Group,X2,a,b,c
i64,str,i64,datetime[μs],f64,f64
1,"""A""",1,2023-01-01 00:00:00,0.632568,
2,"""C""",2,2023-01-02 00:00:00,0.018141,2.0
3,"""B""",3,2023-01-03 00:00:00,0.800396,4.0
4,"""B""",4,2023-01-04 00:00:00,0.931167,
5,"""A""",5,2023-01-05 00:00:00,0.691873,8.0
6,"""C""",6,2023-01-06 00:00:00,0.905154,10.0
7,"""A""",7,2023-01-07 00:00:00,0.855118,
8,"""B""",8,2023-01-08 00:00:00,0.804342,14.0
9,"""C""",9,2023-01-09 00:00:00,0.538047,16.0
10,"""A""",10,2023-01-10 00:00:00,0.179667,


In [241]:
# Row-wise concatenation (how='vertical'): df2's rows are appended below df1's.
# Both frames here have the same column names.
df1 = pl.DataFrame({
    '1': [1, 2],
    '2': [3, 4]
    })

df2 = pl.DataFrame({
    '1': [5, 6],
    '2': [7, 8]
    })

pl.concat([df1, df2], how='vertical')

1,2
i64,i64
1,3
2,4
5,7
6,8


In [242]:
# Diagonal concatenation (how='diagonal'): rows are stacked and the result has
# the union of both frames' columns; cells a frame does not provide are empty
# in the recorded output.
df1 = pl.DataFrame({
    '1': [1, 2],
    '2': [3, 4]
    })

df2 = pl.DataFrame({
    '1': [3, 4],
    '3': [4, 5]
    })

pl.concat([df1, df2], how='diagonal')

1,2,3
i64,i64,i64
1,3.0,
2,4.0,
3,,4.0
4,,5.0
