# [第2章: UNIXコマンド](https://nlp100.github.io/ja/ch02.html)

In [1]:
import os
from pathlib import Path

import polars as pl


# Input file, resolved against the current working directory.
FILE_PATH = Path.cwd() / "popular-names.txt"

# Tab-separated input with no header row; polars names the columns
# column_1, column_2, ... automatically.
df = pl.read_csv(
    FILE_PATH,
    has_header=False,
    separator="\t",
)
df

column_1,column_2,column_3,column_4
str,str,i64,i64
"""Mary""","""F""",7065,1880
"""Anna""","""F""",2604,1880
"""Emma""","""F""",2003,1880
"""Elizabeth""","""F""",1939,1880
"""Minnie""","""F""",1746,1880
"""Margaret""","""F""",1578,1880
"""Ida""","""F""",1472,1880
"""Alice""","""F""",1414,1880
"""Bertha""","""F""",1320,1880
"""Sarah""","""F""",1288,1880


## 10. 行数のカウント

In [2]:
# Number of rows in the frame (the `wc -l` equivalent for this task).
df.height

2780

## 11. タブをスペースに置換

In [3]:
# Destination for the space-delimited copy, in the current working directory.
REPLACED_FILE = Path.cwd() / "replaced.txt"

# Write the frame back out with a single space as the field separator
# instead of the tab it was read with; no header row is emitted.
df.write_csv(
    REPLACED_FILE,
    separator=" ",
    has_header=False,
)

## 12. 1列目をcol1.txtに，2列目をcol2.txtに保存

In [4]:
# One output file per extracted column, both in the current working directory.
COLUMN_1_FILE = Path.cwd() / "col1.txt"
COLUMN_2_FILE = Path.cwd() / "col2.txt"

# Select each column by position (via its name) as a one-column frame and
# write it without a header row.
df.select(df.columns[0]).write_csv(COLUMN_1_FILE, has_header=False)
df.select(df.columns[1]).write_csv(COLUMN_2_FILE, has_header=False)

## 13. col1.txtとcol2.txtをマージ

In [5]:
# Destination for the re-joined two-column file.
MERGED_FILE = Path.cwd() / "merged.txt"

# Read back the two single-column files written earlier (no header rows).
col1 = pl.read_csv(COLUMN_1_FILE, has_header=False)
col2 = pl.read_csv(COLUMN_2_FILE, has_header=False)

# Attach the only column of col2 to col1 under the name "column_2",
# then write the pair tab-separated without a header.
merged = col1.with_columns(
    col2.to_series(0).alias("column_2"),
)
merged.write_csv(
    MERGED_FILE,
    separator="\t",
    has_header=False,
)

## 14. 先頭からN行を出力

In [6]:
# Show the first 5 rows of the frame.
df.head(n=5)

column_1,column_2,column_3,column_4
str,str,i64,i64
"""Mary""","""F""",7065,1880
"""Anna""","""F""",2604,1880
"""Emma""","""F""",2003,1880
"""Elizabeth""","""F""",1939,1880
"""Minnie""","""F""",1746,1880


## 15. 末尾のN行を出力

In [7]:
# Show the last 5 rows of the frame.
df.tail(n=5)

column_1,column_2,column_3,column_4
str,str,i64,i64
"""Benjamin""","""M""",13381,2018
"""Elijah""","""M""",12886,2018
"""Lucas""","""M""",12585,2018
"""Mason""","""M""",12435,2018
"""Logan""","""M""",12352,2018


## 16. ファイルをN分割する

In [8]:
# Number of output files to split the data into.
N = 3

# Rows per chunk, rounded up so that N chunks always cover every row.
step = (df.height + N - 1) // N
for n in range(N):
    # Take `step` rows starting at this chunk's offset; the final chunk is
    # simply shorter when the row count is not a multiple of N.
    splitted = df.slice(n * step, step)
    splitted.write_csv(
        Path.cwd() / f"split-{n}.txt",
        separator="\t",
        has_header=False,
    )

## 17. １列目の文字列の異なり

In [9]:
# Distinct values of the first column, in ascending order.
df.to_series(0).unique().sort()

column_1
str
"""Abigail"""
"""Aiden"""
"""Alexander"""
"""Alexis"""
"""Alice"""
"""Amanda"""
"""Amelia"""
"""Amy"""
"""Andrew"""
"""Angela"""


## 18. 各行を3コラム目の数値の降順にソート

In [10]:
# Order rows by column_3 from largest to smallest; rows with equal column_3
# are ordered by column_1 ascending.
df.sort(
    "column_3",
    "column_1",
    descending=[True, False],
)

column_1,column_2,column_3,column_4
str,str,i64,i64
"""Linda""","""F""",99689,1947
"""Linda""","""F""",96211,1948
"""James""","""M""",94757,1947
"""Michael""","""M""",92704,1957
"""Robert""","""M""",91640,1947
"""Linda""","""F""",91016,1949
"""Michael""","""M""",90656,1956
"""Michael""","""M""",90517,1958
"""James""","""M""",88584,1948
"""Michael""","""M""",88528,1954


## 19. 各行の1コラム目の文字列の出現頻度を求め，出現頻度の高い順に並べる

In [11]:
# Frequency of each distinct value in column_1, most frequent first.
#
# group_by(...).count() already returns exactly one row per distinct name
# together with its row count, so the result can be used directly; joining it
# back onto every row of df and then de-duplicating is unnecessary work.
(
    df.group_by("column_1")
    .count()
    # Put the count first to mirror `uniq -c` style output.
    .select(pl.col("count"), pl.col("column_1"))
    # Highest count first; names with equal counts are ordered by name, descending.
    .sort(by=["count", "column_1"], descending=[True, True])
)

count,column_1
u32,str
118,"""James"""
111,"""William"""
108,"""Robert"""
108,"""John"""
92,"""Mary"""
75,"""Charles"""
74,"""Michael"""
73,"""Elizabeth"""
70,"""Joseph"""
60,"""Margaret"""
