# 第2章: UNIXコマンド

## 10. 行数のカウント

In [1]:
cnt = 0
with open("../data/popular-names.txt") as fr:
    for line in fr:
        cnt += 1
print(cnt)

2780


In [2]:
!wc "../data/popular-names.txt"

    2780   11120   55026 ../data/popular-names.txt


## 11. タブをスペースに置換

In [3]:
def head(text, n=5):
    text = text.split('\n')[:n]
    return '\n'.join(text)

In [4]:
with open("../data/popular-names.txt") as fr:
    data = fr.read()
    data = data.replace('\t', ' ')
print(head(data))

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880


In [5]:
!cat "../data/popular-names.txt" | tr "\t" ' ' | head -n 5

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880


## 12. 1列目をcol1.txtに，2列目をcol2.txtに保存

In [6]:
cnt = 0
with open("../data/popular-names.txt") as fr,\
     open("../data/col1.txt", "w") as fw1,\
     open("../data/col2.txt", "w") as fw2:
    for line in fr:
        line = line.rstrip().split('\t')
        print(line[0], file=fw1)
        print(line[1], file=fw2)  

In [7]:
!cat '../data/col1.txt' | head -n 5

Mary
Anna
Emma
Elizabeth
Minnie


In [8]:
!cut -f 1 -d $'\t' "../data/popular-names.txt" | head -n 5

Mary
Anna
Emma
Elizabeth
Minnie


In [9]:
!cat '../data/col2.txt' | head -n 5

F
F
F
F
F


In [10]:
!cut -f 2 -d $'\t' "../data/popular-names.txt" | head -n 5

F
F
F
F
F


## 13. col1.txtとcol2.txtをマージ

In [11]:
with open("../data/col1.txt") as fr1,\
     open("../data/col2.txt") as fr2,\
     open("../data/merge.txt", "w") as fw:
    for l1, l2 in zip(fr1, fr2):
        l1, l2 = l1.rstrip(), l2.rstrip()
        print("\t".join([l1, l2]), file=fw)

In [12]:
!cat '../data/merge.txt' | head -n 5

Mary	F
Anna	F
Emma	F
Elizabeth	F
Minnie	F


In [13]:
!paste '../data/col1.txt' '../data/col2.txt' | head -n 5

Mary	F
Anna	F
Emma	F
Elizabeth	F
Minnie	F


## 14. 先頭からN行を出力

In [14]:
def head(text, n=5):
    text = text.split('\n')[:n]
    return '\n'.join(text)

with open('../data/popular-names.txt') as fr:
    data = fr.read()
    print(head(data))

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880
Elizabeth	F	1939	1880
Minnie	F	1746	1880


In [15]:
!head -n 5 '../data/popular-names.txt'

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880
Elizabeth	F	1939	1880
Minnie	F	1746	1880


## 15. 末尾のN行を出力

In [16]:
def tail(text, n=5):
    text = text[:-1] if text[-1] == '\n' else text
    text = text.split('\n')[-n:]
    return '\n'.join(text)

with open('../data/popular-names.txt') as fr:
    data = fr.read()
    print(tail(data))

Benjamin	M	13381	2018
Elijah	M	12886	2018
Lucas	M	12585	2018
Mason	M	12435	2018
Logan	M	12352	2018


In [17]:
!tail -n 5 '../data/popular-names.txt'

Benjamin	M	13381	2018
Elijah	M	12886	2018
Lucas	M	12585	2018
Mason	M	12435	2018
Logan	M	12352	2018


## 16. ファイルをN分割する

In [18]:
def n_split(text, n=2):
    text = text[:-1] if text[-1] == '\n' else text
    text = text.split('\n')
    split_num = (len(text) // n) + 1
    for i in range(n):
        yield '\n'.join(text[:split_num])
        text = text[split_num:]

with open('../data/popular-names.txt') as fr:
    data = fr.read()
    splited = list(n_split(data))
    for s in splited:
        print(head(s))
        print("="*30)

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880
Elizabeth	F	1939	1880
Minnie	F	1746	1880
Robert	M	83872	1949
John	M	81161	1949
William	M	61501	1949
Michael	M	60046	1949
David	M	59601	1949


In [19]:
# cannot `split -n` in MacOS
!split -n 2 '../data/popular-names.txt'

split: illegal option -- n
usage: split [-a sufflen] [-b byte_count] [-l line_count] [-p pattern]
             [file [prefix]]


## 17. １列目の文字列の異なり

In [20]:
with open('../data/col1.txt') as fr:
    data = [line.rstrip() for line in fr]
    data = '\n'.join(sorted(list(set(data))))
    print(head(data))

Abigail
Aiden
Alexander
Alexis
Alice


In [21]:
!cut -f 1 -d $'\t' "../data/popular-names.txt" | sort | uniq | head -n 5

Abigail
Aiden
Alexander
Alexis
Alice


## 18. 各行を3コラム目の数値の降順にソート

In [22]:
test = [[1, 2, 3], [3, 4, 5], [5, 1, 1]]
print(sorted(test, key=lambda x:x[0]))

[[1, 2, 3], [3, 4, 5], [5, 1, 1]]


In [23]:
from operator import itemgetter

# col2 - sorted as `String`
def sort_text(text, col=2):
    text = text[:-1] if text[-1] == '\n' else text
    text = [l.split('\t') for l in text.split('\n')]
    text = sorted(text, key=itemgetter(col), reverse=True)
    text = ['\t'.join(l) for l in text]
    return '\n'.join(text)

with open('../data/popular-names.txt') as fr:
    data = fr.read()
    print(head(sort_text(data)))

Linda	F	99689	1947
James	M	9951	1911
Mildred	F	9921	1913
Mary	F	9889	1886
Mary	F	9888	1887


In [24]:
# col2 - sorted as `Integer`
!sort -r -n -k 3,3 '../data/popular-names.txt' | head -n 5

Linda	F	99689	1947
Linda	F	96211	1948
James	M	94757	1947
Michael	M	92704	1957
Robert	M	91640	1947
sort: Broken pipe


## 19. 各行の1コラム目の文字列の出現頻度を求め，出現頻度の高い順に並べる

In [25]:
import collections
with open('../data/popular-names.txt') as fr:
    data = [line.rstrip().split('\t')[0] for line in fr]
    print(collections.Counter(data))

Counter({'James': 118, 'William': 111, 'John': 108, 'Robert': 108, 'Mary': 92, 'Charles': 75, 'Michael': 74, 'Elizabeth': 73, 'Joseph': 70, 'Margaret': 60, 'George': 58, 'Thomas': 58, 'David': 57, 'Richard': 51, 'Helen': 45, 'Frank': 43, 'Christopher': 43, 'Anna': 41, 'Edward': 40, 'Ruth': 39, 'Patricia': 38, 'Matthew': 37, 'Dorothy': 36, 'Emma': 35, 'Barbara': 32, 'Daniel': 31, 'Joshua': 31, 'Sarah': 26, 'Linda': 26, 'Jennifer': 26, 'Emily': 26, 'Jessica': 25, 'Jacob': 25, 'Mildred': 24, 'Betty': 24, 'Susan': 24, 'Henry': 23, 'Ashley': 23, 'Nancy': 22, 'Andrew': 21, 'Florence': 20, 'Marie': 20, 'Donald': 20, 'Amanda': 20, 'Samantha': 19, 'Karen': 18, 'Lisa': 18, 'Melissa': 18, 'Madison': 18, 'Olivia': 18, 'Stephanie': 17, 'Abigail': 17, 'Ethel': 16, 'Sandra': 16, 'Mark': 16, 'Frances': 15, 'Carol': 15, 'Angela': 15, 'Michelle': 15, 'Heather': 15, 'Ethan': 15, 'Isabella': 15, 'Shirley': 14, 'Kimberly': 14, 'Amy': 14, 'Ava': 14, 'Virginia': 13, 'Deborah': 13, 'Brian': 13, 'Jason': 13, '

In [26]:
!cut -f 1 -d $'\t' "../data/popular-names.txt" | sort | uniq -c | sort -nr | head -n 5

 118 James
 111 William
 108 Robert
 108 John
  92 Mary
