In [None]:
## Parsing functions in pandas
'''

functions are in ppt.
see Table 6-1. Parsing functions in pandas

'''
import pandas as pd
import numpy as np

# Location of the small sample CSV that the following cells display and parse.
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
rout = "/Users/jasonjr/Documents/Python/ex1.csv"

In [None]:
# 先製作一個csv檔來使用，內容長這樣
!cat /Users/jasonjr/Documents/Python/ex1.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [None]:
# Read the sample CSV with default settings: pandas takes the first line of the
# file as the column header. The shared `rout` path is used instead of repeating
# the literal.
df = pd.read_csv(rout)
df

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [None]:
# read_table gives the same frame once the comma is passed as the separator.
pd.read_table(rout, sep=',')

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [None]:
# The file has no header row: header=None makes pandas number the columns itself
# instead of consuming the first data line as column names.
pd.read_csv(rout, header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [None]:
# Supply the column names explicitly.
pd.read_csv(rout, names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [None]:
# Custom column names; the 'message' column is used as the row index.

names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv(rout, names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [None]:
# Show the contents of the sample file csv_mindex.csv.
!cat examples/csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [None]:
# Skip rows 0, 2 and 3 of the file while reading.
pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [None]:
# Note-only cell. The Chinese line in the string below reads:
# "Commonly used reading functions."
'''

常用的讀取function.
Table 6-2. Some read_csv/read_table function arguments

'''

In [None]:
# Reading Text Files in Pieces
# NOTE(review): the example below is commented out, so this cell runs nothing.
# result = pd.read_csv('examples/ex6.csv')
# result

In [None]:
## Writing Data to Text Format
# Load the sample frame that the following cells write back out.
data = pd.read_csv('examples/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [None]:
# Write the frame out to a CSV file, then print the file to check the result.
data.to_csv('examples/out.csv')
!cat examples/out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [None]:
# Send the CSV text to standard output instead of a file, with '|' as the delimiter.
import sys
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [None]:
# Write missing values as the string NULL instead of leaving the field empty.
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [None]:
# Leave out both the row labels and the header line.
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [None]:
# Series 同樣也可以用to_csv的方法來進行存檔
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('examples/tseries.csv')
!cat examples/tseries.csv

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


In [None]:
## Working with Delimited Formats
# The data you receive is not always tidy; sometimes you have to fall back on
# other methods to get it into shape.
!cat examples/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"


In [None]:
import csv

# Print every parsed row of the quoted sample file.
# The context manager closes the handle once the loop finishes (the bare open()
# left it open), and newline='' is how the csv module expects its input file to
# be opened so that line endings inside quoted fields are handled by the parser.
with open('examples/ex7.csv', newline='') as f:
    reader = csv.reader(f)
    for line in reader:
        print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [None]:
# p.177
# Read every row, then separate the header line from the data rows.
# newline='' is how the csv module expects its input file to be opened.
with open('examples/ex7.csv', newline='') as f:
    lines = list(csv.reader(f))
    header, values = lines[0], lines[1:]


In [None]:
# Then reshape the rows into the format you want: zip(*values) transposes the
# data rows into columns, and each column is paired with its header name.
data_dict = dict(zip(header, zip(*values)))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [None]:
## JSON Data
# Below is a sample JSON document.

import json
obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
                  {"name": "Katie", "age": 38,
                   "pets": ["Sixes", "Stache", "Cisco"]}]
} """

In [None]:
# Parse the JSON text into Python objects.
result = json.loads(obj)
result

{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 30, 'name': 'Scott', 'pets': ['Zeus', 'Zuko']},
  {'age': 38, 'name': 'Katie', 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [None]:
# json.dumps converts a Python object back into JSON text.
asjson = json.dumps(result)

In [None]:
# Build a frame from the list of sibling records, keeping the name and age columns.
siblings = pd.DataFrame(
    data=result['siblings'],
    columns=['name', 'age'],
)
siblings

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


In [None]:
# Prepare a JSON file; its contents are printed here.
!cat examples/example.json

[{"a": 1, "b": 2, "c": 3},
{"a": 4, "b": 5, "c": 6},
{"a": 7, "b": 8, "c": 9}]


In [None]:
# pandas has a function for reading JSON files; use it to load the file.
data = pd.read_json('examples/example.json')
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
# Serialize the frame back to JSON text; with no arguments the values are
# grouped by column name, then by row label.
print(data.to_json())

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}


In [None]:
# orient='records' writes one object per row, keyed by the column names a, b, c.
print(data.to_json(orient='records'))

[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


In [None]:
# Closing note. The Chinese text in the string below reads:
# "Next up is web scraping; over to Master Benson."
'''

接下來是爬蟲，就交給班森大師拉

'''