In [45]:
import numpy as np
import pandas as pd

In [46]:
# Indexes
# Files with an "implicit" index column
data = "A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5"

print(data)

A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [47]:
with open('foo.csv', 'w') as f:
    f.write(data)

In [48]:
pd.read_csv('foo.csv')

Unnamed: 0,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [49]:
# data를 parsing하기 위해
df = pd.read_csv('foo.csv', parse_dates=True)
df.index

DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', freq=None)

In [50]:
# Reading an index with a MultiIndex
data = 'year,indiv,zit,xit\n1977,"A",1.2,.6\n1977,"B",1.5,.5'
print(data)

year,indiv,zit,xit
1977,"A",1.2,.6
1977,"B",1.5,.5


In [51]:
with open('multi_ex.csv', 'w') as f:
    f.write(data)

In [52]:
df = pd.read_csv('multi_ex.csv', index_col=[0, 1])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,zit,xit
year,indiv,Unnamed: 2_level_1,Unnamed: 3_level_1
1977,A,1.2,0.6
1977,B,1.5,0.5


In [53]:
df.loc[1977]

Unnamed: 0_level_0,zit,xit
indiv,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1.2,0.6
B,1.5,0.5


In [54]:
# Reading columns with Multindex
from pandas._testing import makeCustomDataframe as mkdf
df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
df.to_csv('mi.csv')
print(open('mi.csv').read())

C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2



In [55]:
pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1])

Unnamed: 0_level_0,C0,C_l0_g0,C_l0_g1,C_l0_g2
Unnamed: 0_level_1,C1,C_l1_g0,C_l1_g1,C_l1_g2
Unnamed: 0_level_2,C2,C_l2_g0,C_l2_g1,C_l2_g2
Unnamed: 0_level_3,C3,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2


In [56]:
data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12"

print(data)

,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


In [57]:
with open('mi2.csv', 'w') as f:
    f.write(data)

In [58]:
df = pd.read_csv('mi2.csv', header=[0, 1], index_col=0)
df

Unnamed: 0_level_0,a,a,a,b,c,c
Unnamed: 0_level_1,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


In [59]:
pd.read_csv('mi2.csv', header=[0, 1], index_col=0).columns

MultiIndex([('a', 'q'),
            ('a', 'r'),
            ('a', 's'),
            ('b', 't'),
            ('c', 'u'),
            ('c', 'v')],
           )

In [60]:
pd.read_csv('mi2.csv', header=[0, 1], index_col=0).index

Index(['one', 'two'], dtype='object')

In [61]:
df.to_csv('mi2_1.csv', index=False)

In [62]:
# Automatically 'sniffing' the delimiter
df = pd.DataFrame(np.random.randn(10,4))
df.to_csv('temp1.csv', sep=' ')
df.to_csv('temp2.csv', sep='/')

In [63]:
# sep=None으로 명시하면, read_csv()가 delimiter를 추정한다.
# 이는 실제 csv file이 delimiter로 무엇을 쓰고 있는지 와는 상관없다.   😵‍💫 
pd.read_csv('temp2.csv', sep=None, engine='python')

Unnamed: 0.1,Unnamed: 0,0,1,2,3
0,0,-0.936759,-1.035322,0.350667,-1.916628
1,1,-2.138034,-0.944415,-1.614262,-0.827492
2,2,-0.164965,1.554801,0.798699,0.020841
3,3,-0.000243,1.752495,-1.016497,-0.194266
4,4,-0.200088,0.520354,0.644823,0.943044
5,5,-0.680628,-0.098918,-1.252221,0.449551
6,6,0.008098,0.942639,-1.157703,-0.476416
7,7,1.214201,-0.945534,1.122962,-0.29227
8,8,0.605066,-0.597052,0.230812,-0.452598
9,9,-1.730847,-0.338147,-0.145,-0.108395


In [64]:
# Reading multiple files to create a single DataFrame (use concat())

In [65]:
# Iterating through files chunk by chunk
with pd.read_csv('temp1.csv', sep=" ", chunksize=4) as reader:
    reader
    for chunk in reader:
        print(chunk)

# "sep=  "을 명시해야한다.

   Unnamed: 0         0         1         2         3
0           0 -0.936759 -1.035322  0.350667 -1.916628
1           1 -2.138034 -0.944415 -1.614262 -0.827492
2           2 -0.164965  1.554801  0.798699  0.020841
3           3 -0.000243  1.752495 -1.016497 -0.194266
   Unnamed: 0         0         1         2         3
4           4 -0.200088  0.520354  0.644823  0.943044
5           5 -0.680628 -0.098918 -1.252221  0.449551
6           6  0.008098  0.942639 -1.157703 -0.476416
7           7  1.214201 -0.945534  1.122962 -0.292270
   Unnamed: 0         0         1         2         3
8           8  0.605066 -0.597052  0.230812 -0.452598
9           9 -1.730847 -0.338147 -0.145000 -0.108395


In [72]:
with pd.read_csv('temp2.csv', sep='/', iterator=True) as reader:
    # reader.get_chunk(5)
    for chunk in reader:
        print(chunk)


   Unnamed: 0         0         1         2         3
0           0 -0.936759 -1.035322  0.350667 -1.916628
1           1 -2.138034 -0.944415 -1.614262 -0.827492
2           2 -0.164965  1.554801  0.798699  0.020841
3           3 -0.000243  1.752495 -1.016497 -0.194266
4           4 -0.200088  0.520354  0.644823  0.943044
5           5 -0.680628 -0.098918 -1.252221  0.449551
6           6  0.008098  0.942639 -1.157703 -0.476416
7           7  1.214201 -0.945534  1.122962 -0.292270
8           8  0.605066 -0.597052  0.230812 -0.452598
9           9 -1.730847 -0.338147 -0.145000 -0.108395


In [78]:
df = pd.read_csv("cu.item", sep="\t")
df.head(20)

Unnamed: 0,item_code,item_name,display_level,selectable,sort_sequence
0,AA0,All items - old base,0,T,2
1,AA0R,Purchasing power of the consumer dollar - old ...,0,T,400
2,SA0,All items,0,T,1
3,SA0E,Energy,1,T,375
4,SA0L1,All items less food,1,T,359
5,SA0L12,All items less food and shelter,1,T,362
6,SA0L12E,"All items less food, shelter, and energy",1,T,363
7,SA0L12E4,"All items less food, shelter, energy, and used...",1,T,364
8,SA0L1E,All items less food and energy,1,T,360
9,SA0L2,All items less shelter,1,T,361


In [80]:
headers = {"User-Agent":'pandas'}
df = pd.read_csv('cu.item', sep='\t', storage_options=headers)

ValueError: storage_options passed with file object or non-fsspec file path

In [81]:
df = pd.read_json("s3://pandas-test/adatafile.json")

FileNotFoundError: The specified bucket does not exist

In [84]:
pd.read_csv(
    "s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013"
    "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
    storage_options={"anon": True},
).to_csv("T085643.csv")

In [None]:
# Writing out data
# Writing to CSV format
# https://pandas.pydata.org/docs/user_guide/io.html#writing-out-data

# Writing a formatted string
# https://pandas.pydata.org/docs/user_guide/io.html#writing-a-formatted-string