In [1]:
# numpy is bound to `np` (not `pd`) so the pandas import below does not shadow it.
import numpy as np
import pandas as pd

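* Whitespace-delimited data: pass `sep='\s+'` so that any run of whitespace (e.g. ' ' or '\t') is treated as a single delimiter.
* `index_col=False`: forces pandas not to use the first column as the index.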

In [2]:
from io import StringIO

import pandas as pd

# Use an in-memory string in place of a file.
data = (
    "col1,col2,col3\n"
    "a,b,1\n"
    "a,b,2\n"
    "c,d,3"
)

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [3]:
# usecols may be a callable: it is given each column name and keeps those for which it returns True.
pd.read_csv(StringIO(data), usecols=lambda name: name.upper() in {"COL1", "COL3"})

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [12]:
# skiprows is called with the 0-based line number in the input, not a DataFrame index:
# the header line is 0 and the data lines are 1, 2, 3, so the odd-numbered lines are dropped.
pd.read_csv(StringIO(data), skiprows=lambda line_no: line_no % 2 == 1)

Unnamed: 0,col1,col2,col3
0,a,b,2


In [13]:
# Specifying column data types
import numpy as np

# Note: the last row has only three fields, so it carries no value for column d.
data = (
    "a,b,c,d\n"
    "1,2,3,4\n"
    "5,6,7,8\n"
    "9,10,11"
)

print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [14]:
# dtype=object reads every column as generic Python objects instead of inferring numeric types.
df = pd.read_csv(StringIO(data), dtype=object)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [15]:
# Look up row label 0 of column "a" in one step; the value is the string '1', not the integer 1.
df.loc[0, "a"]

'1'

In [17]:
# Per-column dtypes: columns not listed (here "a") are still inferred.
# "Int64" (capital I) is pandas' nullable integer dtype, so column d can hold the missing last value.
df = pd.read_csv(
    StringIO(data),
    dtype={"b": object, "c": np.float64, "d": "Int64"},
)
df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [18]:
# A column of mixed-looking values; the str converter is applied to col_1 so every entry stays a string.
data = (
    "col_1\n"
    "1\n"
    "2\n"
    "'A'\n"
    "4.22"
)
df = pd.read_csv(StringIO(data), converters={"col_1": str})
df

Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [19]:
# Count the Python type of each element to confirm that all of them are str.
df["col_1"].apply(type).value_counts()

col_1
<class 'str'>    4
Name: count, dtype: int64

In [24]:
# Read with default inference, then coerce col_1 to numbers;
# entries that cannot be parsed as a number (here 'A') become NaN.
df2 = pd.read_csv(StringIO(data)).assign(
    col_1=lambda frame: pd.to_numeric(frame["col_1"], errors="coerce")
)
df2

Unnamed: 0,col_1
0,1.0
1,2.0
2,
3,4.22


In [25]:
# Each row ends with a newline ("\n" is the line delimiter); fields within a row are comma-separated.
data = (
    "a,b,c,d,e,f,g,h,i,j\n"
    "1,2.5,True,a,,,,,12-31-2019,\n"
    "3,4.5,False,b,6,7.5,True,a,12-31-2019,\n"
)
df = pd.read_csv(
    StringIO(data),
    dtype_backend="numpy_nullable",
    parse_dates=["i"],
)
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1,2.5,True,a,,,,,2019-12-31,
1,3,4.5,False,b,6.0,7.5,True,a,2019-12-31,


In [26]:
# With dtype_backend="numpy_nullable" most columns come back as nullable extension dtypes
# (Int64, Float64, boolean, string); column i was parsed as a datetime.
df.dtypes

a             Int64
b           Float64
c           boolean
d    string[python]
e             Int64
f           Float64
g           boolean
h    string[python]
i    datetime64[ns]
j             Int64
dtype: object

In [27]:
# Sample data reused by the categorical examples that follow.
data = (
    "col1,col2,col3\n"
    "a,b,1\n"
    "a,b,2\n"
    "c,d,3"
)

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [28]:
# Default inference: the text columns are read as object and col3 as int64.
pd.read_csv(StringIO(data)).dtypes

col1    object
col2    object
col3     int64
dtype: object

In [29]:
# Passing dtype="category" parses every column as categorical.
pd.read_csv(StringIO(data), dtype="category").dtypes

col1    category
col2    category
col3    category
dtype: object

In [30]:
# Unordered Categorical: only col1 is converted; the other columns keep their inferred dtypes.
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes


col1    category
col2      object
col3       int64
dtype: object

In [31]:
# Building the CategoricalDtype up front lets us fix the category set and give it an order.
from pandas.api.types import CategoricalDtype

dtype = CategoricalDtype(categories=["d", "c", "b", "a"], ordered=True)
pd.read_csv(StringIO(data), dtype={"col1": dtype}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [35]:
# Values that are not among the declared categories are treated as missing,
# so 'c' (absent from ['a', 'b', 'd']) is read as NaN.
dtype = CategoricalDtype(categories=["a", "b", "d"])
pd.read_csv(StringIO(data), dtype={"col1": dtype})["col1"]

0      a
1      a
2    NaN
Name: col1, dtype: category
Categories (3, object): ['a', 'b', 'd']

In [36]:
# Re-read the data with every column as categorical and keep the frame in df.
df = pd.read_csv(StringIO(data), dtype='category')
df.dtypes

col1    category
col2    category
col3    category
dtype: object

In [37]:
# Show col3: its categories are the strings '1', '2', '3', not numbers.
df.col3

0    1
1    2
2    3
Name: col3, dtype: category
Categories (3, object): ['1', '2', '3']

In [38]:
# Convert the category labels themselves (the strings '1', '2', '3') to numeric values.
new_categories = pd.to_numeric(df["col3"].cat.categories)
# Assign through df["col3"] rather than df.col3: attribute assignment only updates a column
# that already exists and otherwise sets a plain Python attribute instead of creating a column.
df["col3"] = df["col3"].cat.rename_categories(new_categories)

df["col3"]

0    1
1    2
2    3
Name: col3, dtype: category
Categories (3, int64): [1, 2, 3]

In [39]:
# Inspect the categories of col3; they are held in an integer Index.
df.col3.cat.categories

Index([1, 2, 3], dtype='int64')

In [40]:
# Naming and using columns
data = (
    "a,b,c\n"
    "1,2,3\n"
    "4,5,6\n"
    "7,8,9"
)
print(data)

a,b,c
1,2,3
4,5,6
7,8,9


In [41]:
# With default settings the first line of the data supplies the column names.
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9
