In [1]:
import numpy as np

In [2]:
import statsmodels.api as sm

In [3]:
import statsmodels.formula.api as smf

In [4]:
# Load the Titanic table from the R "datasets" package through statsmodels'
# get_rdataset helper; the returned object keeps the table in its .data attribute.
data = sm.datasets.get_rdataset(dataname="Titanic", package="datasets")
df = data.data

# Look at the last five rows (Class / Sex / Age / Survived / Freq).
df.tail(n=5)

Unnamed: 0,Class,Sex,Age,Survived,Freq
27,Crew,Male,Adult,Yes,192
28,1st,Female,Adult,Yes,140
29,2nd,Female,Adult,Yes,80
30,3rd,Female,Adult,Yes,76
31,Crew,Female,Adult,Yes,20


In [38]:
from statsmodels.datasets import macrodata, co2

In [13]:
import pandas as pd

In [8]:
# List every name the co2 dataset module exposes, to find its loader
# functions (the listing includes `load` and `load_pandas`).
dir(co2)

['COPYRIGHT',
 'DESCRLONG',
 'DESCRSHORT',
 'NOTE',
 'SOURCE',
 'TITLE',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'abspath',
 'data',
 'dirname',
 'du',
 'load',
 'load_pandas',
 'np',
 'pd']

## 데이터 읽기

In [10]:
# Look up the documentation of load_pandas; the help text only shows the
# bare signature, so the loader has to be explored by hand.
help(co2.load_pandas)

Help on function load_pandas in module statsmodels.datasets.co2.data:

load_pandas()



In [64]:
# Load the CO2 dataset and keep only its raw record table.
co2_dataset = co2.load()
data = co2_dataset.data

In [65]:
# Check which container type the loader returned.
type(data)

numpy.recarray

## 다차원 배열의 데이터 값을 변환

In [66]:
# Inspect the raw date field: the values are 8-character byte strings
# (dtype S8) such as b'19580329', not real dates.
data['date']

array([b'19580329', b'19580405', b'19580412', ..., b'20011215',
       b'20011222', b'20011229'], dtype='|S8')

In [67]:
import numpy as np

In [72]:
# Read the datetime64 documentation before trying to convert the date field.
help(np.datetime64)

Help on class datetime64 in module numpy:

class datetime64(generic)
 |  Base class for numpy scalar types.
 |  
 |  Class from which most (all?) numpy scalar types are derived.  For
 |  consistency, exposes the same API as `ndarray`, despite many
 |  consequent attributes being either "get-only," or completely irrelevant.
 |  This is the class from which it is strongly suggested users should derive
 |  custom scalar types.
 |  
 |  Method resolution order:
 |      datetime64
 |      generic
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __hash__(self, /)
 |      Return hash(self).
 |  
 |  __le__(self, value, /)
 |      Return self<=value.
 |  
 |  __lt__(self, value, /)
 |      Return self<value.
 |  
 |  __ne__(self, value, /)
 |      Return self!=value.
 |  
 |  __repr__(self, /)
 |      R

In [70]:
# First attempt: pass the whole byte-string date field to np.datetime64.
# datetime64 is a NumPy scalar type, and this call is rejected with a
# ValueError. The error is caught and printed so the failed attempt stays
# visible without aborting a top-to-bottom run of the notebook.
try:
    foo_d = np.datetime64(data['date'])
except ValueError as err:
    print("ValueError:", err)

ValueError: Could not convert object to NumPy datetime

In [52]:
# Second attempt: cast the byte strings straight to datetime64.
# This does not raise, but the result is wrong: the resulting dtype is
# datetime64[Y], i.e. each 8-digit string (e.g. 19580329) is taken as a
# year number rather than as a YYYYMMDD date.
foo = data['date'].astype('datetime64', copy=False)

In [53]:
# Display the cast result; note the dtype reported for the array.
foo

array(['19580329', '19580405', '19580412', ..., '20011215', '20011222',
       '20011229'], dtype='datetime64[Y]')

In [56]:
# Wrapping the mis-parsed array in a DataFrame fails: the "year 19580329"
# values cannot be represented as nanosecond timestamps, so pandas raises
# OutOfBoundsDatetime (a ValueError subclass). Catch and print the error so
# the demonstration stays visible without stopping a full notebook run.
try:
    foo_head = pd.DataFrame(foo).head()
except ValueError as err:
    foo_head = None
    print(type(err).__name__ + ":", err)

# Shows the preview if the conversion succeeded; shows nothing when it failed.
foo_head

OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 19580329-01-01 00:00:00

## 1. co2 데이터 로딩 해결방법 1

In [42]:
# Approach 1: take the raw record table from the loader and build the
# DataFrame from its records.
co2_records = co2.load().data
df_co2 = pd.DataFrame.from_records(co2_records)

In [43]:
# Preview the first rows: the date column still holds byte strings (b'...').
df_co2.head()

Unnamed: 0,date,co2
0,b'19580329',316.1
1,b'19580405',317.3
2,b'19580412',317.6
3,b'19580419',317.5
4,b'19580426',316.4


### 함수 표현식으로 바이트 문자열을 유니코드 문자열로 변경

In [44]:
# Decode each byte string in the date column to a regular (unicode) str with
# a lambda. The isinstance guard leaves values that are no longer bytes
# untouched, so re-running this cell after it has already been applied does
# not fail with "'str' object has no attribute 'decode'".
df_co2['date'] = df_co2['date'].apply(
    lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
)

In [45]:
# Confirm the date column now shows plain strings without the b'' prefix.
df_co2.head()

Unnamed: 0,date,co2
0,19580329,316.1
1,19580405,317.3
2,19580412,317.6
3,19580419,317.5
4,19580426,316.4


### 유니코드 문자열을 날짜로 변환하기

In [46]:
# Turn the YYYYMMDD strings into real datetimes, spelling out the format
# so nothing is left to inference.
df_co2['date'] = pd.to_datetime(df_co2['date'], format='%Y%m%d')

In [48]:
# Confirm the date column is now displayed as dates (e.g. 1958-03-29).
df_co2.head()

Unnamed: 0,date,co2
0,1958-03-29,316.1
1,1958-04-05,317.3
2,1958-04-12,317.6
3,1958-04-19,317.5
4,1958-04-26,316.4


In [49]:
# Promote the parsed dates to the index; set_index returns a new frame,
# so df_co2 itself is left as it was.
df_co = df_co2.set_index(keys='date')

In [51]:
# Verify that date is now the index and co2 is the only remaining column.
df_co.head()

Unnamed: 0_level_0,co2
date,Unnamed: 1_level_1
1958-03-29,316.1
1958-04-05,317.3
1958-04-12,317.6
1958-04-19,317.5
1958-04-26,316.4


## 2. co2 데이터 로딩 해결방법 2

In [19]:
# Approach 2 starts from the same raw record table returned by the loader.
co2_loaded = co2.load()
dta = co2_loaded.data

In [20]:
# Inspect the raw records: two fields, 'date' (S8 byte strings) and 'co2' (float).
dta

rec.array([(b'19580329', 316.1), (b'19580405', 317.3),
           (b'19580412', 317.6), ..., (b'20011215', 371.2),
           (b'20011222', 371.3), (b'20011229', 371.5)],
          dtype=[('date', 'S8'), ('co2', '<f8')])

### 넘파이 배열을 바로 판다스 데이터프레임으로 변경

In [21]:
# Hand the structured array straight to the DataFrame constructor; its
# field names become the column names.
df = pd.DataFrame(data=dta)

In [22]:
# Preview the first rows: the date column still holds byte strings (b'...').
df.head()

Unnamed: 0,date,co2
0,b'19580329',316.1
1,b'19580405',317.3
2,b'19580412',317.6
3,b'19580419',317.5
4,b'19580426',316.4


### 바이트 문자열을 유니코드 문자열로 변경

In [23]:
# Decode the byte strings to unicode with the vectorised .str accessor
# instead of a per-element function.
df['date'] = df['date'].str.decode('utf-8')

In [24]:
# Confirm the date column now shows plain strings without the b'' prefix.
df.head()

Unnamed: 0,date,co2
0,19580329,316.1
1,19580405,317.3
2,19580412,317.6
3,19580419,317.5
4,19580426,316.4


### 문자열을 날짜로 변환

In [28]:
# Parse the YYYYMMDD strings into datetimes. The format is given explicitly
# (matching the conversion used for df_co2) so the result does not depend on
# pandas guessing how to read an 8-digit string.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

In [29]:
# Confirm the first rows now show real dates (starting 1958-03-29).
df.head()

Unnamed: 0,date,co2
0,1958-03-29,316.1
1,1958-04-05,317.3
2,1958-04-12,317.6
3,1958-04-19,317.5
4,1958-04-26,316.4


In [30]:
# Check the end of the series as well: the last observation is 2001-12-29.
df.tail()

Unnamed: 0,date,co2
2279,2001-12-01,370.3
2280,2001-12-08,370.8
2281,2001-12-15,371.2
2282,2001-12-22,371.3
2283,2001-12-29,371.5
