# pandas概览
- 主要数据结构是 Series（一维数据）与 DataFrame（二维数据）
- 可以处理缺失值，表示为NaN
- 大小可变：插入或删除 DataFrame 等多维对象的列
- 成熟的 IO 工具：读取文本文件（CSV 等支持分隔符的文件）、Excel 文件、数据库等来源的数据，利用超快的 HDF5 格式保存 / 加载数据

In [1]:
import pandas as pd
import numpy as np

# 生成对象

In [2]:
# 用值列表生成 Series 时，Pandas 默认自动生成整数索引
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# 用含日期时间索引与标签的 NumPy 数组生成 DataFrame
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.733128,-1.738729,0.381403,1.135077
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881
2013-01-03,0.458939,0.524491,0.016088,2.222337
2013-01-04,-0.019724,0.827771,0.522569,-0.534118
2013-01-05,-1.547823,1.091031,0.359625,0.056397
2013-01-06,0.573844,-0.115541,0.365709,0.154545


In [5]:
#用 Series 字典对象生成 DataFrame:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
# DataFrame 的列有不同数据类型
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# 查看数据

In [7]:
# 查看头部数据
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-1.733128,-1.738729,0.381403,1.135077
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881
2013-01-03,0.458939,0.524491,0.016088,2.222337
2013-01-04,-0.019724,0.827771,0.522569,-0.534118
2013-01-05,-1.547823,1.091031,0.359625,0.056397


In [8]:
# 查看尾部数据
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.019724,0.827771,0.522569,-0.534118
2013-01-05,-1.547823,1.091031,0.359625,0.056397
2013-01-06,0.573844,-0.115541,0.365709,0.154545


In [9]:
# 显示索引与列名：
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

- DataFrame.to_numpy 输出底层数据的 NumPy 对象
- DataFrame 的列由多种数据类型组成时，该操作耗费系统资源较大：to_numpy() 必须找到一个能容纳所有列的公共数据类型（最终可能是 object），并据此转换全部数据。这也是 Pandas 和 NumPy 的本质区别：NumPy 数组整体只有一种数据类型，而 DataFrame 每列可以有各自的数据类型
- DataFrame.to_numpy() 的输出不包含行索引和列标签。

In [11]:
# df 这个 DataFrame 里的值都是浮点数，DataFrame.to_numpy() 的操作会很快，而且不复制数据
df.to_numpy()

array([[-1.73312787, -1.73872872,  0.38140298,  1.13507655],
       [-0.50704365,  0.85100401, -1.42630409, -0.35988092],
       [ 0.45893942,  0.52449131,  0.01608806,  2.22233719],
       [-0.0197243 ,  0.82777101,  0.52256857, -0.53411814],
       [-1.54782273,  1.09103115,  0.35962465,  0.05639672],
       [ 0.57384431, -0.11554103,  0.36570903,  0.1545452 ]])

In [12]:
# df2 这个 DataFrame 包含了多种类型，DataFrame.to_numpy() 操作就会耗费较多资源
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [13]:
# describe()可以快速查看数据的统计摘要
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.462489,0.240005,0.036515,0.445726
std,0.991314,1.055061,0.735965,1.046615
min,-1.733128,-1.738729,-1.426304,-0.534118
25%,-1.287628,0.044467,0.101972,-0.255812
50%,-0.263384,0.676131,0.362667,0.105471
75%,0.339273,0.845196,0.377479,0.889944
max,0.573844,1.091031,0.522569,2.222337


In [14]:
# 转置数据
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.733128,-0.507044,0.458939,-0.019724,-1.547823,0.573844
B,-1.738729,0.851004,0.524491,0.827771,1.091031,-0.115541
C,0.381403,-1.426304,0.016088,0.522569,0.359625,0.365709
D,1.135077,-0.359881,2.222337,-0.534118,0.056397,0.154545


In [15]:
# 按轴排序
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.135077,0.381403,-1.738729,-1.733128
2013-01-02,-0.359881,-1.426304,0.851004,-0.507044
2013-01-03,2.222337,0.016088,0.524491,0.458939
2013-01-04,-0.534118,0.522569,0.827771,-0.019724
2013-01-05,0.056397,0.359625,1.091031,-1.547823
2013-01-06,0.154545,0.365709,-0.115541,0.573844


In [16]:
# 按值排序
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-01,-1.733128,-1.738729,0.381403,1.135077
2013-01-06,0.573844,-0.115541,0.365709,0.154545
2013-01-03,0.458939,0.524491,0.016088,2.222337
2013-01-04,-0.019724,0.827771,0.522569,-0.534118
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881
2013-01-05,-1.547823,1.091031,0.359625,0.056397


# 选择

In [17]:
# 选择单列，产生 Series，与 df.A 等效
df['A']

2013-01-01   -1.733128
2013-01-02   -0.507044
2013-01-03    0.458939
2013-01-04   -0.019724
2013-01-05   -1.547823
2013-01-06    0.573844
Freq: D, Name: A, dtype: float64

In [18]:
# 用 [ ] 切片行：
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.733128,-1.738729,0.381403,1.135077
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881
2013-01-03,0.458939,0.524491,0.016088,2.222337


In [19]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881
2013-01-03,0.458939,0.524491,0.016088,2.222337
2013-01-04,-0.019724,0.827771,0.522569,-0.534118


In [20]:
# 用标签提取一行数据
df.loc[dates[0]]

A   -1.733128
B   -1.738729
C    0.381403
D    1.135077
Name: 2013-01-01 00:00:00, dtype: float64

In [21]:
# 用标签选择多列数据：
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-1.733128,-1.738729
2013-01-02,-0.507044,0.851004
2013-01-03,0.458939,0.524491
2013-01-04,-0.019724,0.827771
2013-01-05,-1.547823,1.091031
2013-01-06,0.573844,-0.115541


In [22]:
# 用标签切片，包含行与列结束点：
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.507044,0.851004
2013-01-03,0.458939,0.524491
2013-01-04,-0.019724,0.827771


In [23]:
# 返回对象降维
df.loc['20130102', ['A', 'B']]

A   -0.507044
B    0.851004
Name: 2013-01-02 00:00:00, dtype: float64

In [24]:
# 提取标量值：
df.loc[dates[0], 'A']

-1.7331278685112315

In [25]:
# 快速访问标量，与上述方法等效：
df.at[dates[0], 'A']

-1.7331278685112315

## 按位置选择

In [26]:
# 用整数位置选择
df.iloc[3]

A   -0.019724
B    0.827771
C    0.522569
D   -0.534118
Name: 2013-01-04 00:00:00, dtype: float64

In [27]:
# 类似 NumPy / Python，用整数切片
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.019724,0.827771
2013-01-05,-1.547823,1.091031


In [28]:
# 类似 NumPy / Python，用整数列表按位置切片
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.507044,-1.426304
2013-01-03,0.458939,0.016088
2013-01-05,-1.547823,0.359625


In [29]:
# 显式整行切片
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881
2013-01-03,0.458939,0.524491,0.016088,2.222337


In [30]:
# 显式整列切片
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-1.738729,0.381403
2013-01-02,0.851004,-1.426304
2013-01-03,0.524491,0.016088
2013-01-04,0.827771,0.522569
2013-01-05,1.091031,0.359625
2013-01-06,-0.115541,0.365709


In [31]:
# 显式提取值
df.iloc[1, 1]

0.851004013402816

In [32]:
# 快速访问标量，与上述方法等效
df.iat[1, 1]

0.851004013402816

## 布尔索引

In [33]:
# 用单列的值选择数据
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-03,0.458939,0.524491,0.016088,2.222337
2013-01-06,0.573844,-0.115541,0.365709,0.154545


In [34]:
# 选择 DataFrame 里满足条件的值
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,0.381403,1.135077
2013-01-02,,0.851004,,
2013-01-03,0.458939,0.524491,0.016088,2.222337
2013-01-04,,0.827771,0.522569,
2013-01-05,,1.091031,0.359625,0.056397
2013-01-06,0.573844,,0.365709,0.154545


In [35]:
# 用 isin() 筛选
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.733128,-1.738729,0.381403,1.135077,one
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881,one
2013-01-03,0.458939,0.524491,0.016088,2.222337,two
2013-01-04,-0.019724,0.827771,0.522569,-0.534118,three
2013-01-05,-1.547823,1.091031,0.359625,0.056397,four
2013-01-06,0.573844,-0.115541,0.365709,0.154545,three


In [36]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.458939,0.524491,0.016088,2.222337,two
2013-01-05,-1.547823,1.091031,0.359625,0.056397,four


## 赋值

In [37]:
# 用索引自动对齐新增列的数据：
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [38]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.733128,-1.738729,0.381403,1.135077,
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881,1.0
2013-01-03,0.458939,0.524491,0.016088,2.222337,2.0
2013-01-04,-0.019724,0.827771,0.522569,-0.534118,3.0
2013-01-05,-1.547823,1.091031,0.359625,0.056397,4.0
2013-01-06,0.573844,-0.115541,0.365709,0.154545,5.0


In [39]:
# 按标签赋值
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-1.738729,0.381403,1.135077,
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881,1.0
2013-01-03,0.458939,0.524491,0.016088,2.222337,2.0
2013-01-04,-0.019724,0.827771,0.522569,-0.534118,3.0
2013-01-05,-1.547823,1.091031,0.359625,0.056397,4.0
2013-01-06,0.573844,-0.115541,0.365709,0.154545,5.0


In [40]:
# 按位置赋值
df.iat[0, 1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.381403,1.135077,
2013-01-02,-0.507044,0.851004,-1.426304,-0.359881,1.0
2013-01-03,0.458939,0.524491,0.016088,2.222337,2.0
2013-01-04,-0.019724,0.827771,0.522569,-0.534118,3.0
2013-01-05,-1.547823,1.091031,0.359625,0.056397,4.0
2013-01-06,0.573844,-0.115541,0.365709,0.154545,5.0


In [41]:
# Assign a NumPy array to a column.
# Plain column assignment replaces the column, so its dtype becomes integer and
# the frame displays 5. On pandas >= 2.0, `df.loc[:, 'D'] = ...` writes in place
# and keeps the existing float64 dtype, which would display 5.0 instead.
df['D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.381403,5,
2013-01-02,-0.507044,0.851004,-1.426304,5,1.0
2013-01-03,0.458939,0.524491,0.016088,5,2.0
2013-01-04,-0.019724,0.827771,0.522569,5,3.0
2013-01-05,-1.547823,1.091031,0.359625,5,4.0
2013-01-06,0.573844,-0.115541,0.365709,5,5.0


In [42]:
# 用 where 条件赋值：
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.381403,-5,
2013-01-02,-0.507044,-0.851004,-1.426304,-5,-1.0
2013-01-03,-0.458939,-0.524491,-0.016088,-5,-2.0
2013-01-04,-0.019724,-0.827771,-0.522569,-5,-3.0
2013-01-05,-1.547823,-1.091031,-0.359625,-5,-4.0
2013-01-06,-0.573844,-0.115541,-0.365709,-5,-5.0


# 缺失值
- Pandas 主要用 np.nan 表示缺失数据。 计算时，默认不包含空值
- 重建索引（reindex）可以更改、添加、删除指定轴的索引，并返回数据副本，即不更改原数据

In [43]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.381403,5,,1.0
2013-01-02,-0.507044,0.851004,-1.426304,5,1.0,1.0
2013-01-03,0.458939,0.524491,0.016088,5,2.0,
2013-01-04,-0.019724,0.827771,0.522569,5,3.0,


In [44]:
# 删除所有含缺失值的行：
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,-0.507044,0.851004,-1.426304,5,1.0,1.0


In [45]:
# 填充缺失值：
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.381403,5,5.0,1.0
2013-01-02,-0.507044,0.851004,-1.426304,5,1.0,1.0
2013-01-03,0.458939,0.524491,0.016088,5,2.0,5.0
2013-01-04,-0.019724,0.827771,0.522569,5,3.0,5.0


In [46]:
# 提取 nan 值的布尔掩码：
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# 统计
- 一般情况下，运算时排除缺失值。

In [47]:
df.mean()

A   -0.173634
B    0.529793
C    0.036515
D    5.000000
F    3.000000
dtype: float64

In [48]:
# 在另一个轴(即，行)上执行同样的操作
df.mean(1)

2013-01-01    1.345351
2013-01-02    0.983531
2013-01-03    1.599904
2013-01-04    1.866123
2013-01-05    1.780567
2013-01-06    2.164802
Freq: D, dtype: float64

In [49]:
# 不同维度对象运算时，要先对齐。 此外，Pandas 自动沿指定维度广播。
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [50]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.541061,-0.475509,-0.983912,4.0,1.0
2013-01-04,-3.019724,-2.172229,-2.477431,2.0,0.0
2013-01-05,-6.547823,-3.908969,-4.640375,0.0,-1.0
2013-01-06,,,,,


## Apply函数

In [51]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.381403,5,
2013-01-02,-0.507044,0.851004,-1.044901,10,1.0
2013-01-03,-0.048104,1.375495,-1.028813,15,3.0
2013-01-04,-0.067829,2.203266,-0.506244,20,6.0
2013-01-05,-1.615651,3.294297,-0.14662,25,10.0
2013-01-06,-1.041807,3.178756,0.219089,30,15.0


In [52]:
df.apply(lambda x: x.max() - x.min())

A    2.121667
B    1.206572
C    1.948873
D    0.000000
F    4.000000
dtype: float64

# 合并（Merge）
#### 结合（Concat）

In [53]:
# concat() 用于连接 Pandas 对象：
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.2333,0.13815,-1.003323,-1.922104
1,-1.147139,-0.653082,1.068974,0.795405
2,-0.817518,-2.635715,-0.442356,1.687568
3,0.398598,-0.287926,0.398225,-2.097414
4,-0.640004,0.833671,-1.067169,-0.326803
5,0.450302,0.65834,0.332202,0.097044
6,1.429483,0.668599,-0.219724,-0.469851
7,-0.852841,-0.550961,1.063397,0.875509
8,-0.382654,1.11026,-0.1047,1.332126
9,0.341171,2.276791,0.608752,-0.940232


In [54]:
# 分解为多组
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.2333,0.13815,-1.003323,-1.922104
1,-1.147139,-0.653082,1.068974,0.795405
2,-0.817518,-2.635715,-0.442356,1.687568
3,0.398598,-0.287926,0.398225,-2.097414
4,-0.640004,0.833671,-1.067169,-0.326803
5,0.450302,0.65834,0.332202,0.097044
6,1.429483,0.668599,-0.219724,-0.469851
7,-0.852841,-0.550961,1.063397,0.875509
8,-0.382654,1.11026,-0.1047,1.332126
9,0.341171,2.276791,0.608752,-0.940232


#### 连接（join）

In [55]:
# SQL 风格的合并。 
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [56]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [57]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


#### 追加（Append）

In [58]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-0.065422,0.457053,-1.624353,0.134555
1,-0.308215,-0.071855,-0.638223,0.759559
2,-1.098849,-1.276297,-0.342354,2.166478
3,0.228914,1.210185,-1.421891,-0.42401
4,0.34067,-1.174363,-1.636624,0.763863
5,-0.369826,1.031702,0.908719,0.889156
6,0.618391,-0.393494,-0.175755,-1.79853
7,1.472473,0.167461,0.65164,0.070426


In [59]:
# DataFrame.append() was removed in pandas 2.0. Turn the row Series into a
# one-row DataFrame and concatenate it; ignore_index=True renumbers the rows.
s = df.iloc[3]
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,-0.065422,0.457053,-1.624353,0.134555
1,-0.308215,-0.071855,-0.638223,0.759559
2,-1.098849,-1.276297,-0.342354,2.166478
3,0.228914,1.210185,-1.421891,-0.42401
4,0.34067,-1.174363,-1.636624,0.763863
5,-0.369826,1.031702,0.908719,0.889156
6,0.618391,-0.393494,-0.175755,-1.79853
7,1.472473,0.167461,0.65164,0.070426
8,0.228914,1.210185,-1.421891,-0.42401


# 数据输入/输出

In [60]:
# 写入 CSV 文件
df.to_csv('foo.csv')

In [61]:
pd.read_csv('foo.csv')

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,0,-0.065422,0.457053,-1.624353,0.134555
1,1,-0.308215,-0.071855,-0.638223,0.759559
2,2,-1.098849,-1.276297,-0.342354,2.166478
3,3,0.228914,1.210185,-1.421891,-0.42401
4,4,0.34067,-1.174363,-1.636624,0.763863
5,5,-0.369826,1.031702,0.908719,0.889156
6,6,0.618391,-0.393494,-0.175755,-1.79853
7,7,1.472473,0.167461,0.65164,0.070426


In [62]:
# Write to an HDF5 store. `key` is passed by keyword because positional use of
# arguments other than the path is deprecated in recent pandas releases.
df.to_hdf('foo.h5', key='df')

In [63]:
pd.read_hdf('foo.h5', 'df')

Unnamed: 0,A,B,C,D
0,-0.065422,0.457053,-1.624353,0.134555
1,-0.308215,-0.071855,-0.638223,0.759559
2,-1.098849,-1.276297,-0.342354,2.166478
3,0.228914,1.210185,-1.421891,-0.42401
4,0.34067,-1.174363,-1.636624,0.763863
5,-0.369826,1.031702,0.908719,0.889156
6,0.618391,-0.393494,-0.175755,-1.79853
7,1.472473,0.167461,0.65164,0.070426


In [64]:
# 写入 Excel 文件：
df.to_excel('foo.xlsx', sheet_name='Sheet1')

In [65]:
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])

Unnamed: 0.1,Unnamed: 0,A,B,C,D
0,0,-0.065422,0.457053,-1.624353,0.134555
1,1,-0.308215,-0.071855,-0.638223,0.759559
2,2,-1.098849,-1.276297,-0.342354,2.166478
3,3,0.228914,1.210185,-1.421891,-0.42401
4,4,0.34067,-1.174363,-1.636624,0.763863
5,5,-0.369826,1.031702,0.908719,0.889156
6,6,0.618391,-0.393494,-0.175755,-1.79853
7,7,1.472473,0.167461,0.65164,0.070426


# numpy补充

In [66]:
#读取数据
world_alchohol = np.genfromtxt("/kaggle/input/world-alcohol/world_alcohol.txt",delimiter=",",dtype=str,encoding="utf-8")
print(type(world_alchohol))
print(world_alchohol)

<class 'numpy.ndarray'>
[['Year' 'WHO region' 'Country' 'Beverage Types' 'Display Value']
 ['1986' 'Western Pacific' 'Viet Nam' 'Wine' '0']
 ['1986' 'Americas' 'Uruguay' 'Other' '0.5']
 ...
 ['1987' 'Africa' 'Malawi' 'Other' '0.75']
 ['1989' 'Americas' 'Bahamas' 'Wine' '1.5']
 ['1985' 'Africa' 'Malawi' 'Spirits' '0.31']]
