In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# One-dimensional labelled array; the np.nan entry stands for a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [4]:
s  # bare expression: the notebook renders the Series as the cell output

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# Six consecutive daily timestamps starting at 2013-01-01, used below as a row index.
dates = pd.date_range(start='20130101', periods=6)

In [7]:
dates  # show the generated DatetimeIndex

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# Two-dimensional frame: 6 rows (one per entry in `dates`) x 4 columns A-D,
# filled with draws from the standard normal distribution.
# NOTE(review): no seed is set in the visible cells, so the values change on
# every fresh run and will not match previously saved outputs.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

In [9]:
df  # render the full 6x4 frame

Unnamed: 0,A,B,C,D
2013-01-01,0.642333,-0.77297,-0.823404,0.495468
2013-01-02,0.18538,-0.350632,0.220256,-0.082338
2013-01-03,-0.437222,-0.183861,-0.90164,0.518012
2013-01-04,0.108046,-1.118019,1.342771,-0.902833
2013-01-05,2.197607,1.352511,1.631912,0.098881
2013-01-06,0.421616,1.796068,-0.81189,1.328256


In [10]:
# Frame built from a mapping of column name -> values with a different dtype
# per column. Scalar entries (A, B, F) are repeated for every row; the row
# labels come from the explicitly indexed Series in column C.
df2 = pd.DataFrame(dict(
    A=1.0,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    D=np.array([3, 3, 3, 3], dtype='int32'),
    E=pd.Categorical(['test', 'train'] * 2),
    F='foo',
))

In [11]:
df2  # render the mixed-dtype frame

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [12]:
df2.dtypes # data type of each column

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [14]:
df2.head(2) # first two rows

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo


In [15]:
df2.tail(3) # last three rows

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [19]:
df.index  # row labels of df (the DatetimeIndex it was constructed with)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
df2.index  # row labels of df2 (integer labels 0-3)

Int64Index([0, 1, 2, 3], dtype='int64')

In [20]:
df.values  # the frame's data as a plain NumPy array, without row/column labels

array([[ 0.64233272, -0.77297006, -0.82340364,  0.49546812],
       [ 0.18537978, -0.35063155,  0.22025568, -0.08233841],
       [-0.4372219 , -0.18386053, -0.9016397 ,  0.51801185],
       [ 0.10804587, -1.11801911,  1.34277136, -0.90283271],
       [ 2.19760726,  1.35251076,  1.6319119 ,  0.09888119],
       [ 0.42161594,  1.79606758, -0.81189005,  1.32825639]])

In [21]:
df.T # transpose: columns become rows and vice versa

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.642333,0.18538,-0.437222,0.108046,2.197607,0.421616
B,-0.77297,-0.350632,-0.183861,-1.118019,1.352511,1.796068
C,-0.823404,0.220256,-0.90164,1.342771,1.631912,-0.81189
D,0.495468,-0.082338,0.518012,-0.902833,0.098881,1.328256


In [22]:
df.sort_values(by='B') # sort rows by the values of one column (ascending)

Unnamed: 0,A,B,C,D
2013-01-04,0.108046,-1.118019,1.342771,-0.902833
2013-01-01,0.642333,-0.77297,-0.823404,0.495468
2013-01-02,0.18538,-0.350632,0.220256,-0.082338
2013-01-03,-0.437222,-0.183861,-0.90164,0.518012
2013-01-05,2.197607,1.352511,1.631912,0.098881
2013-01-06,0.421616,1.796068,-0.81189,1.328256


In [23]:
df['A'] # projection: select a single column, returned as a Series

2013-01-01    0.642333
2013-01-02    0.185380
2013-01-03   -0.437222
2013-01-04    0.108046
2013-01-05    2.197607
2013-01-06    0.421616
Freq: D, Name: A, dtype: float64

In [26]:
df[0:2] # slice rows by position (end-exclusive)

Unnamed: 0,A,B,C,D
2013-01-01,0.642333,-0.77297,-0.823404,0.495468
2013-01-02,0.18538,-0.350632,0.220256,-0.082338


In [27]:
df['20130102':'20130104'] # slice rows by index label (both endpoints included)

Unnamed: 0,A,B,C,D
2013-01-02,0.18538,-0.350632,0.220256,-0.082338
2013-01-03,-0.437222,-0.183861,-0.90164,0.518012
2013-01-04,0.108046,-1.118019,1.342771,-0.902833


In [28]:
df.loc[:,['A','B']] # all rows, selected columns by label

Unnamed: 0,A,B
2013-01-01,0.642333,-0.77297
2013-01-02,0.18538,-0.350632
2013-01-03,-0.437222,-0.183861
2013-01-04,0.108046,-1.118019
2013-01-05,2.197607,1.352511
2013-01-06,0.421616,1.796068


In [29]:
df.loc['20130102':'20130104',['A','B']] # slice rows and columns by label at the same time

Unnamed: 0,A,B
2013-01-02,0.18538,-0.350632
2013-01-03,-0.437222,-0.183861
2013-01-04,0.108046,-1.118019


In [41]:
# Scalar lookup by row label and column label in a single .loc call, instead of
# chaining a second [] onto the intermediate row Series.
df.loc['20130102', 'A']

0.18537978096442437

In [39]:
# First row, column 'A', fetched in one positional lookup: get_loc converts the
# column label to its integer position, so no intermediate row Series is built
# and then indexed a second time.
df.iloc[0, df.columns.get_loc('A')]

0.6423327151848488

In [42]:
df.iloc[3:5,0:2]  # positional slices: rows 3-4 and columns 0-1 (end-exclusive)

Unnamed: 0,A,B
2013-01-04,0.108046,-1.118019
2013-01-05,2.197607,1.352511


In [43]:
df.iloc[[1,2,4],[0,2]]  # explicit lists of row positions and column positions

Unnamed: 0,A,C
2013-01-02,0.18538,0.220256
2013-01-03,-0.437222,-0.90164
2013-01-05,2.197607,1.631912


In [44]:
df[df['A'] > 0] # boolean mask: keep only the rows where column A is positive

Unnamed: 0,A,B,C,D
2013-01-01,0.642333,-0.77297,-0.823404,0.495468
2013-01-02,0.18538,-0.350632,0.220256,-0.082338
2013-01-04,0.108046,-1.118019,1.342771,-0.902833
2013-01-05,2.197607,1.352511,1.631912,0.098881
2013-01-06,0.421616,1.796068,-0.81189,1.328256


In [45]:
 df.groupby('A').sum() # aggregation: sum the other columns within each distinct value of A

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.437222,-0.183861,-0.90164,0.518012
0.108046,-1.118019,1.342771,-0.902833
0.18538,-0.350632,0.220256,-0.082338
0.421616,1.796068,-0.81189,1.328256
0.642333,-0.77297,-0.823404,0.495468
2.197607,1.352511,1.631912,0.098881


In [46]:
df.groupby(['A','B']).sum()  # group on the (A, B) pair and sum the remaining columns

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.437222,-0.183861,-0.90164,0.518012
0.108046,-1.118019,1.342771,-0.902833
0.18538,-0.350632,0.220256,-0.082338
0.421616,1.796068,-0.81189,1.328256
0.642333,-0.77297,-0.823404,0.495468
2.197607,1.352511,1.631912,0.098881


In [47]:
news = pd.read_csv("news.csv")  # load the CSV; the relative path is resolved against the working directory

In [48]:
news  # render the loaded frame

Unnamed: 0,ID,score,title,url
0,1,1,Reverse engineering YouTube demonetization al...,https://docs.google.com/document/d/155yNpfR7dG...
1,2,9,Joplin A note-taking and to-do app with build...,https://github.com/laurent22/joplin/
2,3,4,Coinbase Ordered to Turn Over Identities of 1...,https://motherboard.vice.com/en_us/article/ywn...
3,4,8,Australian uses snack bags as Faraday cage to...,https://arstechnica.com/information-technology...
4,5,3,A blog I started on Neural Networks and Proba...,https://jontysinai.github.io
5,6,2,It Looks Like Nobel Economics Laureates Don't...,https://www.bloomberg.com/news/articles/2017-1...
6,7,7,Seventh RISC-V Workshop: Day Two - LowRISC,http://www.lowrisc.org/blog/2017/11/seventh-ri...
7,8,6,China's Art Factories: Van Gogh from the Swea...,"http://www.spiegel.de/international/0,1518,433..."
8,9,5,Judge Tells Uber Lawyer: 'It Looks Like You C...,https://www.nytimes.com/2017/11/29/business/wa...
9,10,0,"As a solo developer, I decided to offer phone...",http://plumshell.com/2017/11/30/as-a-solo-app-...
