### 什么是Series?

在pandas中，Series是一位容器，类似于Python的内置的列表，表示dataframe的每一列

In [1]:
import pandas as pd


In [9]:
s = pd.Series([1,2,3,4])
print(s)
print('-----------------------------------')
s1 = pd.Series(['whf',21])
print(s1)
#左边显示的‘行号’实际是Series的索引

print('-----------------------------------')
s = pd.Series(['Wes McKinney','Creator of Pandas'],
             index=['Person','Who'])
print(s)

0    1
1    2
2    3
3    4
dtype: int64
-----------------------------------
0    whf
1     21
dtype: object
-----------------------------------
Person         Wes McKinney
Who       Creator of Pandas
dtype: object


In [14]:
#创建DataFrame
family = pd.DataFrame({
    'Name':['whf1','whf2'],
    'Occupationa':['Student','Student'],
    'Born':['2005-01-01','2005-01-02'],
    'Award':['2010-01-01','2020-01-02'],
    'Age':[5,17]})
print(family)

   Name Occupationa        Born       Award  Age
0  whf1     Student  2005-01-01  2010-01-01    5
1  whf2     Student  2005-01-02  2020-01-02   17


In [25]:
#将name列作为行索引
family = pd.DataFrame({
    'Name':['whf1','whf2'],
    'Occupationa':['Student','Student'],
    'Born':['2005-01-01','2005-01-02'],
    'Award':['2010-01-01','2020-01-02'],
    'Age':[5,17]},
    index=['whf1','whf2'],
    columns=['Occupationa','Born','Award','Age']
)
print(family)

     Occupationa        Born       Award  Age
whf1     Student  2005-01-01  2010-01-01    5
whf2     Student  2005-01-02  2020-01-02   17


In [22]:
#将name列作为行索引,并且有序
from collections import OrderedDict
family = pd.DataFrame(OrderedDict([
    ('Name',['whf1','whf2']),
    ('Occupationa',['Student','Student']),
    ('Born',['2005-01-01','2005-01-02']),
    ('Award',['2010-01-01','2020-01-02']),
    ('Age',[5,17])])
)
print(family)

   Name Occupationa        Born       Award  Age
0  whf1     Student  2005-01-01  2010-01-01    5
1  whf2     Student  2005-01-02  2020-01-02   17


In [35]:
first_row=family.loc['whf1']
print(first_row)

print("\n")

print(first_row.index)
print(first_row.values)
print(first_row.keys())

Occupationa       Student
Born           2005-01-01
Award          2010-01-01
Age                     5
Name: whf1, dtype: object


Index(['Occupationa', 'Born', 'Award', 'Age'], dtype='object')
['Student' '2005-01-01' '2010-01-01' 5]
Index(['Occupationa', 'Born', 'Award', 'Age'], dtype='object')


In [37]:
scientists = pd.read_csv("../data/scientists.csv")

In [39]:
#读取数据表中的年龄信息
ages = scientists['Age']
print(ages)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [41]:
#获取年龄的基本统计量
print(ages.describe())

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64


### 操作自动对其和向量化（广播）

1. 同长度向量 print(ages+ages)

2. 向量和整数（标量）运算

3. 不同长度向量的运算 print(ages+pd.Series([1,100])),对于其他类型，shape必须匹配

4. 带有常见索引标签的向量（自动对齐）



In [43]:
#选择DataFrame的子集
print(scientists[scientists['Age']>scientists['Age'].mean()])

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [44]:
first_half = scientists[:4]
print(first_half)

                   Name        Born        Died  Age    Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist
1        William Gosset  1876-06-13  1937-10-16   61  Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist


### 更改Series 和 DataFrame


In [51]:
#scientists中的Born列，属性为object，修改为datatime
print(scientists['Born'].dtype)
print("\n")
born_datatime = pd.to_datetime(scientists['Born'],format='%Y-%m-%d')
print(born_datatime)
died_datatime = pd.to_datetime(scientists['Died'],format='%Y-%m-%d')

object


0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]


In [55]:
#新建一组新列
scientists['born_dt'],scientists['died_dt'] = (born_datatime,died_datatime)
print(scientists.head())
print(scientists.shape)

                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist 1920-07-25   
1        William Gosset  1876-06-13  1937-10-16   61  Statistician 1876-06-13   
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse 1820-05-12   
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist 1867-11-07   
4         Rachel Carson  1907-05-27  1964-04-14   56     Biologist 1907-05-27   

     died_dt  
0 1958-04-16  
1 1937-10-16  
2 1910-08-13  
3 1934-07-04  
4 1964-04-14  
(8, 7)


将Age列的数据打乱

In [58]:
import random
random.seed(42)
random.shuffle(scientists['Age'])
print(scientists['Age'])

0    77
1    90
2    37
3    61
4    41
5    45
6    66
7    56
Name: Age, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i], x[j] = x[j], x[i]


In [61]:
scientists['Age'] = scientists['Age'].\
    sample(len(scientists['Age']),random_state=24).\
    reset_index(drop=True)
print(scientists['Age'])

0    66
1    56
2    41
3    45
4    77
5    37
6    90
7    61
Name: Age, dtype: int64


In [66]:
#假设scientists中的age出错，求出真实的年龄
print(scientists.head())
print("\n")
scientists['age_days_dt'] = (scientists['died_dt']-scientists['born_dt'])
print(scientists.head())
print("\n")
scientists['age_days_dt'] = scientists['age_days_dt'].astype('timedelta64[Y]')
print(scientists.head())

                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-25  1958-04-16   66       Chemist 1920-07-25   
1        William Gosset  1876-06-13  1937-10-16   56  Statistician 1876-06-13   
2  Florence Nightingale  1820-05-12  1910-08-13   41         Nurse 1820-05-12   
3           Marie Curie  1867-11-07  1934-07-04   45       Chemist 1867-11-07   
4         Rachel Carson  1907-05-27  1964-04-14   77     Biologist 1907-05-27   

     died_dt age_days_dt  
0 1958-04-16  13779 days  
1 1937-10-16  22404 days  
2 1910-08-13  32964 days  
3 1934-07-04  24345 days  
4 1964-04-14  20777 days  


                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-25  1958-04-16   66       Chemist 1920-07-25   
1        William Gosset  1876-06-13  1937-10-16   56  Statistician 1876-06-13   
2  Florence Nightingale  1820-05-12  1910-08-13   41         Nurse 1820-05-12   
3           Marie Curie  

In [67]:
test = scientists


In [70]:
print(test)

                   Name        Born        Died   Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16  37.0             Chemist   
1        William Gosset  1876-06-13  1937-10-16  61.0        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13  90.0               Nurse   
3           Marie Curie  1867-11-07  1934-07-04  66.0             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14  56.0           Biologist   
5             John Snow  1813-03-15  1858-06-16  45.0           Physician   
6           Alan Turing  1912-06-23  1954-06-07  41.0  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23  77.0       Mathematician   

     born_dt    died_dt  age_days_dt  
0 1920-07-25 1958-04-16         37.0  
1 1876-06-13 1937-10-16         61.0  
2 1820-05-12 1910-08-13         90.0  
3 1867-11-07 1934-07-04         66.0  
4 1907-05-27 1964-04-14         56.0  
5 1813-03-15 1858-06-16         45.0  
6 1912-06-23 1954-06-07         4

In [69]:
test['Age'] = test['age_days_dt']

### 导入导出数据

In [71]:
names = scientists['Name']
print(names)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object


In [80]:
#保存
names.to_pickle('../output/scientists_names_series.pickle')
scientists.to_pickle('../output/scientists_df.pickle')
import glob
glob.glob('..//output//*')

['..//output\\scientists_df.pickle',
 '..//output\\scientists_names_series.pickle',
 '..//output\\南丁格尔玫瑰图.html']

In [82]:
#读取pickle,其扩展名可以是.p .pl .pickle
scientists_names_from_pickle = pd.read_pickle('../output/scientists_names_series.pickle')
print(scientists_names_from_pickle)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object
