In [1]:
import pandas as pd

# 1 Series 详解

## 1.1 创建 Series

### 1.1.1创建 Series 的最简单方法是传入一个Python列表

In [2]:
s = pd.Series(['banana',42])
print(s)
print(type(s))

0    banana
1        42
dtype: object
<class 'pandas.core.series.Series'>


In [3]:
s = pd.Series([52,42])
print(s)
print(type(s))

0    52
1    42
dtype: int64
<class 'pandas.core.series.Series'>


In [4]:
s = pd.Series(['banana','apple'])
print(s)
print(type(s))

0    banana
1     apple
dtype: object
<class 'pandas.core.series.Series'>


### 上面的结果中，左边显示的0，1是 Series 的行标签，默认为0，1，2，3...

### 1.1.2创建 Series 时，也可以通过 index 参数来指定行标签

In [5]:
s = pd.Series(['smart',18],index=['name','age'])
print(s)
print(type(s))

name    smart
age        18
dtype: object
<class 'pandas.core.series.Series'>


## 1.2 Series常用操作

### 1.2.1 加载scientist.csv数据集，并获取Age列的数据

In [6]:
scientists = pd.read_csv('./data/scientists.csv')
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [7]:
# 获取Age列的数据
age_series = scientists['Age']
print(age_series)
print(type(age_series))

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


In [8]:
# 查看Series的数据的形状
age_series.shape

(8,)

In [9]:
# 查看Series数据的个数
age_series.size

8

In [10]:
# 获取Series数据的行标签
age_series.index

RangeIndex(start=0, stop=8, step=1)

In [11]:
# 获取Series数据的元素值
age_series.values

array([37, 61, 90, 66, 56, 45, 41, 77], dtype=int64)

In [12]:
# 获取Series数据的行标签，和s.index效果相同
age_series.keys()

RangeIndex(start=0, stop=8, step=1)

In [13]:
# 根据行标签获取Series中的某个元素数据[行标签]
age_series.loc[0]

37

In [14]:
# 根据行位置获取Series中的某个元素数据[列标签]
age_series.iloc[1]

61

In [15]:
# 查看Series元素的类型
age_series.dtypes

dtype('int64')

### 常用统计方法

In [16]:
# 计算年龄的平均值
age_series.mean()

59.125

In [17]:
# 计算年龄的最大值
age_series.max()

90

In [18]:
# 计算年龄的最小值
age_series.min()

37

In [19]:
# 计算年龄的标准差
age_series.std()

18.325918413937288

In [20]:
age_series.value_counts

<bound method IndexOpsMixin.value_counts of 0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64>

In [21]:
# 获取职业这一列数据
occupation_series = scientists['Occupation']
print(occupation_series)
occupation_series.value_counts

0               Chemist
1          Statistician
2                 Nurse
3               Chemist
4             Biologist
5             Physician
6    Computer Scientist
7         Mathematician
Name: Occupation, dtype: object


<bound method IndexOpsMixin.value_counts of 0               Chemist
1          Statistician
2                 Nurse
3               Chemist
4             Biologist
5             Physician
6    Computer Scientist
7         Mathematician
Name: Occupation, dtype: object>

In [22]:
age_series.count()

8

In [23]:
# age_series是数值型数据
age_series.describe()

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64

In [24]:
# occupation_series是非数值型数据
occupation_series.describe()

count           8
unique          7
top       Chemist
freq            2
Name: Occupation, dtype: object

## 1.3 bool索引

In [25]:
bool_values = [False,True,True,True,False,False,False,True]
age_series[bool_values]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [26]:
age_series>age_series.mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [27]:
age_series[age_series>age_series.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

## 1.4 Series运算

In [28]:
age_series

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [29]:
# 加法
age_series + 100

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

In [30]:
# 乘法
age_series * 2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

### Serie和另一个Series运算

In [31]:
# 加法
age_series + age_series

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [32]:
# 乘法
age_series * age_series

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64

In [33]:
# 创建新的Series数据
new_series = pd.Series([1,100])
new_series

0      1
1    100
dtype: int64

In [34]:
# 两个Series相加
age_series + new_series

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

In [35]:
# 两个Series相乘
age_series * new_series

0      37.0
1    6100.0
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
dtype: float64

# 2 DataFrame详解

## 2.1 创建DataFrame

In [36]:
peoples = pd.DataFrame({
    'Name':['Smart','David'],
    'Occupation':['Teacher','IT Engineer'],
    'Age':[18,30]
})
peoples

Unnamed: 0,Name,Occupation,Age
0,Smart,Teacher,18
1,David,IT Engineer,30


In [37]:
peoples = pd.DataFrame({
    'Occupation': ['Teacher', 'IT Engineer'],
    'Age': [18, 30]
}, columns=['Age', 'Occupation'], index=['Smart', 'David'])
peoples

Unnamed: 0,Age,Occupation
Smart,18,Teacher
David,30,IT Engineer


In [38]:
peoples = pd.DataFrame([
    ['Teacher', 18],
    ['IT Engineer', 30]
], columns=['Occupation', 'Age'], index=['Smart', 'David'])
peoples

Unnamed: 0,Occupation,Age
Smart,Teacher,18
David,IT Engineer,30


## DataFrame 常用操作

In [39]:
# 查看 DataFrame 数据的形状
scientists.shape

(8, 5)

In [40]:
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [41]:
# 查看DataFrame的元素总个数
scientists.size

40

In [42]:
#查看DataFrame的维度
scientists.ndim

2

In [43]:
# 获取DataFrame数据的行数
len(scientists)

8

In [44]:
# 获取DataFrame的行标签
scientists.index

RangeIndex(start=0, stop=8, step=1)

In [45]:
# 获取DataFrame的列标签
scientists.columns

Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')

In [46]:
# 查看 DataFrame 每列数据元素的类型
scientists.dtypes

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object

In [47]:
# 查看 DataFrame 每列的结构
scientists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        8 non-null      object
 1   Born        8 non-null      object
 2   Died        8 non-null      object
 3   Age         8 non-null      int64 
 4   Occupation  8 non-null      object
dtypes: int64(1), object(4)
memory usage: 448.0+ bytes


In [48]:
# 获取 DataFrame 的前 n 行数据，n 默认为 5
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [49]:
# 获取 DataFrame 的后 n 行数据，n 默认为 5
scientists.tail()

Unnamed: 0,Name,Born,Died,Age,Occupation
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 常用统计方法

In [50]:
# 计算DataFrame数据中每一列的最大值
scientists.max()

Name          William Gosset
Born              1920-07-25
Died              1964-04-14
Age                       90
Occupation      Statistician
dtype: object

In [51]:
# 计算DataFrame数据中每一列的最小值
scientists.min()

Name          Alan Turing
Born           1777-04-30
Died           1855-02-23
Age                    37
Occupation      Biologist
dtype: object

In [52]:
# 统计 DataFrame 数据中每列非空(NaN)元素的个数
scientists.count()

Name          8
Born          8
Died          8
Age           8
Occupation    8
dtype: int64

In [53]:
# 显示 DataFrame 数据中每列元素的各种统计值
# describe方法只显示数值型列的统计信息，可以通过include参数设置显示非数值型列的统计信息
scientists.describe()

Unnamed: 0,Age
count,8.0
mean,59.125
std,18.325918
min,37.0
25%,44.0
50%,58.5
75%,68.75
max,90.0


In [54]:
# 导入numpy包
import numpy as np

In [55]:
scientists.describe(include=[np.object_])

Unnamed: 0,Name,Born,Died,Occupation
count,8,8,8,8
unique,8,8,8,7
top,Rosaline Franklin,1920-07-25,1958-04-16,Chemist
freq,1,1,1,2


In [56]:
bool_values = [False, True, True, True, False, False, False, True]
scientists[bool_values]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [57]:
# 应用：获取 scientists 中 Age 大于平均值的科学家信息
scientists[scientists['Age'] > scientists['Age'].mean()]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [58]:
scientists['Age'] > scientists['Age'].mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

## DataFrame运算

In [59]:
# DataFrame 和 数值型数据运算
scientists * 2

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


In [60]:
# DataFrame 和 另一 DataFrame 运算
scientists + scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


In [61]:
# DataFrame 和 另一 DataFrame 运算
scientists + scientists[:4]

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74.0,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122.0,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180.0,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132.0,ChemistChemist
4,,,,,
5,,,,,
6,,,,,
7,,,,,


### 行标签和列表签操作

In [62]:
# 加载数据文件时，如果不指定行标签，Pandas会自动加上从0开始的行标签
# 可以通过df.set_index('列名')的方法重新将指定的列数据设置为行标签
scientists = pd.read_csv('./data/scientists.csv')
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [63]:
# 设置 Name 列的值作为行标签
scientists_df = scientists.set_index('Name')
scientists_df

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [64]:
# 注意：reset_index返回的是一个新的 DataFrame
scientists_df.reset_index()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 加载数据时，指定某列数据作为 DataFrame 行标签

In [65]:
# 加载数据文件的时候，可以通过 index_col 参数，指定使用某一列数据作为行标签，index_col 参数可以指定列名或列位置编号·
pd.read_csv('./data/scientists.csv', index_col='Name')

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [66]:
pd.read_csv('./data/scientists.csv', index_col=0)

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 加载数据后，修改 DataFrame 行标签和列标签

In [67]:
scientists = pd.read_csv('./data/scientists.csv', index_col='Name')
scientists

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [68]:
# 使用 rename 修改行标签和列标签
index_name = {'Rosaline Franklin': 'rosaline franklin', 'John Snow': 'john snow'}
columns_name = {'Born': 'born', 'Age': 'age'}
# 注意：rename 修改之后，返回的是一个新的 DataFrame
scientists.rename(index=index_name, columns=columns_name)

Unnamed: 0_level_0,born,Died,age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rosaline franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
john snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [69]:
# 使用 df.index 和 df.columns 分别修改行标签和列标签
# 修改行标签
scientists.index = ['rosaline franklin', 'William Gosset', 'Florence Nightingale',
       'Marie Curie', 'Rachel Carson', 'john snow', 'Alan Turing',
       'Johann Gauss']
# 修改列标签
scientists.columns = ['born', 'Died', 'age', 'Occupation']
scientists

Unnamed: 0,born,Died,age,Occupation
rosaline franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
john snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician
