## 1、数据导入

In [2]:
import pandas as pd

In [3]:
# Load the Brent crude futures history from CSV and preview the first five rows
df_csv = pd.read_csv(filepath_or_buffer='伦敦布伦特原油期货历史数据.csv')
df_csv.head(n=5)

Unnamed: 0,日期,收盘,开盘,高,低,交易量,涨跌幅
0,2024-5-1,83.44,85.8,85.89,83.29,482.83K,-5.03%
1,2024-4-30,87.86,88.41,88.79,87.46,14.19K,-0.61%
2,2024-4-29,88.4,89.22,89.29,88.11,99.95K,-1.23%
3,2024-4-26,89.5,89.23,89.83,88.78,113.70K,0.55%
4,2024-4-25,89.01,87.89,89.31,87.31,241.35K,1.12%


## 2、数据信息查看

In [4]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so column assignments made through `data` are also visible through `df_csv`.
data = df_csv

In [5]:
# Inspect the size of the data as a (rows, columns) tuple
data.shape

(611, 7)

In [6]:
# Inspect the dtype of every column
data.dtypes

日期      object
收盘     float64
开盘     float64
高      float64
低      float64
交易量     object
涨跌幅     object
dtype: object

In [9]:
# Convert the percent-change column from strings such as "-5.03%" to fractional floats.
# Guarded so the cell can safely be re-run: once the column is numeric the `.str`
# accessor is no longer available and an unguarded second run raises AttributeError.
if not pd.api.types.is_numeric_dtype(data['涨跌幅']):
    data['涨跌幅'] = data['涨跌幅'].str.rstrip('%').astype(float) / 100.0

In [28]:
# Parse the '日期' column into datetime values
data['日期'] = data['日期'].pipe(pd.to_datetime)

In [30]:
# Re-check the dtype of every column after the '涨跌幅' and '日期' conversions
data.dtypes

日期     datetime64[ns]
收盘            float64
开盘            float64
高             float64
低             float64
交易量            object
涨跌幅           float64
dtype: object

In [29]:
# Overall summary: index range, per-column non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611 entries, 0 to 610
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   日期      611 non-null    datetime64[ns]
 1   收盘      611 non-null    float64       
 2   开盘      611 non-null    float64       
 3   高       611 non-null    float64       
 4   低       611 non-null    float64       
 5   交易量     610 non-null    object        
 6   涨跌幅     611 non-null    float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 33.5+ KB


In [12]:
# Descriptive statistics (count, mean, min, quartiles, max, std)
data.describe()

Unnamed: 0,日期,收盘,开盘,高,低,涨跌幅
count,611,611.0,611.0,611.0,611.0,611.0
mean,2023-02-23 01:48:24.746317312,89.364517,89.362029,90.899165,87.755794,0.000489
min,2021-12-20 00:00:00,71.52,71.28,72.87,69.28,-0.1316
25%,2022-07-21 12:00:00,80.865,80.635,82.055,79.52,-0.0121
50%,2023-02-23 00:00:00,86.09,86.15,87.05,85.06,0.0021
75%,2023-09-26 12:00:00,94.61,94.5,96.255,92.77,0.015
max,2024-05-01 00:00:00,127.98,130.28,139.13,122.5,0.0879
std,,11.939569,11.950079,12.520757,11.33041,0.023624


In [13]:
# Column labels
data.columns

Index(['日期', '收盘', '开盘', '高', '低', '交易量', '涨跌幅'], dtype='object')

In [14]:
# Show the first five rows
data.head(n=5)

Unnamed: 0,日期,收盘,开盘,高,低,交易量,涨跌幅
0,2024-05-01,83.44,85.8,85.89,83.29,482.83K,-0.0503
1,2024-04-30,87.86,88.41,88.79,87.46,14.19K,-0.0061
2,2024-04-29,88.4,89.22,89.29,88.11,99.95K,-0.0123
3,2024-04-26,89.5,89.23,89.83,88.78,113.70K,0.0055
4,2024-04-25,89.01,87.89,89.31,87.31,241.35K,0.0112


In [15]:
# Show the last five rows
data.tail(n=5)

Unnamed: 0,日期,收盘,开盘,高,低,交易量,涨跌幅
606,2021-12-24,76.14,76.72,76.79,75.75,30.54K,-0.0092
607,2021-12-23,76.85,75.47,77.0,74.78,161.24K,0.0207
608,2021-12-22,75.29,74.05,75.67,73.62,166.71K,0.0177
609,2021-12-21,73.98,71.88,74.27,71.24,214.07K,0.0344
610,2021-12-20,71.52,72.82,72.87,69.28,333.76K,-0.0272


## 3、数据清洗以及预处理

### 查找空值

In [16]:
# Per column: does it contain at least one missing value?
data.isna().any()

日期     False
收盘     False
开盘     False
高      False
低      False
交易量     True
涨跌幅    False
dtype: bool

In [17]:
# Per row: does it contain at least one missing value?
data.isna().any(axis='columns')

0      False
1      False
2      False
3      False
4      False
       ...  
606    False
607    False
608    False
609    False
610    False
Length: 611, dtype: bool

In [18]:
# Show the rows that contain a missing value
data[data.isna().any(axis='columns')]

Unnamed: 0,日期,收盘,开盘,高,低,交易量,涨跌幅
423,2022-09-08,88.62,87.98,89.82,87.25,,0.007


In [19]:
# Count of missing values in each column
data.isna().sum()

日期     0
收盘     0
开盘     0
高      0
低      0
交易量    1
涨跌幅    0
dtype: int64

### 处理空值

In [21]:
# Alternative: drop the rows that contain missing values (not used)
# data3 = data.dropna()
# data3

In [20]:
# Fill the missing trading-volume entry with a zero placeholder.
# The fill is limited to the '交易量' column and uses the string '0' so that the column
# keeps holding only strings (its other values look like '482.83K'); a blanket
# fillna(0) would put an integer into that text column and would also fill any
# other column. fillna returns a new frame, so `data` itself is left unchanged.
data_null_0 = data.fillna({'交易量': '0'})
data_null_0

Unnamed: 0,日期,收盘,开盘,高,低,交易量,涨跌幅
0,2024-05-01,83.44,85.80,85.89,83.29,482.83K,-0.0503
1,2024-04-30,87.86,88.41,88.79,87.46,14.19K,-0.0061
2,2024-04-29,88.40,89.22,89.29,88.11,99.95K,-0.0123
3,2024-04-26,89.50,89.23,89.83,88.78,113.70K,0.0055
4,2024-04-25,89.01,87.89,89.31,87.31,241.35K,0.0112
...,...,...,...,...,...,...,...
606,2021-12-24,76.14,76.72,76.79,75.75,30.54K,-0.0092
607,2021-12-23,76.85,75.47,77.00,74.78,161.24K,0.0207
608,2021-12-22,75.29,74.05,75.67,73.62,166.71K,0.0177
609,2021-12-21,73.98,71.88,74.27,71.24,214.07K,0.0344


In [None]:
# # Alternative: fill the missing value (with the median, the mean, ...)
# # Here the mean is used
# # Helper that converts values carrying a "K" suffix to floats
# def k_to_float(value):
#     if isinstance(value, str) and value.endswith("K"):
#         return float(value[:-1]) * 1000
#     else:
#         return float(value)

# # Apply the helper to the column that carries the "K" unit
# df_csv['交易量'] = df_csv['交易量'].apply(k_to_float)
# # Fill with the mean
# data_null_value = data.fillna(value = {'交易量':data['交易量'].mean()},inplace = False)
# data_null_value

In [21]:
# The source frame is unchanged: count its missing values per column again
data.isna().sum()

日期     0
收盘     0
开盘     0
高      0
低      0
交易量    1
涨跌幅    0
dtype: int64

In [22]:
# The filled frame should report no missing values in any column
data_null_0.isna().sum()

日期     0
收盘     0
开盘     0
高      0
低      0
交易量    0
涨跌幅    0
dtype: int64

In [23]:
# Row with the highest opening price.
# idxmax() returns an index *label*, so the row is fetched with .loc (label-based)
# rather than .iloc (position-based); the two only coincide on a default RangeIndex.
max_row_index = df_csv['开盘'].idxmax()
max_row = df_csv.loc[max_row_index]
max_row

日期     2022-03-09 00:00:00
收盘                  111.14
开盘                  130.28
高                   131.64
低                    105.6
交易量                521.58K
涨跌幅                -0.1316
Name: 553, dtype: object

In [24]:
# Row with the highest closing price.
# idxmax() returns an index *label*, so the row is fetched with .loc (label-based)
# rather than .iloc (position-based); the two only coincide on a default RangeIndex.
max_row_index = df_csv['收盘'].idxmax()
max_row = df_csv.loc[max_row_index]
max_row

日期     2022-03-08 00:00:00
收盘                  127.98
开盘                  124.56
高                   133.15
低                   121.31
交易量                469.99K
涨跌幅                 0.0387
Name: 554, dtype: object

In [25]:
# Row with the lowest opening price.
# idxmin() returns an index *label*, so the row is fetched with .loc (label-based)
# rather than .iloc (position-based); the two only coincide on a default RangeIndex.
min_row_index = df_csv['开盘'].idxmin()
min_row = df_csv.loc[min_row_index]
min_row

日期     2023-05-04 00:00:00
收盘                    72.5
开盘                   71.28
高                    73.72
低                    71.28
交易量                362.23K
涨跌幅                 0.0024
Name: 256, dtype: object

In [26]:
# Row with the lowest closing price.
# idxmin() returns an index *label*, so the row is fetched with .loc (label-based)
# rather than .iloc (position-based); the two only coincide on a default RangeIndex.
min_row_index = df_csv['收盘'].idxmin()
min_row = df_csv.loc[min_row_index]
min_row

日期     2021-12-20 00:00:00
收盘                   71.52
开盘                   72.82
高                    72.87
低                    69.28
交易量                333.76K
涨跌幅                -0.0272
Name: 610, dtype: object

In [27]:
# Write the zero-filled frame to a new CSV file, leaving out the index column
data_null_0.to_csv(path_or_buf='伦敦布伦特原油期货历史数据(清洗).csv', index=False)