#### 导入必要的库

In [1]:
# 导入必要的库
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# 机器学习相关库
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

#### 读取数据集

In [2]:
train_df = pd.read_excel('data/final_data_1.xlsx')
test_df = pd.read_excel('data/test_features.xlsx')

In [3]:
train_df.head()

Unnamed: 0,ID,TOOL,210X1,210X2,210X3,210X4,210X5,210X6,210X7,210X8,...,750X1441,750X1442,750X1444,750X1445,750X1447,750X1448,750X1450,750X1451,750X1452,Value
0,NH0995,O,100.5,0.933,0.14,2.004,48.11,1.937,-0.54,1.13,...,209.8,2400000000000,0.0008,0.0008,0.0008,0.0008,0.0008,0.0008,2400000000000,2.814025
1,NH0996,O,100.5,0.583,0.14,1.752,48.0,1.572,-0.53,0.9,...,209.7,2400000000000,0.00081,0.00081,0.00081,0.00081,0.00081,0.00081,2400000000000,2.782955
2,NH0997,O,100.5,0.859,0.14,1.848,48.07,1.824,-0.7,0.58,...,209.8,2400000000000,0.00081,0.00081,0.00081,0.00081,0.00081,0.00081,2400000000000,2.725973
3,NH0998,O,100.45,1.639,0.14,2.021,48.03,2.331,-1.08,1.16,...,209.8,2400000000000,0.00081,0.00081,0.00081,0.00081,0.00081,0.00081,2400000000000,2.794768
4,NH0999,N,89.9,0.397,0.24,1.684,49.46,1.13,3.24,-0.76,...,213.3,2400000000000,0.00094,0.00094,0.00094,0.00094,0.00094,0.00094,2400000000000,2.787539


In [4]:
test_df.head()

Unnamed: 0,ID,TOOL,210X1,210X2,210X3,210X4,210X5,210X6,210X7,210X8,...,750X1440,750X1441,750X1442,750X1444,750X1445,750X1447,750X1448,750X1450,750X1451,750X1452
0,NH1943,L,100.15,0.534,0.24,1.281,57.27,3.422,-2.08,0.39,...,1.3,200.1,2400000000000,0.00069,0.00069,0.00069,0.00069,0.00069,0.00069,2400000000000
1,NH1944,N,102.05,0.488,0.26,1.972,68.01,4.576,3.78,-0.82,...,1.3,199.9,2400000000000,0.00063,0.00063,0.00063,0.00063,0.00063,0.00063,2400000000000
2,NH1945,N,102.05,0.476,0.26,1.946,67.98,4.586,3.45,-0.84,...,1.3,199.6,2400000000000,0.0007,0.0007,0.0007,0.0007,0.0007,0.0007,2400000000000
3,NH1946,J,101.1,0.354,0.4,0.854,71.15,3.736,0.04,-0.29,...,1.4,205.7,2400000000000,0.00066,0.00066,0.00066,0.00066,0.00066,0.00066,2400000000000
4,NH1947,J,101.05,0.367,0.4,0.981,71.21,3.722,-0.06,-0.53,...,1.4,205.8,2400000000000,0.00072,0.00072,0.00072,0.00072,0.00072,0.00072,2400000000000


#### 提取测试集和训练集的ID并保存
1. 或可用于后续分离测试集和训练集

In [5]:
train_df_ids = train_df[['ID']]
test_df_ids = test_df[['ID']]

#### 提取训练集的目标列

In [6]:
train_df_target = train_df[['Value']]
train_df_target.head()

Unnamed: 0,Value
0,2.814025
1,2.782955
2,2.725973
3,2.794768
4,2.787539


#### 去除训练集里的目标列

In [7]:
columns = [col for col in train_df.columns if col not in ['Value']]
X_train = train_df[columns][1:]
y_train = train_df['Value'][1:]
X_test = test_df[columns][1:]
print(X_train.shape, y_train.shape, X_test.shape)

(799, 5953) (799,) (299, 5953)


#### 拼接数据集, 为数据预处理做准备

In [8]:
total_set = pd.concat([X_train, X_test], axis=0)
total_set.shape

(1098, 5953)

In [9]:
total_set.head()

Unnamed: 0,ID,TOOL,210X1,210X2,210X3,210X4,210X5,210X6,210X7,210X8,...,750X1440,750X1441,750X1442,750X1444,750X1445,750X1447,750X1448,750X1450,750X1451,750X1452
1,NH0996,O,100.5,0.583,0.14,1.752,48.0,1.572,-0.53,0.9,...,1.5,209.7,2400000000000,0.00081,0.00081,0.00081,0.00081,0.00081,0.00081,2400000000000
2,NH0997,O,100.5,0.859,0.14,1.848,48.07,1.824,-0.7,0.58,...,1.5,209.8,2400000000000,0.00081,0.00081,0.00081,0.00081,0.00081,0.00081,2400000000000
3,NH0998,O,100.45,1.639,0.14,2.021,48.03,2.331,-1.08,1.16,...,1.5,209.8,2400000000000,0.00081,0.00081,0.00081,0.00081,0.00081,0.00081,2400000000000
4,NH0999,N,89.9,0.397,0.24,1.684,49.46,1.13,3.24,-0.76,...,1.5,213.3,2400000000000,0.00094,0.00094,0.00094,0.00094,0.00094,0.00094,2400000000000
5,NH1000,L,100.4,0.501,0.25,1.471,51.2,1.034,-2.29,-0.05,...,1.5,213.2,2400000000000,0.00094,0.00094,0.00094,0.00094,0.00094,0.00094,2400000000000


#### 查看空值

In [None]:
nan_cols = total_set.isnull().sum()
nan_cols = nan_cols[nan_cols > 0]
nan_cols.sort_values(ascending=False).head(10)

344X59           1097
ERROR:#N/A_28    1097
344X340          1097
344X247          1097
344X343          1097
344X238          1097
344X223          1097
344X217          1097
344X346          1097
344X206          1097
dtype: int64