In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings raised by later cells.
warnings.filterwarnings('ignore')
# Use the SimHei font so Chinese characters render in figures.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign rendering correctly on axes with the font set above.
from matplotlib import rcParams
rcParams['axes.unicode_minus']=False

ModuleNotFoundError: No module named 'sklearn.model_selection'; 'sklearn' is not a package

In [None]:
# Regression models.
# NOTE(review): SVR is imported but not used in any cell visible here.
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# Load the raw CSV export and drop the timestamp column.
data = pd.read_csv('JN_Data2.csv')
# print(data.shape)   # shape recorded by the original author: (103690, 68)
data = data.drop('Date/Time:', axis=1)

# Keep only the coke-oven-gas (COG) columns, i.e. those whose name starts with "COG".
# A plain prefix test replaces the former regex '^(COG)': that pattern's capture
# group made pandas emit a "match groups" warning, and the lambda wrapped around
# it ignored its own argument and read the global frame instead.
data = data.loc[:, data.columns.str.startswith('COG')]
data = data.drop('COG_5M3', axis=1)
print(data.shape)   # shape recorded by the original author: (103690, 19)

In [None]:
# Remove empty columns and the rows whose target (COG_8M3) is missing.
# print(data.info())
data = data.dropna(axis=1, how="all")   # drop columns that contain only missing values
data = data.drop('COG_H_U1', axis=1)
# print(data['COG_8M3'].isnull().value_counts())
# Drop rows with a missing target by value. The previous approach filled the
# gaps with the string sentinel '999' and then removed a hard-coded list of row
# labels (51840-51849, apparently copied from a lookup of the sentinel rows).
# Any missing row outside that list would have silently become a target of
# 999.0, and re-running the cell raised KeyError because the labels were gone.
data = data.dropna(subset=['COG_8M3'])
# print(data.shape)   # shape recorded by the original author: (103680, 16)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (which would add a stray "None" line).
data.info()


In [None]:

# Convert the listed columns to a numeric dtype; any value that cannot be
# parsed as a number becomes NaN (errors='coerce').
numeric_columns = [
    "COG_CC1", "COG_CC2", "COG_CC3",
    "COG_8M3",
    "COG_I_U1", "COG_I_U2", "COG_I_U3", "COG_I_U4", "COG_I_U5",
    "COG_H_U2", "COG_H_U3", "COG_H_U5", "COG_H_U6",
    "COG_O_U1",
    "COG_E_U1",
]
for column_name in numeric_columns:
    data[column_name] = pd.to_numeric(data[column_name], errors='coerce')


In [None]:
# Print the per-column dtype and non-null count of the current frame.
data.info()

In [None]:
# Separate the frame into the feature matrix (every column except the target)
# and the target series, reporting the shape of each.
target_column = 'COG_8M3'
is_feature = data.columns != target_column
x = data.loc[:, is_feature]
print('x.shape:', x.shape)
y = data[target_column]
print('y.shape:', y.shape)

In [None]:
# Fill missing feature values with the mean of their own column.
# NOTE(review): the means are computed over all rows before the train/test
# split, so test-set statistics leak into the training features. Consider
# imputing after the split using training means only.
x = x.fillna(x.mean())


In [None]:

# Sanity check before modelling: feature dtypes / non-null counts and the
# number of missing target values.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
x.info()
print(y.isnull().value_counts())


In [None]:
# One 30-bin histogram per feature column, drawn on a 12x10 inch figure.
x.hist(bins=30, edgecolor="black", figsize=(12, 10))
# Set the horizontal / vertical spacing between the subplots.
plt.subplots_adjust(wspace=0.4, hspace=0.7)

In [None]:
# Show descriptive statistics for the feature columns.
x.describe()

In [None]:

# Hold out 20% of the rows as a test set; random_state=1 makes the split repeatable.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 1)


In [None]:
# Fit an ordinary linear regression and predict the held-out test set.
lr = LinearRegression()
lr.fit(x_train, y_train)
y_hat = lr.predict(x_test)

# Plot true vs. predicted target for the test rows. The x axis is only the
# position of each row within the test set.
t = np.arange(len(x_test))
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(t, y_test, 'r', linewidth=2, label='true value')
ax.plot(t, y_hat, 'g', linewidth=2, label='predict value')
# Title and axis labels let the figure stand alone and distinguish it from the
# otherwise identical plots of the other models.
ax.set(
    title='LinearRegression: true vs. predicted COG_8M3 (test set)',
    xlabel='test sample index',
    ylabel='COG_8M3',
)
ax.legend()
plt.show()

In [None]:
# Fit a k-nearest-neighbours regressor (default hyper-parameters) and predict
# the held-out test set.
# NOTE(review): the features are passed in unscaled; KNN is distance based, so
# consider standardising them first.
knn = KNeighborsRegressor()
knn.fit(x_train, y_train)
y_hat = knn.predict(x_test)

# Plot true vs. predicted target for the test rows. The x axis is only the
# position of each row within the test set.
t = np.arange(len(x_test))
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(t, y_test, 'r', linewidth=2, label='true value')
ax.plot(t, y_hat, 'g', linewidth=2, label='predict value')
# Title and axis labels let the figure stand alone and distinguish it from the
# otherwise identical plots of the other models.
ax.set(
    title='KNeighborsRegressor: true vs. predicted COG_8M3 (test set)',
    xlabel='test sample index',
    ylabel='COG_8M3',
)
ax.legend()
plt.show()

In [None]:

# Candidate regressors to compare. The tree-based / boosted models are
# stochastic, so they get a fixed seed (the same value, 1, that this notebook
# passes to train_test_split) to make the comparison reproducible.
models = [
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=1),
    RandomForestRegressor(random_state=1),
    AdaBoostRegressor(random_state=1),
]
models_str = ['LinearRegression', 'KNNRegressor', 'DecisionTree', 'RandomForest', 'AdaBoost']
# Score of each model on the training set (estimator.score, i.e. R^2 for regressors).
score_1 = []
# Score of each model on the test set.
score_2 = []

for name, model in zip(models_str, models):
    print('开始训练模型：'+name)
    model.fit(x_train, y_train)
    # Store rounded floats. The previous str(score)[:5] truncated instead of
    # rounding and mangled values printed in scientific notation (e.g. '9.5e-').
    score_1.append(round(float(model.score(x_train, y_train)), 3))
    score_2.append(round(float(model.score(x_test, y_test)), 3))


In [None]:


# Tabulate each model's name next to its training and test score.
result = pd.DataFrame(
    {
        'model': models_str,
        'score_trian': score_1,
        'score_test': score_2,
    }
)
print(result)

