In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

import seaborn as sns

In [60]:
# 每10分钟采样一个点，输出DataFrame
class RandomForestModel:
    def __init__(self, data_path_1, data_path_2, 
                 interval_minutes=10,
                 pic_save_dir = r'E:\OtherProfit\Program\PersonalProfit\RandomForest\PicSaveDir'
                 ):
        # 两个数据文件的路径
        self.data_path_1  = data_path_1
        self.data_path_2 = data_path_2
        
        # 采样时间间隔（分钟数）
        self.interval_minutes = interval_minutes
        
        # DataFrame
        self.data_pd_1 = self.__load_pd(self.data_path_1)
        self.data_pd_2 = self.__load_pd(self.data_path_2)
        
        # 图片保存路径
        self.pic_save_dir = pic_save_dir
    
    #  每10分钟采样一个点，输出DataFrame
    def __load_pd(self, file_path):
        data_pd = pd.read_csv(file_path)
        threshold = 0
        index_pos = 0
        res_pd = pd.DataFrame()
        while index_pos < data_pd.shape[0]:
            if float(data_pd.loc[index_pos, 'Time'])*60 > threshold:
                threshold += self.interval_minutes
                add_pd = data_pd.loc[index_pos, :]
                res_pd = pd.concat([res_pd, pd.DataFrame(add_pd).T],axis=0,sort=False)
            index_pos += 1
        res_pd = res_pd.sort_index(axis=0,ascending=True)
        res_pd.index = [i for i in range(res_pd.shape[0])]
        return res_pd
    
    # 绘制曲线图
    def plot_voltage(self):
        plt.plot(self.data_pd_1['Time'],self.data_pd_1['Utot'],label=r'FC 1',color='b',linewidth='1')
        plt.plot(self.data_pd_2['Time'],self.data_pd_2['Utot'],label=r'FC 2',color='r',linewidth='1')
        plt.title(r'Total voltage')
        plt.xlabel('Time(h)')
        plt.ylabel(r'Voltage(V)')
        plt.legend()
        plt.savefig(self.pic_save_dir + r'\\' + r'Total voltage' + '.png')
        plt.show()
        
    # 特征提取,基于变异系数删除特征
    def del_feature_cv(self, data_pd):
        res_pd = data_pd.copy()
        for feature in data_pd.columns:
            if modeling.data_pd_1[feature].std()/modeling.data_pd_1[feature].mean() > 0.15:
                res_pd = res_pd.drop([feature], axis=1)
        return res_pd
    
    # 生成相关系数热力图
    def plot_heat_map(self):
        
        pass
    
    # 划分训练集与测试集合
    def train_test_split(self, data_pd):
        train_validate_pd = data_pd.loc[:round(data_pd.shape[0]*0.7),:]
        test_pd = data_pd.loc[round(data_pd.shape[0]*0.7):,:]
        return  train_validate_pd, test_pd
    
    # 输入DataFrame,输出x,y
    def x_y_split(self, data_pd):
        tmp_pd = data_pd.copy()
        y = tmp_pd['Utot']
        x = tmp_pd.drop(['Utot'],axis=1)
        return x, y
    
    
    # 数据处理，利用lasso回归的l1正则化来选取特征
    def select_feature_lasso(self, data_pd):
        x,y = self.x_y_split(data_pd)
        
        reg = Lasso()
        model = SelectFromModel(reg)
        x_new = model.fit_transform(x,y)
        print(x_new.shape)
        return x_new
        
    # pipeline
    
    

In [61]:
data_path_1 = r'E:\OtherProfit\Program\PersonalProfit\RandomForest\DataSource\fc1all_data.csv'
data_path_2 = r'E:\OtherProfit\Program\PersonalProfit\RandomForest\DataSource\fc2all_data.csv'
modeling = RandomForestModel(data_path_1=data_path_1, data_path_2=data_path_2, interval_minutes=10)

In [62]:
# 绘制总电压
#modeling.plot_voltage()

In [63]:
train_validate_pd, test_pd = modeling.train_test_split(modeling.data_pd_1)

In [64]:
x_new = modeling.select_feature_lasso(train_validate_pd)

(4849, 1)


In [65]:
x_new


array([[1.56389000e-04],
       [1.66725000e-01],
       [3.33412222e-01],
       ...,
       [8.07674478e+02],
       [8.07833882e+02],
       [8.08001678e+02]])