In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
df = pd.read_csv('data.csv')
data = df.values
data

array([[  0.  ,   0.  ,   0.  , ...,  56.64,  85.57,   0.75],
       [  0.  ,   0.  ,   0.  , ...,  44.62,  45.28,   0.85],
       [  0.  ,   0.  ,   0.  , ...,  65.02,  82.69,   0.55],
       ...,
       [  0.  ,   0.  ,   0.  , ...,  61.73,  70.54,   0.65],
       [  0.  ,   0.  ,   0.  , ...,  86.54,  86.67,   0.5 ],
       [  1.  ,   0.  ,   1.  , ...,  38.02, 100.  ,   0.85]])

In [4]:
X = data[:,:-1]
y = data[:,-1]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

## SKlearn LinearRegression

In [6]:
reg = LinearRegression()

In [7]:
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
reg.coef_

array([-0.17784169,  0.00347086,  0.51373933, -0.00061146, -0.03593283,
       -0.03926314, -0.01683945, -0.40472057,  0.00196065,  0.03588212,
        0.00227268,  0.02624547,  0.01475503,  0.00276604, -0.00092719,
       -0.00052475])

In [9]:
reg.intercept_

0.7388333608706937

In [10]:
reg.score(X_test,y_test)

0.33241635109724876

## Pipeline

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
reg1 = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
reg1.fit(X_train, y_train)
reg1.score(X_test, y_test)

0.332416351097249

In [13]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
reg4 = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor(n_estimators=500, min_samples_split=2))])
reg4.fit(X_train, y_train)
reg4.score(X_test, y_test)

CPU times: user 56.3 s, sys: 228 ms, total: 56.5 s
Wall time: 56.5 s


0.8923319207118063

In [15]:
list_temp = zip(reg4['RFR'].feature_importances_,  df.columns.values)
list_temp = sorted(list_temp, reverse=True)
print("'{}'       相关参数 ".format(y))
print('{:15s}  \t {:s}'.format('属性名称', '重要程度'))
print('--------------------------------' )
for value, name in list_temp:
    print('{:15s}  \t: {:.4f}'.format(name, value))

'[0.75 0.85 0.55 ... 0.65 0.5  0.85]'       相关参数 
属性名称             	 重要程度
--------------------------------
压后厚度mm           	: 0.1758
工具级别             	: 0.1543
板材类型             	: 0.1408
压合次数             	: 0.0983
芯板厚度mm           	: 0.0904
芯板残铜率%_0         	: 0.0746
芯板残铜率%_1         	: 0.0699
压合程序号            	: 0.0627
芯板叠层结构           	: 0.0324
预排工艺             	: 0.0264
下邻P片数            	: 0.0190
上邻P片数            	: 0.0169
缓冲材料使用栏          	: 0.0138
芯板铜厚OZ_0         	: 0.0115
芯板铜厚OZ_1         	: 0.0110
工具类型             	: 0.0022
