In [1]:
import pandas as pd
import numpy as np

## Step1: 数据预处理

### 1.1  导入原始数据

In [2]:
# Load the raw startup data; the path is relative to the notebook's directory.
dataset = pd.read_csv('../dataset/50_Startups.csv')
dataset.head()

# X: every column except the last one (features).
# Y: the column at position 4 (shown as "Profit" in the preview above).
X, Y = dataset.iloc[:, :-1].values, dataset.iloc[:, 4].values

# Peek at the first five rows of each array.
X[:5]
Y[:5]

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida']], dtype=object)

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94])

### 1.2  检查缺失值
使用均值代替缺失值

In [3]:
from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. `SimpleImputer` is its replacement: it always imputes
# column-wise (the old `axis=0`) and takes the missing-value marker as
# `np.nan` rather than the string "NaN".
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# State of X before imputation.
X[:5]

# Only the first three columns are imputed; column 3 holds strings.
# X has dtype=object, so cast the numeric slice to float before taking means.
X[:, :3] = imputer.fit_transform(X[:, :3].astype(float))

# State of X after imputation.
X[:5]



array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida']], dtype=object)

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida']], dtype=object)

### 1.3 类别标签数字化
  - 'California': 0 -> [1, 0, 0] -> [0, 0]
  - 'Florida': 1 -> [0, 1, 0] -> [1, 0]
  - 'New York': 2 -> [0, 0, 1] -> [0, 1]

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Step 1: map the category strings in column 3 to integer codes
# (LabelEncoder assigns codes in sorted order of the labels).
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
X[:5]

# Step 2: one-hot encode column 3.
# `OneHotEncoder(categorical_features=[3])` was deprecated in scikit-learn 0.20
# and removed in 0.22; selecting columns is now done by ColumnTransformer.
# - The encoded columns come first and the untouched columns follow
#   (remainder="passthrough"), the same layout the old API produced.
# - sparse_threshold=0 forces a dense result, replacing the old `.toarray()`.
# - categories="auto" avoids the legacy integer-handling warning on 0.20/0.21.
# `onehotencoder` still accepts the full feature matrix, as before.
onehotencoder = ColumnTransformer(
    transformers=[("state", OneHotEncoder(categories="auto"), [3])],
    remainder="passthrough",
    sparse_threshold=0,
)
# X is an object array, so the stacked result may be object too; cast to float
# to get the same numeric matrix the original cell produced.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)
X[:5]

array([[165349.2, 136897.8, 471784.1, 2],
       [162597.7, 151377.59, 443898.53, 0],
       [153441.51, 101145.55, 407934.54, 1],
       [144372.41, 118671.85, 383199.62, 2],
       [142107.34, 91391.77, 366168.42, 1]], dtype=object)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05]])

#### 避免虚拟变量陷阱

三个州的独热编码列之和恒为 1，与回归的截距项完全共线，因此需要删除其中一列（这里删除第一列）。

In [5]:
# Remove column 0 (the first one-hot column) and keep every remaining column.
X = np.delete(X, 0, axis=1)
X[:5]

array([[0.0000000e+00, 1.0000000e+00, 1.6534920e+05, 1.3689780e+05,
        4.7178410e+05],
       [0.0000000e+00, 0.0000000e+00, 1.6259770e+05, 1.5137759e+05,
        4.4389853e+05],
       [1.0000000e+00, 0.0000000e+00, 1.5344151e+05, 1.0114555e+05,
        4.0793454e+05],
       [0.0000000e+00, 1.0000000e+00, 1.4437241e+05, 1.1867185e+05,
        3.8319962e+05],
       [1.0000000e+00, 0.0000000e+00, 1.4210734e+05, 9.1391770e+04,
        3.6616842e+05]])

### 1.4 拆分数据集

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Preview the first five rows of each of the four pieces.
(X_train[:5], X_test[:5], Y_train[:5], Y_test[:5])

(array([[1.0000000e+00, 0.0000000e+00, 5.5493950e+04, 1.0305749e+05,
         2.1463481e+05],
        [0.0000000e+00, 1.0000000e+00, 4.6014020e+04, 8.5047440e+04,
         2.0551764e+05],
        [1.0000000e+00, 0.0000000e+00, 7.5328870e+04, 1.4413598e+05,
         1.3405007e+05],
        [0.0000000e+00, 0.0000000e+00, 4.6426070e+04, 1.5769392e+05,
         2.1079767e+05],
        [1.0000000e+00, 0.0000000e+00, 9.1749160e+04, 1.1417579e+05,
         2.9491957e+05]]),
 array([[1.0000000e+00, 0.0000000e+00, 6.6051520e+04, 1.8264556e+05,
         1.1814820e+05],
        [0.0000000e+00, 0.0000000e+00, 1.0067196e+05, 9.1790610e+04,
         2.4974455e+05],
        [1.0000000e+00, 0.0000000e+00, 1.0191308e+05, 1.1059411e+05,
         2.2916095e+05],
        [1.0000000e+00, 0.0000000e+00, 2.7892920e+04, 8.4710770e+04,
         1.6447071e+05],
        [1.0000000e+00, 0.0000000e+00, 1.5344151e+05, 1.0114555e+05,
         4.0793454e+05]]),
 array([ 96778.92,  96479.51, 105733.54,  96712.8 , 1242

## Step2 训练模型

###   多元线性回归

In [7]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
# `fit` returns the estimator itself, so `reg` and `regressor` are the same object.
regressor = LinearRegression()
reg = regressor.fit(X_train, Y_train)
reg

# `coef_` holds the fitted regression coefficients (one per feature column),
# not correlation coefficients, so the label says "回归系数".
"回归系数", reg.coef_

# NOTE: this R^2 is computed on the training data itself; it is not a
# held-out score.
"R^2", reg.score(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

('相关系数',
 array([-9.59284160e+02,  6.99369053e+02,  7.73467193e-01,  3.28845975e-02,
         3.66100259e-02]))

('R^2', 0.9501847627493607)

## Step3 结果预测

In [8]:
# First five rows of the test features that are fed to the model.
X_test[:5]

# Predicted targets for every row of the test split.
y_pred = regressor.predict(X_test)
y_pred

array([[1.0000000e+00, 0.0000000e+00, 6.6051520e+04, 1.8264556e+05,
        1.1814820e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0067196e+05, 9.1790610e+04,
        2.4974455e+05],
       [1.0000000e+00, 0.0000000e+00, 1.0191308e+05, 1.1059411e+05,
        2.2916095e+05],
       [1.0000000e+00, 0.0000000e+00, 2.7892920e+04, 8.4710770e+04,
        1.6447071e+05],
       [1.0000000e+00, 0.0000000e+00, 1.5344151e+05, 1.0114555e+05,
        4.0793454e+05]])

array([103015.20159796, 132582.27760816, 132447.73845175,  71976.09851259,
       178537.48221054, 116161.24230163,  67851.69209676,  98791.73374688,
       113969.43533012, 167921.0656955 ])

In [9]:
# Display the target values of the test split (produced by train_test_split),
# shown right after the predictions.
Y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])