# Linear Regression - Breast Cancer

參考範例：https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

## Load Dataset
區分 input（特徵 X）和 output（標籤 y）

In [2]:
# Load the BreastCancer dataset from a CSV file in the working directory.
df = pd.read_csv('BreastCancer.csv')

# Label (y): the 'diagnosis' column.
breastCancer_y = df['diagnosis'].values

# Features (X): every column except the label. The label is excluded by name
# rather than by position (`iloc[:, 1:]`), so it cannot leak into X if the
# column order of the CSV ever changes.
breastCancer_X = df.drop(columns='diagnosis').values

# Show the loaded frame as the cell output.
df

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,0,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,0,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,0,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,0,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,0,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,0,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,0,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


## Train the LR model & Do predict
其實train的部分可以一行搞定，測試集就直接用訓練集。

In [3]:
# Build the linear regression estimator and fit it on the full dataset in one
# chained call; `fit` returns the fitted estimator, which is bound to `regr`.
regr = linear_model.LinearRegression().fit(breastCancer_X, breastCancer_y)

# Echo the fitted estimator as the cell output.
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [4]:
# No separate test split exists in this notebook, so predictions are made on
# the same samples the model was fitted on.
breastCancer_y_pred = regr.predict(X=breastCancer_X)

## Reveal Results
這邊的結果有Coefficients各項係數(30個)，Intercept截距，Mean squared error均方誤差（預測值與實際值差距平方的平均），以及Score決定係數 $R^2$（1 代表完美預測）。
- np array scientific notation to float https://docs.scipy.org/doc/numpy/reference/generated/numpy.set_printoptions.html

In [5]:
# Print NumPy arrays with 4 decimals and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

# Compute the metrics up front so the print statements below stay short.
# Both are evaluated on the training data because there is no test split.
train_mse = mean_squared_error(breastCancer_y, breastCancer_y_pred)
train_r2 = regr.score(breastCancer_X, breastCancer_y)

# Per-feature coefficients (weights) of the fitted model.
print('Coefficients: \n', regr.coef_)
# Intercept of the fitted model.
print('Intercept: ', regr.intercept_)
# Mean squared error between the labels and the predictions.
print('Mean squared error: %.2f' % train_mse)
# Coefficient of determination: 1 is perfect prediction.
print('Score: %.2f' % train_r2)

Coefficients: 
 [  0.2178  -0.0045  -0.0237  -0.0003  -0.0847   4.222   -1.398   -2.1418
  -0.1027  -0.0333  -0.435    0.0068   0.0225   0.0009 -15.8543  -0.0649
   3.5655 -10.568   -1.6973   7.1464  -0.1952  -0.0072   0.0024   0.001
  -0.5429  -0.0672  -0.3812  -0.4643  -0.5568  -4.3035]
Intercept:  3.021811738437393
Mean squared error: 0.05
Score: 0.77
