# Topic : Lecture 3 Multi-Linear regression
<img src="https://www.tribloom.com/wp-content/uploads/2019/08/CRISP-DM_Process_Diagram-768x769.png" height=300>

Follow the CRSIP-DM method
1. Step 1: Import library, import data
2. Step 2: Pre-processing (missing data, categorical type, normalization, format transform)
3. Step 3: Build ML Model
4. Step 4: Evaluate Model
5. Step 5: Deploy (Prediction)


In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 1: Load data (also import library)

In [11]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import data
data=pd.read_csv("dataset_50_Startups.csv")
print(data.head()) # show first 5 items
print(type(data))
print(data.info())




   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB
None


# Step 2: Pre-process X, Y
* 資料型別的 transformation) format transform (轉換成numpy format)
* (missing data=> imputation, 
* normalization
* data type 例如 categorical data onehot encoding, Label_Encoding, padas 套件有一個簡單的 get_dummies 的API



In [12]:
#1. (missing data=> imputation,
# no missing data

#2. normalization
# Skip sklearn linear model 會幫我們做

#3. nominal data 的轉換 ==> pandas 下面 pd.getdummies
#==> X 5個特徵 1Y

X=data.iloc[:,:-1]
Y=data.iloc[:,-1]
print(X.columns)
X.info()
X=pd.get_dummies(X)
print(X.columns)
X.info()
X=X.values.reshape(-1,6)
Y=Y.values.reshape(-1,1)
#4. 資料型別的 transformation) format transform (轉換成numpy format)


X=X[:,:-1] 
print(type(X),X.shape)
print(type(Y),Y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
print(type(X_train),X_train.shape)
print(type(Y_train),Y_train.shape)

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
dtypes: float64(3), object(1)
memory usage: 1.7+ KB
Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State_California',
       'State_Florida', 'State_New York'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   R&D Spend         50 non-null     float64
 1   Administration    50 non-null     float64
 2   Marketing Spend   50 non-null     float64
 3   Sta

# Step 3: Build Model for training

In [13]:
from sklearn.linear_model import LinearRegression as LR
model=LR()  # constructor
#all in policy
model.fit(X_train,Y_train) # training==> find a* and b*

print(model) 
print("a*=",model.coef_,"b*=",model.intercept_, )


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
a*= [[ 7.73467193e-01  3.28845975e-02  3.66100259e-02 -6.99369053e+02
  -1.65865321e+03]] b*= [43253.53667068]


# Step 4: Evalute Model

In [14]:
from sklearn.metrics import r2_score as R2
from sklearn.metrics import mean_squared_error as MSE
yPre=model.predict(X_train)
print("MSE=",MSE(Y_train,yPre))
print("R2=",R2(Y_train,yPre))


#backward selection 
#檢查p-values select 重要的特徵
import statsmodels.api as sm
X_train = np.append(arr = np.ones((40, 1)).astype(int), values = X_train, axis = 1)
X_opt = X_train [:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("===================================================")
print('0-5', regressor_OLS.summary())
print("====================================================")


X_opt = X_train [:, [0, 1, 2, 3, 5]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("====================================================")
print('01345',regressor_OLS.summary())
print("====================================================")

X_opt = X_train [:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("====================================================")
print('0345',regressor_OLS.summary())
print("====================================================")


X_opt = X_train [:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("====================================================")
print('035',regressor_OLS.summary())
print("====================================================")


X_opt = X_train [:, [0, 3]]
regressor_OLS = sm.OLS(endog = Y_train, exog = X_opt).fit()
print("====================================================")
print('03',regressor_OLS.summary())
print("====================================================")




MSE= 81571001.8007737
R2= 0.9501847627493607
0-5                             OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.943
Method:                 Least Squares   F-statistic:                     129.7
Date:                Tue, 12 Oct 2021   Prob (F-statistic):           3.91e-21
Time:                        10:40:01   Log-Likelihood:                -421.10
No. Observations:                  40   AIC:                             854.2
Df Residuals:                      34   BIC:                             864.3
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
con

# Step 5: Deploy Model to predict new value

In [15]:
data.to_csv("result.csv", index=False, mode='w')