# Multiple Linear Regression

## Import packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Print NumPy arrays with 6 digits of precision in fixed-point notation
# (suppress=True turns off scientific notation for small values).
np.set_printoptions(precision=6, suppress=True)

## 데이터셋 로딩
- Modified Uber dataset
1. fare_amount: 우버 요금 지불 금액 (종속 변수, 목표 값)
2. pickup_x: 승객 탑승 x 좌표
3. pickup_y: 승객 탑승 y 좌표
4. dropoff_x: 승객 하차 x 좌표
5. dropoff_y: 승객 하차 y 좌표
6. passenger_count: 탑승 승객 수

In [3]:
# Download dataset file
# !wget "https://dongaackr-my.sharepoint.com/:x:/g/personal/sjkim_donga_ac_kr/EYMXYk25h2VBtPXtu3QaBqoBJK4cK-TI9mamHoFGJRYn5Q?e=8x2MTn&download=1" -q -O modified_uber.csv

# Read the CSV, discard the leftover index / key / timestamp columns,
# then remove every row that contains a missing value.
data = (
    pd.read_csv('modified_uber.csv')
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'key', 'pickup_datetime'])
    .dropna()
)
data

Unnamed: 0,fare_amount,pickup_x,pickup_y,dropoff_x,dropoff_y,passenger_count
0,7.5,26.801401,31.975242,82.753859,78.351728,1
1,7.7,76.052494,88.508509,116.105293,92.327957,1
2,12.9,15.267869,43.955819,65.431587,61.653900,1
3,5.3,20.332390,62.963745,32.219453,119.648762,3
4,16.0,55.287436,0.890515,128.745703,53.748090,5
...,...,...,...,...,...,...
199994,3.0,84.707837,88.315857,119.137618,110.819151,1
199995,7.5,89.307465,90.446660,148.777837,101.245691,1
199996,30.9,11.018380,10.443077,45.211427,35.886442,2
199997,14.5,55.104242,33.397616,96.482991,103.718360,1


## L1, L2 이동거리 계산 및 열 추가

**L1 Distance (맨해튼 거리):**

L1_distance = |pickup_x - dropoff_x| + |pickup_y - dropoff_y|

**L2 Distance (유클리드 거리):**

L2_distance = √((pickup_x - dropoff_x)² + (pickup_y - dropoff_y)²)

- 절댓값 계산 함수: np.abs()
- 제곱근 (square root) 계산 함수: np.sqrt()

In [9]:


# Per-axis displacement between the pickup and drop-off points
delta_x = data['pickup_x'] - data['dropoff_x']
delta_y = data['pickup_y'] - data['dropoff_y']

# L1 (Manhattan) distance: sum of the absolute per-axis displacements
data['L1_distance'] = np.abs(delta_x) + np.abs(delta_y)
# L2 (Euclidean) distance: square root of the summed squared displacements
data['L2_distance'] = np.sqrt(delta_x**2 + delta_y**2)


data # Check that the L1/L2_distance columns were computed and added correctly

Unnamed: 0,fare_amount,pickup_x,pickup_y,dropoff_x,dropoff_y,passenger_count,L1_distance,L2_distance
0,7.5,26.801401,31.975242,82.753859,78.351728,1,102.328943,72.673626
1,7.7,76.052494,88.508509,116.105293,92.327957,1,43.872247,40.234499
2,12.9,15.267869,43.955819,65.431587,61.653900,1,67.861798,53.194179
3,5.3,20.332390,62.963745,32.219453,119.648762,3,68.572080,57.917988
4,16.0,55.287436,0.890515,128.745703,53.748090,5,126.315842,90.498841
...,...,...,...,...,...,...,...,...
199994,3.0,84.707837,88.315857,119.137618,110.819151,1,56.933076,41.131595
199995,7.5,89.307465,90.446660,148.777837,101.245691,1,70.269403,60.442901
199996,30.9,11.018380,10.443077,45.211427,35.886442,2,59.636412,42.620761
199997,14.5,55.104242,33.397616,96.482991,103.718360,1,111.699492,81.591714


## 데이터셋 전처리 (가우시안 정규화)

In [10]:
# Make sure no rows with missing values remain before computing statistics
data = data.dropna()

# Gaussian (z-score) normalization: for every column, subtract its mean and
# divide by its standard deviation. The target column is normalized as well.
data_normalized = data.sub(data.mean(), axis='columns').div(data.std(), axis='columns')
data_normalized

Unnamed: 0,fare_amount,pickup_x,pickup_y,dropoff_x,dropoff_y,passenger_count,L1_distance,L2_distance
0,-0.389819,-0.802319,-0.625718,-0.022205,-0.144155,-0.493900,1.116659,0.947648
1,-0.369620,0.902317,1.331358,0.909427,0.246986,-0.493900,-0.732034,-0.489573
2,0.155539,-1.201508,-0.210973,-0.506083,-0.611465,-0.493900,0.026636,0.084608
3,-0.612001,-1.026219,0.447046,-1.433824,1.011592,0.949106,0.049098,0.293897
4,0.468615,0.183615,-1.701813,1.262522,-0.832718,2.392113,1.875245,1.737397
...,...,...,...,...,...,...,...,...
199994,-0.844283,1.201889,1.324689,0.994132,0.764485,-0.493900,-0.318985,-0.449827
199995,-0.389819,1.361087,1.398454,1.822096,0.496560,-0.493900,0.102776,0.405764
199996,1.973398,-1.348587,-1.371121,-1.070909,-1.332598,0.227603,-0.233492,-0.383849
199997,0.317126,0.177275,-0.576478,0.361302,0.565761,-0.493900,1.413003,1.342766


## 택시요금 예측에 사용할 데이터 지정

In [11]:

# Convert the normalized columns used for prediction into 2-D arrays
X = np.array(data_normalized.loc[:, ['L1_distance', 'L2_distance', 'passenger_count']])  # input features
Y = np.array(data_normalized.loc[:, ['fare_amount']])  # target, kept 2-D as a single column

# Split into train / test datasets (80% / 20%, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1234,
)

# Check the shapes of the train dataset
print(X_train.shape, Y_train.shape, sep='\n')

(159999, 3)
(159999, 1)


## Least Square Method 기반 선형 회귀 모델 작성

- Least Square Method:
$$\theta = (X^T \cdot X)^{-1} \cdot (X^T \cdot Y)$$


In [12]:
class LinearRegression_LSM():
  """Multiple linear regression fitted in closed form by least squares.

  A constant column of ones is appended to the inputs, so the last entry of
  ``theta`` acts as the bias (intercept) and the preceding entries are the
  weights of the input columns, in order.

  Attributes:
    theta: Fitted parameters stacked as [weights..., bias]; ``None`` until
      ``fit`` has been called.
  """

  def __init__(self):
    self.theta = None

  @staticmethod
  def _append_bias(X):
    """Return X with a trailing column of ones: N x D -> N x (D + 1)."""
    X = np.asarray(X)
    N = X.shape[0]  # N = number of input samples
    bias = np.ones((N, 1))  # N x 1
    return np.hstack([X, bias])  # N x (D + 1)

  def fit(self, X, Y):
    """Estimate theta by minimizing ||X_b . theta - Y||^2.

    Args:
      X: 2-D array-like of shape (N, D) holding the input features.
      Y: Array-like of shape (N, K) or (N,) holding the targets.

    Returns:
      The fitted theta, of shape (D + 1, K) for 2-D Y or (D + 1,) for 1-D Y.
      The same array is stored in ``self.theta``.
    """
    X = self._append_bias(X)

    # Solve the least-squares problem directly instead of forming
    # (X^T X)^-1 explicitly. When X^T X is invertible this yields the same
    # theta as the normal equation, but it also works (returning the
    # minimum-norm solution) when columns are collinear and X^T X is
    # singular, where np.linalg.inv would raise or lose precision.
    self.theta, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)

    return self.theta

  def predict(self, X):
    """Return predictions X_b . theta for the given inputs.

    Args:
      X: 2-D array-like of shape (N, D), with the same D used in ``fit``.

    Returns:
      Array of predictions with N rows.

    Raises:
      RuntimeError: If called before ``fit``.
    """
    if self.theta is None:
      raise RuntimeError("LinearRegression_LSM.predict() called before fit()")

    Y_hat = np.dot(self._append_bias(X), self.theta)
    return Y_hat

## X_train, Y_train 데이터를 이용한 linear regression 수행 (학습)

In [13]:
# Fit the least-squares regression model on the training split.
model_LSM = LinearRegression_LSM()
theta = model_LSM.fit(X_train, Y_train)


# Rows of theta follow the input column order with the bias appended last:
# [L1_distance, L2_distance, passenger_count, bias].
print(f"theta = {theta}")

theta = [[-0.638591]
 [ 1.126015]
 [ 0.009859]
 [ 0.000558]]


## X_test, Y_test 데이터를 이용한 linear regression 성능 검증 (테스트)

$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (Y_i - \hat{Y}_i)^2
$$

In [14]:
def MSE(Y, Y_hat):
  """Return the mean squared error between targets and predictions.

  Args:
    Y: Array-like of ground-truth values.
    Y_hat: Array-like of predicted values.

  Returns:
    The mean of the squared element-wise differences, as a scalar.
  """
  Y = np.asarray(Y)
  Y_hat = np.asarray(Y_hat)

  # An (N, 1) array minus an (N,) array silently broadcasts to an (N, N)
  # matrix and yields a wrong error. When both inputs hold the same number
  # of elements but differ in shape, compare them element by element.
  if Y.shape != Y_hat.shape and Y.size == Y_hat.size:
    Y = Y.reshape(-1)
    Y_hat = Y_hat.reshape(-1)

  error = Y - Y_hat
  mse = np.mean(error ** 2)
  return mse

In [15]:
# Predict on the held-out test split and report the mean squared error.
# Y_test was taken from data_normalized, so this error is expressed in
# normalized (z-score) units of fare_amount, not in the original fare units.
Y_hat = model_LSM.predict(X_test)
print(MSE(Y_test, Y_hat))

0.7244597712652048


## 임의 데이터 X 입력 시 Y_hat 예측

**가우시안 정규화:**

$$
x' = \frac{x - \mu}{\sigma}
$$

**가우시안 역정규화:**

$$
x = x' \cdot \sigma + \mu
$$

In [17]:
# Column means / standard deviations of the un-normalized data, needed to
# normalize new inputs and to de-normalize the prediction
mean_array = data.mean()
std_array = data.std()

mean_L1, mean_L2, mean_passenger, mean_fare = (
    mean_array[column]
    for column in ('L1_distance', 'L2_distance', 'passenger_count', 'fare_amount')
)
std_L1, std_L2, std_passenger, std_fare = (
    std_array[column]
    for column in ('L1_distance', 'L2_distance', 'passenger_count', 'fare_amount')
)

# Arbitrary sample input X
L1_distance, L2_distance, passenger_count = 250, 197, 2

# Normalize each input variable with the statistics of its own column
L1_distance_norm = (L1_distance - mean_L1) / std_L1
L2_distance_norm = (L2_distance - mean_L2) / std_L2
passenger_count_norm = (passenger_count - mean_passenger) / std_passenger

# A single sample as a 1 x 3 matrix, in the same column order used for training
X_new = np.array([[L1_distance_norm, L2_distance_norm, passenger_count_norm]])

# Predict with the learned theta, then de-normalize the output back to the
# original fare_amount scale: x = x' * sigma + mu
Y_hat = model_LSM.predict(X_new) * std_fare + mean_fare

print(f"Y_hat = {Y_hat}")

Y_hat = [[46.777742]]
