In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the 'tips' example dataset through seaborn and preview the first rows
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(244, 7)

**Simple Linear Regression**

In [5]:
# Simple linear regression: one independent variable (feature)
# and one dependent variable (label).

# Estimator that will be trained on the data selected below
linreg = LinearRegression()

y = df['tip']                    # label
X = df.loc[:, ['total_bill']]    # single feature, kept 2-D as a DataFrame

In [26]:
# Peek at the feature as a NumPy array. Selecting with a list of columns
# (double brackets) already yields a 2-D (n_samples, 1) result, so no
# reshape(-1, 1) is needed; to_numpy() is the accessor pandas recommends
# over .values.
df[['total_bill']].to_numpy()

array([[16.99],
       [10.34],
       [21.01],
       [23.68],
       [24.59],
       [25.29],
       [ 8.77],
       [26.88],
       [15.04],
       [14.78],
       [10.27],
       [35.26],
       [15.42],
       [18.43],
       [14.83],
       [21.58],
       [10.33],
       [16.29],
       [16.97],
       [20.65],
       [17.92],
       [20.29],
       [15.77],
       [39.42],
       [19.82],
       [17.81],
       [13.37],
       [12.69],
       [21.7 ],
       [19.65],
       [ 9.55],
       [18.35],
       [15.06],
       [20.69],
       [17.78],
       [24.06],
       [16.31],
       [16.93],
       [18.69],
       [31.27],
       [16.04],
       [17.46],
       [13.94],
       [ 9.68],
       [30.4 ],
       [18.29],
       [22.23],
       [32.4 ],
       [28.55],
       [18.04],
       [12.54],
       [10.29],
       [34.81],
       [ 9.94],
       [25.56],
       [19.49],
       [38.01],
       [26.41],
       [11.24],
       [48.27],
       [20.29],
       [13.81],
       [

In [6]:
# Fit (train) the model: estimate the intercept and slope from X and y
linreg.fit(X, y)

LinearRegression()

In [10]:
# Create new total_bill data. Use a DataFrame with the same column name the
# model was fitted on, so scikit-learn can match features by name and does
# not warn about missing feature names.
X_new = pd.DataFrame({'total_bill': [30]})

# Predict the tip for the new total_bill value
y_pred = linreg.predict(X_new)
y_pred

array([4.07100514])

In [8]:
# Check model evaluation metrics (R2 -- R-squared)
# R2 represents how well the model describes the data.
# The closer R2 is to 1, the better the model.
linreg.score(X, y)

0.45661658635167657

In [12]:
# Fitted slope (beta1) for total_bill
linreg.coef_

array([0.10502452])

In [13]:
# Fitted y-intercept (beta0)
linreg.intercept_

0.9202696135546731

$$ \hat{y} = 0.92 + 0.105 \cdot x_1 $$

**Multiple Linear Regression**

In [14]:
# Multiple linear regression: two features, one label.

# Fresh estimator for the two-feature model
linreg = LinearRegression()

y = df['tip']                            # label
X = df.loc[:, ['total_bill', 'size']]    # features

In [15]:
# Fit the two-feature model (total_bill, size) against tip
linreg.fit(X, y)

LinearRegression()

In [16]:
# Fitted slopes beta1, beta2 for total_bill and size (column order of X)
linreg.coef_

array([0.09271334, 0.19259779])

In [17]:
# Fitted y-intercept (beta0) of the two-feature model
linreg.intercept_

0.6689447408125027

$$ \hat{y} = 0.669 + 0.093 \cdot x_1 + 0.193 \cdot x_2 $$

In [10]:
# New observation: total_bill = 30, size = 3. Built as a DataFrame with the
# same column names, in the same order, as the training features so the
# model can validate features by name instead of receiving a bare list.
X_new = pd.DataFrame({'total_bill': [30], 'size': [3]})

In [11]:
# Predicted tip for the new observation
linreg.predict(X_new)

array([4.02813823])

In [12]:
# R-squared of the two-feature model, computed on the X and y currently in scope
linreg.score(X, y)

0.46786930879612587

**3 Features**

In [13]:
# Count how many rows fall in each `time` value
df['time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [18]:
# Encode `time` as a binary indicator: 1 for 'Dinner', 0 otherwise.
# A vectorized comparison replaces the per-row lambda and always yields a
# plain integer column, whether `time` is stored as object or categorical.
df['is_dinner'] = (df['time'] == 'Dinner').astype(int)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,is_dinner
0,16.99,1.01,Female,No,Sun,Dinner,2,1
1,10.34,1.66,Male,No,Sun,Dinner,3,1
2,21.01,3.5,Male,No,Sun,Dinner,3,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1
4,24.59,3.61,Female,No,Sun,Dinner,4,1


In [15]:
# Sanity check: the indicator counts should mirror the `time` counts
df['is_dinner'].value_counts()

1    176
0     68
Name: is_dinner, dtype: int64

In [19]:
# Three-feature model: total_bill, size and the is_dinner indicator.

# Fresh estimator for the three-feature model
linreg = LinearRegression()

y = df['tip']                                         # label
X = df.loc[:, ['total_bill', 'size', 'is_dinner']]    # features

In [20]:
# Fit the three-feature model (total_bill, size, is_dinner) against tip
linreg.fit(X, y)

LinearRegression()

In [21]:
# Predict the tip for total_bill = 30, size = 3, is_dinner = 1. Pass a
# DataFrame whose columns match those used in fit() so features are matched
# by name rather than by position in a bare list.
linreg.predict(pd.DataFrame({'total_bill': [30], 'size': [3], 'is_dinner': [1]}))

array([4.02738159])

In [22]:
# R-squared of the three-feature model, computed on the X and y currently in scope
linreg.score(X, y)

0.4678710451921011

In [23]:
# Fitted y-intercept (beta0) of the three-feature model
linreg.intercept_

0.6711919666814037

In [24]:
# Fitted slopes beta1, beta2, beta3 for total_bill, size, is_dinner
# (same order as the columns of X)
linreg.coef_

array([ 0.09275268,  0.19257911, -0.00412813])

$$ \hat{y} = 0.671 + 0.093 \cdot x_1 + 0.193 \cdot x_2 - 0.004 \cdot x_3 $$