In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy business dataset, written one observation per row so the literal reads
# like the rendered table: (profit, marketing_spend, employees, region).
df = pd.DataFrame(
    [
        (32, 20, 5, "North"),
        (65, 35, 7, "South"),
        (70, 40, 8, "North"),
        (85, 50, 10, "East"),
        (40, 25, 6, "South"),
        (95, 60, 11, "East"),
        (120, 75, 14, "North"),
        (55, 30, 7, "West"),
        (78, 45, 9, "West"),
        (110, 70, 13, "East"),
    ],
    columns=["profit", "marketing_spend", "employees", "region"],
)
df

Unnamed: 0,profit,marketing_spend,employees,region
0,32,20,5,North
1,65,35,7,South
2,70,40,8,North
3,85,50,10,East
4,40,25,6,South
5,95,60,11,East
6,120,75,14,North
7,55,30,7,West
8,78,45,9,West
9,110,70,13,East


Exercise 1 – pandas → NumPy Interface
Tasks: 
1. Select X = [marketing_spend, employees] (predictors) and y = profit (response) 
2. Convert X and y to NumPy arrays 
3. Print array shapes 
4. Add an intercept column 
5. Estimate regression coefficients using numpy.linalg.lstsq

In [3]:
# Steps 1-2: pull the predictors / response out of the frame and drop down
# to plain float64 ndarrays, which is what numpy.linalg works with.
X = df.loc[:, ["marketing_spend", "employees"]]
y = df.loc[:, "profit"]
X_np = X.to_numpy(dtype=np.float64)
y_np = y.to_numpy(dtype=np.float64)

# Step 4: prepend a column of ones so the first fitted coefficient is the intercept.
X_design = np.column_stack((np.ones(X_np.shape[0]), X_np))

# Step 5: ordinary least squares; rcond=None selects NumPy's current default cutoff.
beta, residuals, rank, s = np.linalg.lstsq(
    X_design,
    y_np,
    rcond=None,
)

# Steps 3 and 5 reporting, in the same order as the exercise list.
print("X shape:", X_np.shape)
print("y shape:", y_np.shape)
print("X_design shape:", X_design.shape)
print("beta (intercept, b_marketing, b_employees):", beta)
print("residuals:", residuals)
print("rank:", rank)

X shape: (10, 2)
y shape: (10,)
X_design shape: (10, 3)
beta (intercept, b_marketing, b_employees): [ 9.525  1.75  -1.475]
residuals: [118.325]
rank: 3


Exercise 2 – Model Design with Patsy 
Tasks: 
1. Create design matrices for: profit ~ marketing_spend + employees 
2. Print shape and column names 
3. Extend model to include region 
4. Print new column names

In [5]:
%pip install patsy

Defaulting to user installation because normal site-packages is not writeable
Collecting patsy
  Downloading patsy-1.0.2-py2.py3-none-any.whl.metadata (3.6 kB)
Downloading patsy-1.0.2-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy
Successfully installed patsy-1.0.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\ADMIN\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


1) Design matrices (model cơ bản)
Dùng patsy.dmatrices để tạo y và X cho mô hình profit ~ marketing_spend + employees.

In [6]:
import pandas as pd
import patsy

# Tasks 1-2: build the response (y1) and design matrix (X1) from the formula.
# return_type="dataframe" gives pandas objects, so X1 carries column labels.
y1, X1 = patsy.dmatrices("profit ~ marketing_spend + employees", df, return_type="dataframe")

print(f"y1 shape: {y1.shape}")
print(f"X1 shape: {X1.shape}")
print(f"X1 columns: {X1.columns.tolist()}")

y1 shape: (10, 1)
X1 shape: (10, 3)
X1 columns: ['Intercept', 'marketing_spend', 'employees']


2) Mở rộng mô hình có region
Thêm biến phân loại region vào công thức: profit ~ marketing_spend + employees + region.

In [7]:
# Tasks 3-4: same formula plus the string-valued `region` column, then
# inspect which columns the design matrix ends up with.
y2, X2 = patsy.dmatrices("profit ~ marketing_spend + employees + region", df, return_type="dataframe")

print(f"X2 shape: {X2.shape}")
print(f"X2 columns: {X2.columns.tolist()}")



X2 shape: (10, 6)
X2 columns: ['Intercept', 'region[T.North]', 'region[T.South]', 'region[T.West]', 'marketing_spend', 'employees']


Exercise 3 – statsmodels OLS Regression 
Tasks: 
1. Fit model: profit ~ marketing_spend + employees 
2. Display summary 
3. Extract coefficients and R-squared 
4. Fit extended model including region 
5. Print coefficients

In [15]:
%pip install statsmodels

Defaulting to user installation because normal site-packages is not writeable
Collecting statsmodels
  Downloading statsmodels-0.14.6-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Downloading statsmodels-0.14.6-cp313-cp313-win_amd64.whl (9.5 MB)
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.5 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.5 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.5 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.5 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.5 MB ? eta -:--:--
   -- ------------------------------------- 0.5/9.5 MB 373.5 kB/s eta 0:00:25
   -- ------------------------------------- 0.5/9.5 MB 37


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\ADMIN\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [16]:
import pandas as pd
import statsmodels.formula.api as smf

# 1) Fit the baseline OLS model from a formula string evaluated against df.
m1 = smf.ols("profit ~ marketing_spend + employees", data=df).fit()

# 2) Full regression table.
print(m1.summary())

# 3) Keep the fitted coefficients and R-squared for later use.
coef1 = m1.params
r2_1 = m1.rsquared

# sep="" because print's default separator would insert a space after the
# newline and push the first coefficient row one column to the right.
print("Coefficients:\n", coef1, sep="")
print("R-squared:", r2_1)

                            OLS Regression Results                            
Dep. Variable:                 profit   R-squared:                       0.984
Model:                            OLS   Adj. R-squared:                  0.979
Method:                 Least Squares   F-statistic:                     214.1
Date:               T3, 30 Thg12 2025   Prob (F-statistic):           5.27e-07
Time:                        11:19:46   Log-Likelihood:                -26.544
No. Observations:                  10   AIC:                             59.09
Df Residuals:                       7   BIC:                             60.00
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           9.5250     10.217     

  return hypotest_fun_in(*args, **kwds)


In [17]:
# 4) Refit with the `region` column added to the formula.
m2 = smf.ols("profit ~ marketing_spend + employees + region", data=df).fit()

# 5) Print the fitted coefficients. sep="" because print's default separator
# would insert a space after the newline and misalign the first row.
print("Extended model coefficients:\n", m2.params, sep="")

Extended model coefficients:
 Intercept          10.004141
region[T.North]     1.089027
region[T.South]     2.648033
region[T.West]      6.163561
marketing_spend     2.233954
employees          -4.180124
dtype: float64


Exercise 4 – scikit-learn Linear Regression 
Tasks: 
1. Train-test split (70/30, random_state=42) 
2. Train LinearRegression model 
3. Predict on test set 
4. Compute MSE and R² (R-squared)

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1) Select X, y
X = df[["marketing_spend", "employees"]]
y = df["profit"]

# 1) Train-test split (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# 2) Train LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

# 3) Predict on test set
y_pred = model.predict(X_test)

# 4) Compute MSE and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("MSE:", mse)
print("R-squared:", r2)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)  # [b_marketing_spend, b_employees]

X_train shape: (7, 2)
X_test shape: (3, 2)
MSE: 26.217860082304572
R-squared: 0.8262439317078489
Intercept: -1.3000000000000114
Coefficients: [1.06       3.05555556]


Exercise 5 – scikit-learn Pipeline 
Tasks: 
1. Build Pipeline with StandardScaler and LinearRegression 
2. Fit on training data 
3. Evaluate with MSE and R² (R-squared)

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Same predictors / target and the same 70/30 split (random_state=42)
# as the plain LinearRegression exercise.
X = df.loc[:, ["marketing_spend", "employees"]]
y = df.loc[:, "profit"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# 1) Scaling followed by the regression, bundled as one estimator.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("lr", LinearRegression())])

# 2) Fitting the pipeline on the training rows fits the scaler on those rows
#    only, so no test-set statistics are used.
pipe.fit(X_train, y_train)

# 3) Score the held-out rows.
y_pred = pipe.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (("MSE:", mse), ("R-squared:", r2)):
    print(label, value)

MSE: 26.21786008230458
R-squared: 0.8262439317078489
