### Linjär regression


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import r2_score
from pathlib import Path

In [3]:

DATA_PATH = Path.cwd().parents[1] / "data"   #upp två nivåer → repo-rot → /data
print(DATA_PATH.resolve(), DATA_PATH.exists())

df = pd.read_csv(DATA_PATH / "advertising.csv", index_col = 0) #index_col = kolumn 0(1...) är index. Parsar datum för
df

C:\Users\susan\KodPython\ai_engineering_susanne_wenblad\data True


Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
...,...,...,...,...
196,38.2,3.7,13.8,7.6
197,94.2,4.9,8.1,9.7
198,177.0,9.3,6.4,12.8
199,283.6,42.0,66.2,25.5


In [4]:
df.shape

(200, 4)

In [5]:

print(f"{df.shape[0]} samples")
print(f"{df.shape[1] - 1} features")
print("sales column is our label/target")

200 samples
3 features
sales column is our label/target


##### Dela upp datan i två delar - X och y
X == värdet   
y == kolumner   
   
df == vilken dataframe vi använder

In [6]:
X, y = df.drop("sales", axis = 1), df["sales"]



X = feature matrix / feature DataFrame

In [8]:
X.head()

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


y = Pandas series

In [9]:
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: sales, dtype: float64

## Scikit learn-steps

### Step 1 --> Train | Test - split

In [10]:
from sklearn.model_selection import train_test_split

# Vi delar upp så att modellen tränas på en del (train)
# och utvärderas på helt osedda rader (test). Det mäter generalisering.
X_train, X_test, y_train, y_test = train_test_split(
    X,              # indata (features)
    y,              # målvariabel
    test_size=0.33, # ~33% av raderna går till test (resten till train)
    random_state=42 # låser "slumpen" -> samma split varje gång = reproducerbart
    # shuffle=True  # (standard) blandar rader innan delning
)

# I en interaktiv miljö (Jupyter/VS Code-prompt) räcker det att skriva variabelnamnet för att visa:
X_train

# I en .py-fil behöver du skriva:
# print(X_train)

Unnamed: 0,TV,radio,newspaper
43,293.6,27.7,1.8
190,18.7,12.1,23.4
91,134.3,4.9,9.3
137,25.6,39.0,9.3
52,100.4,9.6,3.6
...,...,...,...
107,25.0,11.0,29.7
15,204.1,32.9,46.0
93,217.7,33.5,59.0
180,165.6,10.0,17.6


In [11]:
X_test.shape

(66, 3)

In [12]:
y_train.shape, y_test.shape

((134,), (66,))

### Step 2 --> Feature scaling

In [13]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)



0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [14]:
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

scaled_X_train.shape, scaled_X_test.shape

((134, 3), (66, 3))

In [15]:
scaled_X_train.min(), scaled_X_train.max()

(np.float64(0.0), np.float64(1.0))

In [16]:
scaled_X_test.min(), scaled_X_test.max()

(np.float64(0.005964214711729622), np.float64(1.1302186878727631))

### Step 3 --> Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression


model = LinearRegression()
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [18]:
model.fit(scaled_X_train, y_train)
model.coef_

array([13.20747617,  9.75285112,  0.61108329])

In [19]:
model.intercept_

np.float64(2.7911595196243653)

### Step 4 --> Predicition

#### Prediction on train data

In [20]:
test_sample_features = scaled_X_test[0].reshape(1,-1)
test_sample_target = y_test.values[0]

test_sample_features, test_sample_target

(array([[0.54988164, 0.63709677, 0.52286282]]), np.float64(16.9))

In [21]:
model.predict(test_sample_features)

array([16.58673085])

In [22]:
test_sample_target

np.float64(16.9)

#### Prediction on testdata

In [23]:
y_pred = model.predict(scaled_X_test)
y_pred.shape

(66,)

In [24]:
y_test.shape

(66,)

In [25]:
y_pred[:5]

array([16.58673085, 21.18622524, 21.66752973, 10.81086512, 22.25210881])

In [26]:
y_test[:5].values

array([16.9, 22.4, 21.4,  7.3, 24.7])

### Step 5 --> Evaluation -- MAE, MSE, RMSE

In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_pred, y_pred)
rmse = np.sqrt(mse)

print(f"{mae = :.2f}")
print(f"{mse = :.2f}")
print(f"{rmse = :.2f}")

mae = 0.00
mse = 3.73
rmse = 1.93
