In [4]:
import pandas as pd

In [5]:
# Load the source CSV into a DataFrame; later cells filter it by the
# 'StockCode' column.
raw_data = pd.read_csv('datasets/top20_PED.csv')

In [23]:
# Keep only the rows belonging to stock code 85099B, then show (rows, columns).
data = raw_data.loc[raw_data['StockCode'].eq('85099B')]
data.shape

(204, 3)

In [24]:
# Build supervised samples with a sliding window: each sample pairs 5
# consecutive PED values (inputs) with the PED value that immediately
# follows them (output).
ped_series = data['PED']
input_output_data = [
    (ped_series.iloc[start:start + 5].values, ped_series.iloc[start + 5])
    for start in range(len(data) - 5)
]

# Tabulate the samples so the first few can be inspected
input_output_df = pd.DataFrame(input_output_data, columns=['Inputs', 'Output'])
input_output_df.head()

Unnamed: 0,Inputs,Output
0,"[-501.4285714285717, -6.763636363636357, -83.9...",-4.29
1,"[-6.763636363636357, -83.9298245614038, -99.53...",-6.438436
2,"[-83.9298245614038, -99.5312499999999, -843.37...",-17.962209
3,"[-99.5312499999999, -843.3795146871022, -4.289...",-5.489
4,"[-843.3795146871022, -4.289999999999999, -6.43...",-8.616279


In [8]:
import mlflow
# Point the MLflow client at the tracking server on localhost, port 5000.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [9]:
# Select the "PED" experiment as the active one for subsequent runs.
mlflow.set_experiment("PED")

<Experiment: artifact_location='mlflow-artifacts:/593083829582539802', creation_time=1743644871081, experiment_id='593083829582539802', last_update_time=1743644871081, lifecycle_stage='active', name='PED', tags={}>

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import numpy as np

# Prepare the data for LSTM: stack the sliding windows into a 2-D feature
# matrix (one row per window) and the following values into a target vector.
X = np.array([np.array(inputs) for inputs, _ in input_output_data])
y = np.array([output for _, output in input_output_data])

# Split the data into train (70%), validation (15%), and test (15%) sets.
# NOTE(review): the windows overlap and this split shuffles them, so a value
# used as a test target can also appear inside training inputs. Consider a
# chronological split (shuffle=False) if the rows are time-ordered — confirm.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalize the data. Features and targets each get their own scaler, both fit
# on the training split only. Reusing one scaler object for both would discard
# the feature scaling parameters as soon as it is refit on the targets.
from sklearn.preprocessing import MinMaxScaler
x_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(0, 1))
X_train = x_scaler.fit_transform(X_train)
X_val = x_scaler.transform(X_val)
X_test = x_scaler.transform(X_test)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))
y_val = y_scaler.transform(y_val.reshape(-1, 1))
y_test = y_scaler.transform(y_test.reshape(-1, 1))

# Backward-compatible alias: later cells call scaler.inverse_transform(...) on
# targets and predictions, so `scaler` must refer to the target scaler.
scaler = y_scaler

# Reshape the input data to be 3D (samples, timesteps, features) for LSTM
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))


In [26]:
# Hyperparameters for the LSTM cells below; also logged to MLflow as run params.
params = dict(
    units=50,
    dropout_rate=0.2,
    batch_size=32,
    epochs=50,
    output_dim=1,
)

In [27]:
# Build the LSTM model.
# The input shape is declared with an explicit Input layer: passing
# `input_shape=` to the first layer of a Sequential model is deprecated and
# emits a UserWarning at construction time.
from tensorflow.keras.layers import Input

model = Sequential([
    Input(shape=(X_train.shape[1], 1)),  # (timesteps, features per timestep)
    LSTM(params['units'], activation='relu'),
    Dropout(params['dropout_rate']),
    Dense(params['output_dim']),
])

# Compile the model with mean squared error as the regression loss
model.compile(optimizer='adam', loss='mse')

# Train the model, reporting the loss on the validation split after each epoch
history = model.fit(
    X_train,
    y_train,
    epochs=params["epochs"],
    batch_size=params["batch_size"],
    validation_data=(X_val, y_val),
)

Epoch 1/50


  super().__init__(**kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 125ms/step - loss: 0.4682 - val_loss: 0.3711
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.3945 - val_loss: 0.3111
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 0.3103 - val_loss: 0.2518
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 0.2583 - val_loss: 0.1946
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 0.1995 - val_loss: 0.1411
Epoch 6/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.1294 - val_loss: 0.0965
Epoch 7/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0761 - val_loss: 0.0695
Epoch 8/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0457 - val_loss: 0.0699
Epoch 9/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [28]:
from sklearn.metrics import r2_score

# Score the trained model on the held-out test split
y_pred = model.predict(X_test)

# Coefficient of determination between the test targets and the predictions
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R-squared: {r2}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
R-squared: -0.07584696112407241


In [None]:
# Denormalize y_test: undo the min-max scaling so the test targets are back
# on their original scale, then display them.
y_test_denormalized = scaler.inverse_transform(y_test)
y_test_denormalized

In [None]:
# Undo the min-max scaling on the model's predictions so they are on the
# original scale, then display them.
y_pred_denormalized = scaler.inverse_transform(y_pred)
y_pred_denormalized 

In [1]:
# Record this training run in MLflow: hyperparameters, test R-squared, and the
# fitted Keras model. This cell needs `mlflow`, `params`, `r2`, and `model` to
# exist in the current kernel, so run the setup and training cells first.
with mlflow.start_run():
    # Log parameters. `log_params` accepts the whole dict; `log_param` expects
    # a single (key, value) pair and fails when handed a dict.
    mlflow.log_params(params)

    # Log metrics
    # mlflow.log_metric("test_loss", history.history['val_loss'][-1])
    mlflow.log_metric("r2_score", r2)

    # Log the model under an artifact path named after the stock code
    mlflow.keras.log_model(model, "85099B")

NameError: name 'mlflow' is not defined

In [None]:
# Not enough training data

In [8]:
# Display the shapes of the train/test features and targets as a sanity check.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((191, 5, 1), (191,), (48, 5, 1), (48,))

In [5]:
import numpy as np


In [10]:
# Select the price/quantity/date columns for stock code 85123A. A single .loc
# selection plus an explicit .copy() makes `data` an independent frame, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
is_target_stock = raw_data['StockCode'] == '85123A'
data = raw_data.loc[is_target_stock, ['UnitPrice', 'Quantity', 'InvoiceDate']].copy()

# Log-transform the absolute values. Zeros are replaced by NaN first because
# log(0) is undefined; those rows remain NaN in the new columns.
data['LogUnitPrice'] = np.log(data['UnitPrice'].abs().replace(0, np.nan))
data['LogQuantity'] = np.log(data['Quantity'].abs().replace(0, np.nan))
data.head()

Unnamed: 0,UnitPrice,Quantity,InvoiceDate,LogUnitPrice,LogQuantity
3982,2.71,441,2010-12-01,0.996949,6.089045
3983,2.676316,309,2010-12-02,0.984441,5.733341
3984,2.95,20,2010-12-03,1.081805,2.995732
3985,2.888462,198,2010-12-05,1.060724,5.288267
3986,2.877273,136,2010-12-06,1.056843,4.912655


In [None]:
# Two-column array of (LogUnitPrice, LogQuantity) pairs; rows where either
# value is missing are discarded. Preview the first ten rows.
log_columns = ['LogUnitPrice', 'LogQuantity']
input_output_array = data.loc[:, log_columns].dropna().to_numpy()
input_output_array[:10]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Column 0 (LogUnitPrice) is the single feature, kept 2-D as the estimator
# expects; column 1 (LogQuantity) is the target.
X = input_output_array[:, :1]
y = input_output_array[:, 1]

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares line on the training rows
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")