In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Load the dataset
df = pd.read_csv("worldometer_coronavirus_daily_data.csv")  # Update with your dataset path

# Inspect the first few rows and the columns
print(df.head())
print(df.columns)

# Rename columns for Prophet model
df.rename(columns={"date": "ds", "cumulative_total_cases": "y"}, inplace=True)

# Handle missing values
df.fillna(method="ffill", inplace=True)

# Create new features
# Ensure 'y' exists after renaming
if "y" in df.columns:
    df["moving_avg_7"] = df["y"].rolling(window=7).mean()  # 7-day moving average
    df["moving_avg_14"] = df["y"].rolling(window=14).mean()  # 14-day moving average
    df["lag_1"] = df["y"].shift(1)  # 1-day lag
    df["lag_7"] = df["y"].shift(7)  # 7-day lag

    # Drop rows with NaN values that may result from rolling operations
    df.dropna(inplace=True)

    # Scale the features
    features = df[["y", "moving_avg_7", "moving_avg_14", "lag_1", "lag_7"]].values
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features)

    # Prepare the dataset for the CNN model
    def create_dataset(data, time_step=1):
        X, y = [], []
        for i in range(len(data) - time_step - 1):
            a = data[i : (i + time_step)]
            X.append(a)
            y.append(data[i + time_step, 0])  # Target is the first column (y)
        return np.array(X), np.array(y)

    time_step = 7  # Define the time step for the CNN input
    X, y = create_dataset(features_scaled, time_step)
    X = X.reshape(X.shape[0], X.shape[1], X.shape[2])  # Reshape for CNN input

    print("Shape of X:", X.shape)
    print("Shape of y:", y.shape)

else:
    print("Column 'y' not found. Check the renaming step.")


        date      country  cumulative_total_cases  daily_new_cases  \
0  2020-2-15  Afghanistan                     0.0              NaN   
1  2020-2-16  Afghanistan                     0.0              NaN   
2  2020-2-17  Afghanistan                     0.0              NaN   
3  2020-2-18  Afghanistan                     0.0              NaN   
4  2020-2-19  Afghanistan                     0.0              NaN   

   active_cases  cumulative_total_deaths  daily_new_deaths  
0           0.0                      0.0               NaN  
1           0.0                      0.0               NaN  
2           0.0                      0.0               NaN  
3           0.0                      0.0               NaN  
4           0.0                      0.0               NaN  
Index(['date', 'country', 'cumulative_total_cases', 'daily_new_cases',
       'active_cases', 'cumulative_total_deaths', 'daily_new_deaths'],
      dtype='object')
Shape of X: (184766, 7, 5)
Shape of y: (184766,)


In [3]:
import prophet

print(prophet.__version__)


1.1.6


In [4]:
from prophet import Prophet

# Create and fit the Prophet model
prophet_model = Prophet()
prophet_model.fit(df[["ds", "y"]])

# Generate future dates for prediction
future = prophet_model.make_future_dataframe(periods=30)  # Predict for 30 days
forecast = prophet_model.predict(future)


19:13:29 - cmdstanpy - INFO - Chain [1] start processing
19:13:45 - cmdstanpy - INFO - Chain [1] done processing


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras_tuner import Hyperband


# Define the CNN model with hyperparameter tuning
def build_model(hp):
    model = Sequential()
    model.add(
        Conv1D(
            filters=hp.Int("filters", 32, 128, step=16),
            kernel_size=hp.Choice("kernel_size", [2, 3, 5]),
            activation="relu",
            input_shape=(X.shape[1], X.shape[2]),
        )
    )
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(units=hp.Int("units", 32, 128, step=16), activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(1))  # Output layer
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model


# Tune hyperparameters using Keras Tuner
tuner = Hyperband(
    build_model,
    objective="val_loss",
    max_epochs=50,
    factor=3,
    directory="my_dir",
    project_name="cnn_tuning",
)
tuner.search(X, y, epochs=50, validation_split=0.2)
best_model = tuner.get_best_models(num_models=1)[0]

# Train the best model
best_model.fit(X, y, epochs=50, validation_split=0.2)


Trial 90 Complete [00h 08m 18s]
val_loss: 6.641453364863992e-05

Best val_loss So Far: 3.391817517695017e-05
Total elapsed time: 02h 52m 12s
Epoch 1/50
[1m4620/4620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 1.9545e-05 - val_loss: 6.1509e-05
Epoch 2/50
[1m4620/4620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 1.9611e-05 - val_loss: 4.1941e-05
Epoch 3/50
[1m4620/4620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 2.1339e-05 - val_loss: 8.7546e-05
Epoch 4/50
[1m4620/4620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 2.7735e-05 - val_loss: 5.6807e-05
Epoch 5/50
[1m4620/4620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 2.1417e-05 - val_loss: 4.8077e-05
Epoch 6/50
[1m4620/4620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 2.4161e-05 - val_loss: 6.0701e-05
Epoch 7/50
[1m4620/4620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 

<keras.src.callbacks.history.History at 0x233854eb610>

In [11]:
best_model.add(Dense(1))

In [12]:
# Assuming your model outputs a single value
cnn_predictions = best_model.predict(last_days)
cnn_predictions = cnn_predictions.reshape(-1, 1)  # Reshape if output is 1D
cnn_predictions = scaler.inverse_transform(
    cnn_predictions
)  # Inverse transform to original scale


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


ValueError: non-broadcastable output operand with shape (1,1) doesn't match the broadcast shape (1,5)

In [14]:
# Predict using the best CNN model
last_days = features_scaled[-time_step:].reshape(
    (1, time_step, features_scaled.shape[1])
)

# Get predictions from the CNN model
cnn_predictions = best_model.predict(last_days)

# Assuming the model outputs 1 prediction, create an array of 5 elements
cnn_predictions = np.zeros(
    (1, features_scaled.shape[1])
)  # Create a placeholder for the right shape
cnn_predictions[0, 0] = best_model.predict(last_days)  # Fill in the prediction

# Inverse transform to original scale
cnn_predictions = scaler.inverse_transform(cnn_predictions)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


In [17]:
# Predict using the best CNN model
last_days = features_scaled[-time_step:].reshape(
    (1, time_step, features_scaled.shape[1])
)

cnn_predictions = best_model.predict(last_days)

# Check the shape of cnn_predictions
print(
    "Shape of cnn_predictions:", cnn_predictions.shape
)  # Add this line to check the output shape

# If it is a single prediction, you may not need to reshape it.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Shape of cnn_predictions: (1, 1)


In [19]:
# Predict using the best CNN model
last_days = features_scaled[-time_step:].reshape(
    (1, time_step, features_scaled.shape[1])
)

cnn_predictions = best_model.predict(last_days)

# Check the shape of cnn_predictions
print("Shape of cnn_predictions:", cnn_predictions.shape)

# Since the output shape is (1, 1), extract the predicted value
cnn_prediction_value = cnn_predictions[0, 0]  # Get the single prediction value

# Expand to match the shape of forecast["yhat"].values[-30:]
cnn_prediction_expanded = np.full((30,), cnn_prediction_value)

# Combine predictions and evaluate
combined_predictions = forecast["yhat"].values[-30:] + cnn_prediction_expanded

# Calculate RMSE and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error

rmse = np.sqrt(
    mean_squared_error(df["y"][-30:], combined_predictions)
)  # Use the last 30 days for evaluation
mae = mean_absolute_error(df["y"][-30:], combined_predictions)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Shape of cnn_predictions: (1, 1)
RMSE: 2202782.8987099985
MAE: 2202285.9743201644


In [20]:
from sklearn.metrics import r2_score

# Calculate R²
r2 = r2_score(df["y"][-30:], combined_predictions)

print(f"R²: {r2}")


R²: -15028624.219175577
