In [4]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

In [5]:
# Global Parameters (edit these to change what is forecast)

# Length, in days, of the training window that ends the day before
# `prediction_date`.
# NOTE(review): the name is a misspelling of "time_series_num"; it is left
# unchanged because predict_time_series_arima reads this exact global.
time_seires_num = 30
# Value matched against the "country" column of the CSV when the data is loaded.
country_you_want_to_predict = "china"
# Name of the CSV column to forecast; it is renamed to "cases" when loaded.
case_you_want_to_predict = "confirmed"
# Target date in "YYYY-MM-DD" form (parsed with "%Y-%m-%d" by the forecasting function).
prediction_date = "2022-09-01" # The date should not exceed 2023-03-10

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session if there is one, otherwise start a new one.
# SparkSession.builder.getOrCreate() covers both cases, so no try/except
# fallback is needed (the previous bare `except:` would also have swallowed
# KeyboardInterrupt/SystemExit).
spark = SparkSession.builder.appName("ARIMA").getOrCreate()
sc = spark.sparkContext  # kept so any cell that refers to `sc` still works

# Load the CSV, keep only the rows for the selected country, and expose the
# chosen case column under the fixed name "cases" that the forecasting
# function expects. Built as one chain so re-running the cell is idempotent.
df = (
    spark.read.csv('covid-19.csv', header=True, inferSchema=True)
    .filter(F.col("country") == country_you_want_to_predict)
    .select("date", case_you_want_to_predict)
    .withColumnRenamed(case_you_want_to_predict, "cases")
)

In [7]:
import numpy as np

In [8]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from datetime import datetime, timedelta

def predict_time_series_arima(prediction_date, order=(1, 1, 1)):
    """Forecast one day of cases with ARIMA and report the percentage error.

    The model is fitted on the rows of the module-level ``df`` (columns
    "date" and "cases") whose date falls in the ``time_seires_num`` days
    immediately before ``prediction_date``. A single one-step-ahead forecast
    is then compared with the value recorded for ``prediction_date``.

    Args:
        prediction_date: Target date as a "YYYY-MM-DD" string.
        order: ARIMA ``(p, d, q)`` order. Defaults to ``(1, 1, 1)``, the
            value that was previously hard-coded; tune it for your data.

    Returns:
        The absolute percentage error between the (integer-truncated)
        forecast and the true value.

    Raises:
        IndexError: If ``df`` has no row for ``prediction_date``.
        ValueError: If ``prediction_date`` is not in "YYYY-MM-DD" format or
            the training window contains no rows.
        ZeroDivisionError: If the true value for ``prediction_date`` is 0.
    """
    # Training window: [prediction_date - time_seires_num days, prediction_date - 1 day]
    prediction_date_dt = datetime.strptime(prediction_date, "%Y-%m-%d")
    first_date_in_training_set = prediction_date_dt - timedelta(days=time_seires_num)
    last_date_in_training_set = prediction_date_dt - timedelta(days=1)

    # Ground truth for the target date; fail with a readable message instead
    # of an anonymous "list index out of range" when the date is missing.
    true_rows = df.filter(col("date") == prediction_date).select("cases").collect()
    if not true_rows:
        raise IndexError(f"No row found in the data for date {prediction_date}")
    true_value = true_rows[0]["cases"]

    # Collect the training series. The explicit orderBy matters: Spark gives
    # no ordering guarantee without it, and ARIMA needs chronological input.
    training_rows = (
        df.filter(
            (col("date") >= first_date_in_training_set)
            & (col("date") <= last_date_in_training_set)
        )
        .orderBy("date")
        .select("cases")
        .collect()
    )
    if not training_rows:
        raise ValueError(
            f"No training data found in the {time_seires_num} days before {prediction_date}"
        )

    # A single numeric column does not need a VectorAssembler round-trip;
    # float() keeps the float dtype the assembler produced and still fails
    # loudly on null values rather than silently turning them into NaN.
    np_data = np.array([float(row["cases"]) for row in training_rows])

    # Fit the ARIMA model and take the one-step-ahead forecast
    model = ARIMA(np_data, order=order)
    fit_model = model.fit()

    prediction_value = int(fit_model.forecast(1)[0])
    error = abs(prediction_value - true_value)/true_value*100
    print(f"Predicted value for {prediction_date}: {prediction_value}; True value for {prediction_date}: {true_value}; The error is: {error}%")
    return error

In [10]:
# Forecast the configured date; the function prints a summary line and the
# cell displays the returned percentage error.
predict_time_series_arima(prediction_date)

Predicted value for 2022-09-01: 2505900; True value for 2022-09-01: 2510703; The error is: 0.19130100214959714%


  warn('Non-stationary starting autoregressive parameters'


0.19130100214959714