In [14]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

In [2]:
# Global Parameters
time_seires_num = 30
country_you_want_to_predict = "china"
case_you_want_to_predict = "confirmed"
prediction_date = "2022-09-01" # The date should not exceed 2023-03-10

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Check if a SparkContext exists, if not, create one
try:
    sc = SparkContext.getOrCreate()
except:
    sc = SparkContext("local", "ARIMA")

# Create a Spark session
spark = SparkSession(sc)
df = spark.read.csv('covid-19.csv', header=True, inferSchema=True)
df = df.filter(df['country']==country_you_want_to_predict).select("date", case_you_want_to_predict)
df = df.withColumnRenamed(case_you_want_to_predict, "cases")

In [4]:
df.show(5)

+----------+-----+
|      date|cases|
+----------+-----+
|2020-01-22|  548|
|2020-01-23|  643|
|2020-01-24|  920|
|2020-01-25| 1406|
|2020-01-26| 2075|
+----------+-----+
only showing top 5 rows



In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from datetime import datetime, timedelta

def predict_time_series_arima(prediction_date):
    data = df
    # Calculate the difference
    prediction_date_dt = datetime.strptime(prediction_date, "%Y-%m-%d")
    first_date_in_training_set = prediction_date_dt - timedelta(days=time_seires_num)
    last_date_in_training_set = prediction_date_dt - timedelta(days=1)

    # Get the true data
    true_value = data.filter(col("date") == prediction_date).select("cases").collect()[0]["cases"]
    
    # Filter the date in data
    data = data.filter((col("date")>=first_date_in_training_set) & (col("date")<=last_date_in_training_set))
    
    # Assemble features
    assembler = VectorAssembler(inputCols=["cases"], outputCol="features")
    assembled_data = assembler.transform(data)

    # Extract features as a NumPy array
    np_data = np.array(assembled_data.select("features").rdd.map(lambda x: x[0].toArray()[0]).collect())

    # Fit an ARIMA model
    order = (1, 1, 1)  # Example order, you may need to tune this based on your data
    model = ARIMA(np_data, order=order)
    fit_model = model.fit()

    prediction_value = int(fit_model.forecast(1)[0])
    error = abs(prediction_value - true_value)/true_value*100
    print(f"Predicted value for {prediction_date}: {prediction_value}; True value for {prediction_date}: {true_value}; The error is: {error}%")
    return error

In [6]:
predict_time_series_arima(prediction_date)

Predicted value for 2022-09-01: 2505900; True value for 2022-09-01: 2510703; The error is: 0.19130100214959714%


  warn('Non-stationary starting autoregressive parameters'


0.19130100214959714

In [7]:
# Specify the list of dates for prediction
date_list = ["2022-09-01", "2022-10-01", "2022-11-01"]

In [8]:
for date in date_list:
    err = predict_time_series_arima(date)
    print(err)



Predicted value for 2022-09-01: 2505900; True value for 2022-09-01: 2510703; The error is: 0.19130100214959714%
0.19130100214959714
Predicted value for 2022-10-01: 2762240; True value for 2022-10-01: 2762150; The error is: 0.0032583313723005634%
0.0032583313723005634
Predicted value for 2022-11-01: 2958628; True value for 2022-11-01: 2959481; The error is: 0.02882262126366076%
0.02882262126366076




In [15]:
# def predict_time_series_arima_parallel(date, df, time_series_num):
#     return predict_time_series_arima(df, date, time_series_num)

# # Assuming you have already defined df, time_series_num, and date_list
# # Replace 'local[*]' with your Spark master URL if you are running on a cluster
# spark = SparkSession.builder.master('local[*]').appName('TimeSeriesPrediction').getOrCreate()

# # Use parallelize to create an RDD from the list of dates
# dates_rdd = spark.sparkContext.parallelize(date_list)

# # Use map to apply the predict_time_series_arima_parallel function to each date in parallel
# errors_rdd = dates_rdd.map(lambda date: predict_time_series_arima_parallel(date, df, time_series_num))

# # Collect the errors
# collected_errors = errors_rdd.collect()

# # Display the errors
# for i, err in enumerate(collected_errors):
#     print(f"Error for {date_list[i]}: {err}")

# def predict_time_series_arima_parallel(date, df, time_series_num):
#     return predict_time_series_arima(df, date, time_series_num)

# # Assuming you have already defined df, time_series_num, and date_list
# # Replace 'local' with your Spark master URL if you want to run on a Spark cluster
# spark = SparkSession.builder.master('local').appName('TimeSeriesPrediction').getOrCreate()

# # Use parallelize to create an RDD from the list of dates
# dates_rdd = spark.sparkContext.parallelize(date_list)

# # Use map to apply the predict_time_series_arima_parallel function to each date in parallel
# errors_rdd = dates_rdd.map(lambda date: predict_time_series_arima_parallel(date, df, time_series_num))

# # Collect the errors
# collected_errors = errors_rdd.collect()

# # Display the errors
# for i, err in enumerate(collected_errors):
#     print(f"Error for {date_list[i]}: {err}")

# Register the UDF
predict_udf = udf(lambda date: predict_time_series_arima(date), StringType())

# Assuming you have already defined df, time_series_num, and date_list
# Replace 'local' with your Spark master URL if you want to run on a Spark cluster
spark = SparkSession.builder.master('local').appName('TimeSeriesPrediction').getOrCreate()

# Convert date_list to a DataFrame with a single column 'date'
date_df = spark.createDataFrame(date_list, StringType()).withColumnRenamed("value", "date")

# Apply the UDF to the DataFrame
result_df = date_df.withColumn("prediction_error", predict_udf("date"))

# Show the result DataFrame
result_df.show()

# Stop the SparkSession
spark.stop()

Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/serializers.py", line 459, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/usr/local/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 632, in dump
    return Pickler.dump(self, obj)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/pyspark/context.py", line 466, in __getnewargs__
    raise PySparkRuntimeError(
pyspark.errors.exceptions.base.PySparkRuntimeError: [CONTEXT_ONLY_VALID_ON_DRIVER] It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.


PicklingError: Could not serialize object: PySparkRuntimeError: [CONTEXT_ONLY_VALID_ON_DRIVER] It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.