In [0]:
def forecast_temp_data(Hourly_fact, lookback=5):
    """Append one forecast row per city to an hourly weather DataFrame.

    For every ``city_id`` the ``lookback`` most recent rows (ordered by
    ``created_on`` descending) are averaged to produce a forecast ``temp``
    (rounded to 2 decimals). The forecast row copies every other column from
    the single most recent row of that city, except:

    * ``timeID`` is the most recent row's ``timeID`` plus 1,
    * ``is_forecasted_data`` is set to ``'True'`` (cast to the column's
      existing type) so forecast rows can be told apart from observed rows.

    Args:
        Hourly_fact: Spark DataFrame that contains at least the columns
            ``city_id``, ``created_on``, ``temp`` and ``timeID``. If it has no
            ``is_forecasted_data`` column, one is added with the string value
            ``'False'`` for all existing rows.
        lookback: Number of most recent rows per city to average. Defaults
            to 5.

    Returns:
        The input rows plus one forecast row per city, ordered by ``timeID``
        descending, with the same column order as the (flag-augmented) input.

    Raises:
        ValueError: If ``lookback`` is smaller than 1.
    """
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")

    # Add is_forecasted_data column on first use; existing rows are observed data.
    if 'is_forecasted_data' not in Hourly_fact.columns:
        Hourly_fact = Hourly_fact.withColumn('is_forecasted_data', F.lit('False'))

    # Resolve the flag's type through Spark so the forecast literal can be cast
    # to it and the final union does not fail on a string/boolean mismatch.
    flag_type = Hourly_fact.select('is_forecasted_data').schema.fields[0].dataType

    # Helper column name chosen to be unlikely to clash with a real column.
    rank_col = '__recency_rank'

    newest_first = Window.partitionBy('city_id').orderBy(F.desc('created_on'))
    whole_city = Window.partitionBy('city_id')

    # Keep only the `lookback` most recent rows of every city.
    # NOTE: rows sharing the same created_on within a city are ranked in an
    # arbitrary order by row_number().
    recent = (
        Hourly_fact
        .withColumn(rank_col, F.row_number().over(newest_first))
        .where(F.col(rank_col) <= lookback)
    )

    # Average over the retained rows, then keep exactly the newest row per
    # city. Picking rank 1 explicitly (instead of first() after a groupBy)
    # makes the copied column values deterministic.
    # NOTE(review): timeID is incremented without any wrap-around and
    # dateID/dt/created_on are copied unchanged -- confirm whether rollover
    # handling is needed.
    forecast = (
        recent
        .withColumn('temp', F.round(F.avg('temp').over(whole_city).cast('double'), 2))
        .where(F.col(rank_col) == 1)
        .withColumn('timeID', F.col('timeID') + 1)
        .withColumn('is_forecasted_data', F.lit('True').cast(flag_type))
        # union() matches columns by position, so project in the input's order
        # (this also drops the helper rank column).
        .select(*Hourly_fact.columns)
    )

    combined = Hourly_fact.union(forecast)

    return combined.orderBy(F.col('timeID').desc())
