# Time Series Forecasting for Temperature Data

This notebook demonstrates time series forecasting using NASA temperature data. We'll explore the data, visualize patterns, and implement both statistical (SARIMA) and deep learning (LSTM) forecasting models.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Data Loading and Exploration

In [None]:
# Load the dataset
dataframe = pd.read_csv("data/power_nasa.csv")
dataframe.info()

# Display the first few rows to understand the data structure
display(dataframe.head())

# Basic statistics
display(dataframe.describe())

## Data Visualization - Yearly Temperature Patterns

In [None]:
# Group the dataframe by year
grouped_df = dataframe.groupby('YEAR')

plt.figure(figsize=(20, 6))
# Plot the graph for each year
for year, group in grouped_df:
    plt.plot(group['DOY'], group['T2M'], label=str(year))

# Set the labels and title
plt.xlabel('Day of Year')
plt.ylabel('Temperature (T2M)')
plt.title('Temperature vs Day of Year for Every Year')

# Show the plot
plt.show()


## Single Year Temperature Analysis

In [None]:
import matplotlib.pyplot as plt

# Filter the dataframe for the year 2020
year_2020_df = dataframe[dataframe['YEAR'] == 2020]

print(f"Data points for year 2020: {len(year_2020_df)}")
# Plot the temperature values
plt.figure(figsize=(12, 6))
plt.plot(year_2020_df['DOY'], year_2020_df['T2M'])
plt.xlabel('Day of Year')
plt.ylabel('Temperature (T2M)')
plt.title('Temperature vs Day of Year for the Year 2020')
plt.grid(True, alpha=0.3)
plt.show()

## Correlation Analysis

In [None]:
# Assuming the correlation matrix is stored in the variable 'correlation_matrix'
# You can replace it with the actual variable name in your notebook

correlation_matrix = dataframe.corr()

# Display the correlation matrix
print(correlation_matrix)

# Plot the correlation matrix as a heatmap
plt.figure(figsize=(8, 6))
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest')
plt.colorbar()
plt.title('Correlation Matrix')
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=90)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)

# Add correlation values as text annotations
for i in range(len(correlation_matrix.columns)):
    for j in range(len(correlation_matrix.columns)):
        plt.text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}', 
                 ha='center', va='center', 
                 color='white' if abs(correlation_matrix.iloc[i, j]) > 0.5 else 'black')
plt.tight_layout()
plt.show()


## Data Preparation for Time Series Analysis

In [None]:
# Assuming your DataFrame is named df
df = dataframe.copy()

# Convert the date columns to a datetime object (if not already done)
df['DATE'] = pd.to_datetime(df[['YEAR', 'DOY']].astype(str).apply('-'.join, 1), format='%Y-%j')
df.set_index('DATE', inplace=True)

# Drop unnecessary columns for forecasting
df.drop(columns=['YEAR', 'DOY'], inplace=True)

# Check the dataframe
print(df.head())

# Display more info about the prepared dataframe
print(f"\nDataframe shape: {df.shape}")
print("\nDataframe index info:")
print(f"Start date: {df.index.min()}")
print(f"End date: {df.index.max()}")
print(f"Total days: {len(df)}")

## SARIMA Time Series Forecasting

In [None]:
try:
   

    # Split data into train and test
    train_size = int(len(df) * 0.8)
    train_data = df[:train_size]
    test_data = df[train_size:]

    print(f"Training data: {train_data.index.min()} to {train_data.index.max()} ({len(train_data)} days)")
    print(f"Testing data: {test_data.index.min()} to {test_data.index.max()} ({len(test_data)} days)")

    # Define the model
    model = SARIMAX(train_data['T2M'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))

    # Fit the model
    model_fit = model.fit(disp=False)

    # Summary of the model
    print(model_fit.summary())

    # Forecast for the test period
    forecast = model_fit.forecast(steps=len(test_data))
    forecast_index = test_data.index

    # Plot the forecast vs actual
    plt.figure(figsize=(12, 6))
    plt.plot(train_data.index, train_data['T2M'], label='Training Data', color='blue')
    plt.plot(test_data.index, test_data['T2M'], label='Actual Test Data', color='green')
    plt.plot(forecast_index, forecast, label='SARIMA Forecast', color='red', linestyle='--')
    plt.title('SARIMA Model: Temperature Forecast vs Actual')
    plt.xlabel('Date')
    plt.ylabel('Temperature (T2M)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Evaluate the model
    mse_sarima = mean_squared_error(test_data['T2M'], forecast)
    mae_sarima = mean_absolute_error(test_data['T2M'], forecast)
    rmse_sarima = np.sqrt(mse_sarima)
    print(f'Mean Squared Error: {mse_sarima:.4f}')
    print(f'Mean Absolute Error: {mae_sarima:.4f}')
    print(f'Root Mean Squared Error: {rmse_sarima:.4f}')
    
except ModuleNotFoundError as e:

    print(f"Error: {e}")
    print("\nPlease run the cell above to install the required packages, then restart the kernel and run this cell again.")

In [None]:
try:
   

    # Scale the data
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(df[["T2M"]])

    # Prepare the data for LSTM
    def create_dataset(data, time_step=1):
        X, Y = [], []
        for i in range(len(data) - time_step):
            X.append(data[i:i + time_step])
            Y.append(data[i + time_step])
        return np.array(X), np.array(Y)

    # Define the time step (window size)
    time_step = 30

    # Create the dataset
    X, Y = create_dataset(scaled_data, time_step)

    # Define the train-test split point
    train_size = int(len(X) * 0.8)
    X_train, X_test = X[:train_size], X[train_size:]
    Y_train, Y_test = Y[:train_size], Y[train_size:]

    print(f"X_train shape: {X_train.shape}, Y_train shape: {Y_train.shape}")
    print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}")

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, input_shape=(time_step, 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Early stopping to prevent overfitting
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    )

    # Train the model with early stopping and validation split
    history = model.fit(
        X_train, Y_train,
        batch_size=32,
        epochs=50,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=1
    )

    # Plot the training history
    plt.figure(figsize=(12, 6))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('LSTM Model Training History')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Get predictions
    train_predict = model.predict(X_train)
    test_predict = model.predict(X_test)

    # Inverse transform to get actual temperature values
    train_predict = scaler.inverse_transform(train_predict)
    test_predict = scaler.inverse_transform(test_predict)
    Y_train_actual = scaler.inverse_transform(Y_train)
    Y_test_actual = scaler.inverse_transform(Y_test)

    # Create timestamps for plotting
    train_timestamps = df.index[time_step:train_size+time_step]
    test_timestamps = df.index[train_size+time_step:train_size+time_step+len(test_predict)]

    # Plot the predictions
    plt.figure(figsize=(14, 7))
    plt.plot(df.index, df['T2M'], 'b-', label='Actual Temperature', alpha=0.5)
    plt.plot(train_timestamps, train_predict, 'r--', label='Training Predictions')
    plt.plot(test_timestamps, test_predict, 'g--', label='Testing Predictions')
    plt.title('LSTM Model: Temperature Predictions vs Actual')
    plt.xlabel('Date')
    plt.ylabel('Temperature (T2M)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Evaluate the model
    mse_lstm = mean_squared_error(Y_test_actual, test_predict)
    mae_lstm = mean_absolute_error(Y_test_actual, test_predict)
    rmse_lstm = np.sqrt(mse_lstm)
    print(f'Mean Squared Error: {mse_lstm:.4f}')
    print(f'Mean Absolute Error: {mae_lstm:.4f}')
    print(f'Root Mean Squared Error: {rmse_lstm:.4f}')
    
except Exception as e:
    print(f"Error: {e}")
    print("\nThis could be due to missing TensorFlow. If that's the case, please run:")
    print("!pip install tensorflow")

In [None]:
try:
    # Check if the variables exist
    if 'mse_sarima' not in locals() or 'mse_lstm' not in locals():
        # If the model cells failed to run, use placeholder values
        print("Warning: Model metrics not found. Using placeholder values for comparison.")
        # Use placeholder values
        mse_sarima = mae_sarima = rmse_sarima = float('nan')
        mse_lstm = mae_lstm = rmse_lstm = float('nan')
    
    # Create a comparison dataframe
    models = ['SARIMA', 'LSTM']
    metrics = {
        'MSE': [mse_sarima, mse_lstm],
        'MAE': [mae_sarima, mae_lstm],
        'RMSE': [rmse_sarima, rmse_lstm]
    }

    comparison_df = pd.DataFrame(metrics, index=models)
    display(comparison_df)

    # Check if any values are NaN before plotting
    if not comparison_df.isna().any().any():
        # Visualize the comparison
        plt.figure(figsize=(10, 6))
        comparison_df.plot(kind='bar', figsize=(10, 6))
        plt.title('Model Performance Comparison')
        plt.ylabel('Error Value')
        plt.grid(axis='y', alpha=0.3)
        plt.xticks(rotation=0)
        plt.tight_layout()
        plt.show()

        # Conclusion
        print("Conclusion:")
        best_model = "SARIMA" if mse_sarima < mse_lstm else "LSTM"
        print(f"The {best_model} model performed better for this temperature forecasting task.")
        print("This could be due to the nature of temperature data, which typically has strong seasonal patterns.")
    else:
        print("Cannot generate comparison plot due to missing metrics. Please run the model cells successfully first.")
except Exception as e:
    print(f"Error in comparison: {e}")