# Data Acquisition 
Training and test data are acquired from [ECMWF's ERA5 dataset](https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels?tab=download). Because each year's data is downloaded as a separate file, I did not include the data files in the repo. 

In [1]:
import numpy as np 
import xarray as xr 
import cfgrib 
import pandas as pd 
import torch 
import torch.nn.functional as F
from utilities3 import *
import time 
from matplotlib import pyplot as plt

**Training Data**  
These are the properties of each year's training data 
* 1216 time steps, 69 latitude, 233 longitude, 15 variables 
* 31->48 latitude, 0.25 step size 
* -124->-66 longitude


**10 YEARS OF DATA**  
From 2012-2021 with 4 attributes 

## Data conversion  
The data is downloaded as `.grib` files. We first convert each file to NetCDF (`.nc`), then to CSV. From the CSV files we can extract the variables (weather attributes) into NumPy arrays for fast processing. 

In [7]:
# Convert each training year from GRIB to NetCDF, then flatten it to CSV.
for i in range(2012, 2022):
    # Open the datasets as context managers so the underlying file handles
    # are released on every iteration instead of accumulating over the loop.
    with xr.open_dataset(f'/home/liuxiang/10yeardata/training/{i}.grib') as grib_ds:
        grib_ds.to_netcdf(f'/home/liuxiang/10yeardata/training/CDF/{i}.nc')
    with xr.open_dataset(f'/home/liuxiang/10yeardata/training/CDF/{i}.nc') as nc_ds:
        # to_dataframe() returns an in-memory pandas DataFrame, so `train`
        # remains usable after the dataset is closed.
        train = nc_ds.to_dataframe()
    train.to_csv(f'/home/liuxiang/10yeardata/training/CSV/{i}.csv')

*Test data*

In [3]:
# Convert each test year from GRIB to NetCDF, then flatten it to CSV.
for i in range(2022, 2024):
    # Open the datasets as context managers so the underlying file handles
    # are released on every iteration instead of accumulating over the loop.
    with xr.open_dataset(f'/home/liuxiang/10yeardata/test/{i}.grib') as grib_ds:
        grib_ds.to_netcdf(f'/home/liuxiang/10yeardata/test/CDF/{i}.nc')
    with xr.open_dataset(f'/home/liuxiang/10yeardata/test/CDF/{i}.nc') as nc_ds:
        # to_dataframe() returns an in-memory pandas DataFrame, so `test`
        # remains usable after the dataset is closed.
        test = nc_ds.to_dataframe()
    test.to_csv(f'/home/liuxiang/10yeardata/test/CSV/{i}.csv')

*Process the attributes into variables* 

In [None]:
# Build train_data (all four attributes) and train_t2m (2 m temperature only)
# by stacking every training year along the time axis (axis 0).

# CSV column names of the stacked attributes, in channel order.  The column
# read below is 't2m'; the previous entry 'train_t2m' matched no column.
att = ['u10', 'v10', 't2m', 'sp']

# Per-year arrays are collected in lists and concatenated once at the end.
# This avoids re-copying the growing array every iteration and removes the
# need for a placeholder whose size depends on the first year being a leap year.
train_data_parts = []
train_t2m_parts = []
train_steps_so_far = 0
for i in range(2012, 2022):
    train = pd.read_csv(f'/home/liuxiang/10yeardata/training/CSV/{i}.csv')
    # Leap years use 1464 time steps, other years 1460.
    steps = 1464 if i % 4 == 0 else 1460
    year_shape = (steps, 69, 233, 1)
    # NOTE(review): np.resize silently repeats or truncates values when the
    # CSV does not hold exactly steps*69*233 rows -- confirm the row counts.
    u10 = np.resize(train['u10'], year_shape)
    v10 = np.resize(train['v10'], year_shape)
    t2m = np.resize(train['t2m'], year_shape)
    sp = np.resize(train['sp'], year_shape)
    train_i = np.concatenate((u10, v10, t2m, sp), axis=-1)
    train_data_parts.append(train_i)
    train_t2m_parts.append(t2m)
    train_steps_so_far += steps
    print(f'train_i size:{train_i.shape}')
    # Shape of the data accumulated so far, including the current year.
    print(f'train_data size:{(train_steps_so_far,) + train_i.shape[1:]}')

# Single concatenation along time; astype is a no-op for float64 input and
# keeps the float64 dtype the np.empty-based version always produced.
train_data = np.concatenate(train_data_parts, axis=0).astype(np.float64, copy=False)
train_t2m = np.concatenate(train_t2m_parts, axis=0).astype(np.float64, copy=False)
# Drop the per-year references so the chunks can be freed.
del train_data_parts, train_t2m_parts

In [22]:
# Build test_data (u10, v10, t2m, sp) and test_t2m (2 m temperature only)
# by stacking every test year along the time axis (axis 0).

# Per-year arrays are collected in lists and concatenated once at the end.
# This avoids re-copying the growing array every iteration and removes the
# need for a placeholder whose size depends on the first year's length.
test_data_parts = []
test_t2m_parts = []
for i in range(2022, 2024):
    test = pd.read_csv(f'/home/liuxiang/10yeardata/test/CSV/{i}.csv')
    # Leap years use 1464 time steps, other years 1460.
    steps = 1464 if i % 4 == 0 else 1460
    year_shape = (steps, 69, 233, 1)
    # NOTE(review): np.resize silently repeats or truncates values when the
    # CSV does not hold exactly steps*69*233 rows -- confirm the row counts.
    u10 = np.resize(test['u10'], year_shape)
    v10 = np.resize(test['v10'], year_shape)
    t2m = np.resize(test['t2m'], year_shape)
    sp = np.resize(test['sp'], year_shape)
    test_i = np.concatenate((u10, v10, t2m, sp), axis=-1)
    test_data_parts.append(test_i)
    test_t2m_parts.append(t2m)

# Single concatenation along time; astype is a no-op for float64 input and
# keeps the float64 dtype the np.empty-based version always produced.
test_data = np.concatenate(test_data_parts, axis=0).astype(np.float64, copy=False)
test_t2m = np.concatenate(test_t2m_parts, axis=0).astype(np.float64, copy=False)
# Drop the per-year references so the chunks can be freed.
del test_data_parts, test_t2m_parts