In [1]:
import pandas as pd
file_name = 'data/qa_queries_V28.parquet'
df = pd.read_parquet(file_name)
# Order by latitude, then time, so each latitude's observations are contiguous
# and chronological.
df = df.sort_values(by=['lat', 'time'])


def _extract_sensor_ndvi(frame, qa_col, ndvi_col):
    """Return the (time, lat, ndvi) rows of `frame` selected for one sensor.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'time', 'lat', `qa_col` and `ndvi_col`. It is expected to
        be sorted by ['lat', 'time'] so that "previous row" means the previous
        observation at the same latitude.
    qa_col : str
        Name of the flag column; only rows where it equals 1 are kept.
    ndvi_col : str
        Name of the sensor-specific NDVI column; it is renamed to 'ndvi'.

    Returns
    -------
    pandas.DataFrame
        Columns 'time', 'lat', 'ndvi', with every row removed whose ndvi equals
        the ndvi of the preceding kept row at the same latitude.
    """
    sensor = frame.loc[frame[qa_col] == 1, ['time', 'lat', ndvi_col]]
    sensor = sensor.rename(columns={ndvi_col: 'ndvi'})
    # Shift within each latitude: a plain shift(1) would compare the first row
    # of a latitude with the last row of the previous latitude and could drop
    # a valid first observation.
    previous_ndvi = sensor.groupby('lat')['ndvi'].shift(1)
    return sensor[sensor['ndvi'] != previous_ndvi]


# rows where qa.sentinel2 == 1, reduced to time / lat / ndvi
s2 = _extract_sensor_ndvi(df, 'qa.sentinel2', 'ndvi.sentinel2')

# rows where qa.landsat == 1, reduced to time / lat / ndvi
l8 = _extract_sensor_ndvi(df, 'qa.landsat', 'ndvi.landsat')

# Stack both sensors, restore the (lat, time) ordering and use a clean 0..n-1 index.
m = (
    pd.concat([s2, l8])
    .sort_values(by=['lat', 'time'])
    .reset_index(drop=True)
)

In [2]:
# Distinct latitude values present in the raw frame.
lats = pd.unique(df['lat'])
# Persist them as raw binary; ndarray.tofile stores no dtype or shape header.
lats.tofile('data/lats.bin')
# To read them back: np.fromfile('data/lats.bin', dtype=np.float64)
# (assumes the lat column is float64 -- TODO confirm)

In [5]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter

def smooth(m, window_length=20, polyorder=2):
    """Put one NDVI series on a daily grid, fill the gaps and smooth it.

    The rows of `m` are left-joined onto a daily date range running from the
    earliest to the latest "time" in `m`. Zero NDVI values are treated as
    missing, gaps are filled with the default (linear) pandas interpolation,
    and a Savitzky-Golay filter is applied to the filled series.

    Parameters
    ----------
    m : pandas.DataFrame
        Must contain a "time" column and an "ndvi" column.
    window_length : int, optional
        Number of samples in the Savitzky-Golay window (default 20).
    polyorder : int, optional
        Order of the polynomial fitted inside each window (default 2).

    Returns
    -------
    pandas.DataFrame
        The daily frame with the columns of `m` plus "ndvi_smoothed". "ndvi"
        holds the gap-filled values. If `m` repeats a timestamp, the join
        repeats that day in the result.

    Raises
    ------
    ValueError
        Raised by savgol_filter (default mode 'interp') when the daily series
        has fewer rows than `window_length`.
    """
    # Daily grid from the first to the last observation. The grid starts at
    # the earliest timestamp and advances in whole days, so only observations
    # whose timestamp falls exactly on a grid point survive the join below.
    date_range = pd.date_range(m["time"].min(), m["time"].max(), freq='D')
    m1 = pd.DataFrame({"time": date_range})

    # Left join: keep every grid day and attach the observation for that day, if any.
    m1 = pd.merge(m1, m, on="time", how="left")

    # Treat 0 as a missing reading (assumption inherited from this pipeline --
    # TODO confirm 0 is never a valid NDVI here). This must happen BEFORE the
    # interpolation: a NaN handed to savgol_filter turns every output sample
    # whose window touches it into NaN.
    m1["ndvi"] = m1["ndvi"].replace(0, np.nan).interpolate()

    # Smooth the gap-filled daily series.
    m1["ndvi_smoothed"] = savgol_filter(m1["ndvi"], window_length, polyorder)

    return m1


In [18]:
# Column layout of the training table.
OUTPUT_COLUMNS = ['lat', 'time', 'ndvi', 'ndvi_smoothed']
# Number of distinct latitudes the finished table is expected to contain.
EXPECTED_LAT_COUNT = 22

# Smooth each latitude's series on its own and collect the pieces. They are
# concatenated once after the loop; concatenating inside the loop would copy
# the whole accumulated frame again on every iteration.
smoothed_parts = []
for l in lats:
    # rows for this latitude only, without the (now constant) lat column
    ndvi_raw = m[m['lat'] == l].drop(columns=['lat'])
    ndvi = smooth(ndvi_raw)
    # put the latitude back and make it the first column
    ndvi['lat'] = l
    ndvi = ndvi[OUTPUT_COLUMNS]
    smoothed_parts.append(ndvi)

if smoothed_parts:
    complete = pd.concat(smoothed_parts)
else:
    # pd.concat refuses an empty list; keep the empty-but-typed-by-name table
    # that an empty `lats` would otherwise have produced.
    complete = pd.DataFrame(columns=OUTPUT_COLUMNS)

# Sanity check: report when the number of distinct latitudes is not the expected one.
if len(complete['lat'].unique()) != EXPECTED_LAT_COUNT:
    print('ERROR: Number of unique lats not equal to 22')
# save complete to file
complete.to_parquet('data/training.parquet')
# load it back in, so `complete` reflects exactly what was written
complete = pd.read_parquet('data/training.parquet')



    