In [286]:

import pandas as pd
import datetime

# Load the five raw inputs in one pass; the targets on the left line up
# positionally with the file names below:
#   prec  <- daily precipitation      soil  <- daily soil moisture
#   ed    <- eight-day NDVI           prodq <- production quantity
#   preds <- template of the production quantities to predict
prec, soil, ed, prodq, preds = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Daily Precipitation.csv",
        "Daily Soil Moisture.csv",
        "Eight Day NDVI.csv",
        "Production Quantity.csv",
        "predicted_production_qty.csv",
    )
)

In [287]:
def clean_date(*data, date_col=("start_date",)):
    """Clean and parse the date columns of each DataFrame to the datetime type.

    Every frame is modified in place: its "end_date" column is dropped (when
    present) and each column named in ``date_col`` is converted from strings
    such as "2021-01-01T00:00:00.000Z" to ``datetime64`` values holding only
    the calendar date.

    The function is safe to call again on frames it has already cleaned, so
    re-running the notebook cell does not raise.

    Args:
        *data: DataFrames to clean in place.
        date_col: Name, or iterable of names, of the string date columns to parse.

    Returns:
        None. The frames passed in ``data`` are mutated.

    Raises:
        KeyError: If a column named in ``date_col`` is missing from a frame.
        ValueError: If a value does not start with a valid ``YYYY-MM-DD`` date.
    """
    # Accept a bare column name as well as a collection of names; iterating a
    # plain string would otherwise walk over its characters
    columns = [date_col] if isinstance(date_col, str) else list(date_col)
    for datum in data:
        # Drop column "end_date" since it does not match with production quantity dataset.
        # errors="ignore" keeps this a no-op when the column is already gone
        datum.drop(columns="end_date", inplace=True, errors="ignore")
        for col in columns:
            # Already parsed (e.g. the cell was re-executed): .str would raise here
            if pd.api.types.is_datetime64_any_dtype(datum[col]):
                continue
            # Keep the first 8 characters of col value: "2021-01-01T..." -> "20210101"
            compact = datum[col].str.replace("-", "", regex=False).str[:8]
            # Parse col to datetime type
            datum[col] = pd.to_datetime(compact, format="%Y%m%d")


# Clean the four raw frames in place: drop "end_date" and parse "start_date".
# NOTE(review): `preds` is not passed here, so its date columns remain raw
# strings — confirm this is intended.
clean_date(prec, soil, ed, prodq)


In `ed`, the NDVI score is measured only once every 8 days. To obtain a daily series, I add the missing dates that fall in the gap between each start_date and end_date (7 more dates) and fill in their scores by interpolating linearly between the previous and the next measurement, using the formula
$$
ndvi_{t+k} = ndvi_{t} + \frac{ndvi_{t+8} - ndvi_{t}}{(t + 8) - t} \cdot k, \qquad k = 1, \dots, 7
$$
<br>
For example, the NDVI values for the start_dates 2014-01-01 and 2014-01-09 are 0.701431 and 0.745149. The NDVI for the start_date 2014-01-04, which is 3 days after the first measurement, is then<br>
0.701431 + (0.745149 - 0.701431) / 8 * 3<br>
This method is linear interpolation between two consecutive measurements; the idea was inspired by back propagation in Deep Learning.



`ed` below is a DataFrame that contains every start_date from the oldest to the latest; the NDVI values of the added dates are imputed with the formula explained above.

In [288]:
# The commented-out code below is the one-off script that produced 'ed.csv':
# it adds the dates missing from the 8-day NDVI frame and imputes their ndvi
# values. It is kept for reference only; this cell just loads the saved result.
# NOTE(review) — issues to fix before re-enabling the script:
#   - unique_date() removes items from `dates` while iterating over it, so the
#     loop skips the element that follows each removed one.
#   - closest_date() relies on searchsorted(), which returns the insertion
#     position (the first entry >= date) rather than the closest earlier date
#     described in its docstring; it presumably also needs `start_date` to be
#     sorted — confirm.
#   - impute_ndvi() is called twice per date, once for each returned value.
#   - DataFrame.append() was removed in pandas 2.0; collect the rows in a list
#     and build the frame once instead.
# def unique_date(df, date_col):
#     """This function returns a list of dates that were between newest and oldest dates in date_col values"""
#     dates = pd.date_range(start=min(df[date_col]), end=max(df[date_col])).to_list()
#     for date in dates:
#         if date in df[date_col].values:
#             dates.remove(date)
#     return dates


# def closest_date(date, df, date_col):
#     """This function returns the index of the closest date in df[date_col] that was before the date"""
#     index = df[date_col].searchsorted(date)
#     return index


# def impute_ndvi(date, df, index, date_col):
#     """This function applies the formula above. Also, it returns the value of region_id"""
#     pre_nvdi_value = df.iloc[index, -2]
#     if index + 1 < len(df):
#         post_nvdi_value = df.iloc[index + 1, -2]
#     else:
#         post_nvdi_value = pre_nvdi_value
#     # Time difference between t and t-1 in terms of days
#     time_delta = (df.loc[index, date_col] - date).days
#     result = pre_nvdi_value + (post_nvdi_value - pre_nvdi_value) * time_delta / 8
#     return result, df.loc[index, "region_id"]


# ed_dates = unique_date(ed, "start_date")
# new = pd.DataFrame()
# for date in ed_dates:
#     index = closest_date(date, ed, "start_date")
#     new = new.append(
#         {
#             "start_date": date,
#             "ndvi": impute_ndvi(date, ed, index, "start_date")[0],
#             "region_id": impute_ndvi(date, ed, index, "start_date")[1],
#         },
#         ignore_index=True,
#     )
# ed = pd.concat([ed, new], join="inner")
# ed.to_csv('ed.csv', index=False)
# Load the saved result, replacing the `ed` frame read from "Eight Day NDVI.csv".
# 'ed.csv' must already exist in the working directory for this cell to run.
ed = pd.read_csv('ed.csv', parse_dates=['start_date'])

In [289]:
# Inner-join the three daily frames on the columns they have in common
# (intended keys: start_date and region_id), then index the result by date
merged_data = (
    prec
    .merge(soil)
    .merge(ed)
    .set_index('start_date')
)

In the grouped_merged dataset, each row represents a whole month for one region. Its precip, smos, and ndvi values are therefore the averages over all days of that month.

In [290]:
# Collapse the daily rows into one row per (month, region_id): the 'MS' grouper
# buckets the start_date index by month start and every bucket is averaged
grouped_merged = (
    merged_data
    .groupby([pd.Grouper(freq='MS'), 'region_id'])
    .mean()
    .reset_index()
)
# The monthly rows can now be merged with production quantity, where each
# start_date row represents a whole month
# NOTE(review): the final reset_index() keeps the pre-sort index as an "index" column
train = (
    grouped_merged
    .merge(prodq)
    .sort_values('region_id')
    .reset_index()
)

In [291]:
# Preview the first rows of the merged training frame
train.head()

Unnamed: 0,index,start_date,region_id,precip,smos,ndvi,prod
0,0,2015-01-01,93,9.595679,0.378895,0.761228,171725
336,336,2017-11-01,93,9.640686,0.296313,0.821147,326891
346,346,2017-12-01,93,2.706951,0.35653,0.81391,321723
80,80,2015-09-01,93,2.129571,0.233892,0.617915,299337
356,356,2018-01-01,93,1.548852,0.467739,0.793595,265944


In [292]:
# Display the prediction template loaded from "predicted_production_qty.csv"
preds

Unnamed: 0,start_date,end_date,prod,region_id
0,2021-01-01T00:00:00.000Z,2021-01-31T00:00:00.000Z,,93
1,2021-02-01T00:00:00.000Z,2021-02-28T00:00:00.000Z,,93
2,2021-03-01T00:00:00.000Z,2021-03-31T00:00:00.000Z,,93
3,2021-04-01T00:00:00.000Z,2021-04-30T00:00:00.000Z,,93
4,2021-05-01T00:00:00.000Z,2021-05-31T00:00:00.000Z,,93
...,...,...,...,...
115,2021-08-01T00:00:00.000Z,2021-08-31T00:00:00.000Z,,105
116,2021-09-01T00:00:00.000Z,2021-09-30T00:00:00.000Z,,105
117,2021-10-01T00:00:00.000Z,2021-10-31T00:00:00.000Z,,105
118,2021-11-01T00:00:00.000Z,2021-11-30T00:00:00.000Z,,105
