In [1]:
import pandas as pd

# Load the five input tables in one pass; the order of the file names below
# matches the order of the variable names on the left-hand side.
prec, soil, ed, prodq, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Daily Precipitation.csv",
        "Daily Soil Moisture.csv",
        "Eight Day NDVI.csv",
        "Production Quantity.csv",
        "predicted_production_qty.csv",
    )
)


In [2]:
def clean_date(*data, date_col=("start_date",)):
    """Clean and parse the date columns of each DataFrame, in place.

    For every frame in ``data`` the "end_date" column is dropped (if present)
    and each column named in ``date_col`` is converted from a string such as
    "2014-01-01T00:00:00.000Z" to ``datetime64`` by removing the dashes and
    parsing the first 8 characters as ``%Y%m%d``.

    The function is safe to call more than once on the same frame (for
    example when a notebook cell is re-run): a missing "end_date" column and
    already-parsed date columns are left untouched.

    Parameters
    ----------
    *data : pandas.DataFrame
        Frames to clean. They are modified in place.
    date_col : str or iterable of str, default ("start_date",)
        Name(s) of the column(s) to parse.

    Returns
    -------
    None
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(date_col, str):
        date_col = (date_col,)
    for datum in data:
        # Drop column "end_date" since it does not match with production quantity dataset.
        # errors="ignore" keeps a second call from raising KeyError.
        datum.drop(columns="end_date", inplace=True, errors="ignore")
        for col in date_col:
            if pd.api.types.is_datetime64_any_dtype(datum[col]):
                # Already parsed by an earlier call; the .str accessor would fail here
                continue
            # Remove the dashes literally (not as a regex) and keep the
            # first 8 characters, i.e. YYYYMMDD
            compact = datum[col].str.replace("-", "", regex=False).str[:8]
            # Parse col to datetime type
            datum[col] = pd.to_datetime(compact, format="%Y%m%d")


# Clean the four raw frames in place: clean_date drops "end_date" and parses
# "start_date" to datetime so the frames can later be merged on that column
clean_date(prec, soil, ed, prodq)

In `ed`, the NDVI score is measured once every 8 days. To obtain a daily series, I add the 7 missing dates between each `start_date` and the next observation, and fill in their scores by linear interpolation between the previous and the next observed value, using the formula
$$
ndvi_{t+k} = ndvi_{t} + \frac{ndvi_{t+8} - ndvi_{t}}{(t + 8) - t} \cdot k, \qquad k = 1, \dots, 7
$$
<br>
For example, the ndvi on start_date of 2014-01-01 and 2014-01-09 are 0.701431 and 0.745149. Then, the ndvi on the start_date of 2014-01-04 is<br>
0.701431 + (0.745149 - 0.701431) / 8 * 3<br>
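This is linear interpolation between two consecutive 8-day observations (the idea was loosely inspired by back propagation in Deep Learning, although the computation itself is unrelated to it).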



`ed` below is a DataFrame that contains every `start_date` from the oldest to the latest; the NDVI values of the added dates are imputed with the formula explained above.

In [3]:
# File used to cache the daily (imputed) NDVI table between runs
ED_CACHE = 'ed.csv'


def impute_daily_ndvi(df, date_col="start_date", value_col="ndvi", group_col="region_id"):
    """Expand 8-day NDVI observations to a daily series by linear interpolation.

    Each region is handled separately so that values are never interpolated
    between two different regions. Within a region, every calendar day between
    the first and the last observation gets a row; observed days keep their
    value and the days in between are filled linearly, e.g. with 0.701431 on
    2014-01-01 and 0.745149 on 2014-01-09 the value on 2014-01-04 is
    0.701431 + (0.745149 - 0.701431) / 8 * 3.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_col`` (datetime), ``value_col`` and ``group_col``.
    date_col, value_col, group_col : str
        Column names of the date, the NDVI value and the region key.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``[date_col, value_col, group_col]`` and one
        row per region per day. ``df`` itself is not modified.
    """
    columns = [date_col, value_col, group_col]
    pieces = []
    for region, region_df in df.groupby(group_col):
        daily = (
            region_df.set_index(date_col)[value_col]
            .sort_index()
            # One row per calendar day; days without an observation become NaN
            .resample("D")
            .mean()
            # On an evenly spaced daily grid, "linear" yields the intended
            # straight line between consecutive observations
            .interpolate(method="linear")
        )
        piece = daily.reset_index()
        piece[group_col] = region
        pieces.append(piece[columns])
    if not pieces:
        return pd.DataFrame(columns=columns)
    return pd.concat(pieces, ignore_index=True)


# Reuse the cached daily table when it exists; otherwise build it from the
# cleaned 8-day `ed` frame and store it, so the notebook also runs from a
# fresh checkout that does not ship the cache file.
try:
    ed = pd.read_csv(ED_CACHE, parse_dates=['start_date'])
except FileNotFoundError:
    ed = impute_daily_ndvi(ed)
    ed.to_csv(ED_CACHE, index=False)

In [4]:
# Join the three daily tables. merge() without `on=` joins on every column
# name the frames share (expected here: start_date and region_id). The result
# is indexed by start_date so it can be grouped by calendar month afterwards.
merged_data = (
    prec
    .merge(soil)
    .merge(ed)
    .set_index('start_date')
)

Each row of `grouped_merged` represents one whole month for one region, so I take the average of `precip`, `smos`, and `ndvi` over all days in that month.

In [5]:
# Average the daily values per calendar month (month-start frequency on the
# start_date index) and per region_id, then turn the group keys back into
# ordinary columns.
grouped_merged = (
    merged_data
    .groupby([pd.Grouper(freq='MS'), 'region_id'])
    .mean()
    .reset_index()
)
# Each start_date row in the production quantity table stands for a whole
# month, so the monthly averages can now be merged with it.
train = grouped_merged.merge(prodq)

In [6]:
# The crop is seasonal, so the month (rather than the year) is what matters
# for prediction; start_date is therefore replaced by the month name.
def date_to_month(df, date_col):
    """Add a 'month' column holding the month name of ``date_col``, then remove ``date_col``.

    ``df`` is modified in place and nothing is returned.
    """
    month_names = df[date_col].dt.month_name()
    df['month'] = month_names
    df.drop(date_col, axis=1, inplace=True)

# One-hot encoding of the categorical columns region_id and month
def oh(df):
    """Return a copy of ``df`` with 'region_id' and 'month' one-hot encoded.

    The first level of each encoded column is dropped; ``df`` itself is left
    unchanged.
    """
    categorical = ['region_id', 'month']
    return pd.get_dummies(df, columns=categorical, drop_first=True)

# Replace train's start_date with a month-name column (train is modified in place)
date_to_month(train, 'start_date')
# Keep an untouched copy of the first two columns of test before they are cleaned.
# Presumably these are start_date and end_date in their original string format
# (the final cell selects those two names after attaching this copy) — verify
# against the CSV header.
original_dates = test.iloc[:, :2].copy()
# Drops end_date from test and parses its start_date to datetime, in place
clean_date(test)
# Since this is a prediction for 12 months of 2021 in different regions, I clean the start_date column to month
date_to_month(test, 'start_date')

In [7]:
# Prediction = mean 'prod' of the same month and the same region across all
# years in the training data; the group keys are restored as plain columns.
predicted = (
    train
    .groupby(['month', 'region_id'])['prod']
    .mean()
    .reset_index()
)

In [8]:
def to_date(df, year=2021):
    """Convert the 'month' column back to a datetime 'start_date' column, in place.

    The month name (e.g. "January") is combined with ``year`` and parsed as
    the first day of that month. The 'month' column is removed afterwards.
    No temporary column is created, so an existing 'year' column in ``df`` is
    neither overwritten nor dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'month' column of full English month names. Modified in place.
    year : int, default 2021
        Year to attach to every month.

    Returns
    -------
    None
    """
    # "%Y" expects a 4-digit year, hence the zero padding
    year_text = f"{int(year):04d}"
    df['start_date'] = pd.to_datetime(year_text + df['month'], format='%Y%B')
    df.drop(columns=['month'], inplace=True)

# Rebuild a datetime start_date from the month name in both frames (in place)
to_date(predicted)
to_date(test)
# Remove test's 'prod' column so it is not used as a join key below and the
# 'prod' column of the result comes from `predicted`
test.drop(columns=['prod'], inplace=True)
# merge() without `on=` performs an inner join on all column names the two
# frames share (presumably region_id and start_date — confirm against test's columns)
predicted = predicted.merge(test)

In [9]:
# Sort the predicted values by region, followed by date, then renumber the rows.
# pd.concat(axis=1) aligns rows by index label, and sort_values keeps the old
# labels, so without the reset the sort would have no effect on which
# prediction is paired with which original date.
predicted = (
    predicted
    .sort_values(['region_id', 'start_date'])
    .drop(columns=['start_date'])
    .reset_index(drop=True)
)
# Positional pairing is only meaningful when both tables have the same number
# of rows; fail loudly instead of silently producing NaN-padded rows.
if len(predicted) != len(original_dates):
    raise ValueError(
        f"predicted has {len(predicted)} rows but original_dates has "
        f"{len(original_dates)}; cannot attach the original dates by position"
    )
# Convert the date to the original format.
# NOTE(review): this pairs row i of the sorted predictions with row i of the
# original file, i.e. it assumes the original file is ordered by region_id and
# then start_date — confirm against the CSV.
predicted = pd.concat([predicted, original_dates.reset_index(drop=True)], axis=1)
# Reorder columns
predicted = predicted[['start_date', 'end_date', 'prod', 'region_id']]

In [11]:
# Write the final predictions to CSV without the index column
predicted.to_csv('dtnghia1987@gmail.com.csv', index=False)