This final notebook shows the process of merging and appending new data to the existing dataset; in this case, the new data are the values from 2021.

In [1]:
import sys

# Add the project's library folder to the module search path so it can be imported.
# The path is relative, so it is resolved against the current working directory.
sys.path.append("../../src")  # relative path where the library is stored

In [2]:
import pandas as pd
import os

In [3]:
# Root data folder, and the sub-folder that holds the 2021 output tables.
data_dir = "X:/data/ee"
out_dir = os.path.join(data_dir, "output_21")

In [4]:
# pd.set_option('display.max_rows', None)

In [5]:
# pd.set_option('display.max_columns', None)

In [6]:
# Load the NDVI table and reduce it to one row per (wb_adm0_na, year).
ndvi = pd.read_csv(os.path.join(out_dir, "ndvi.csv"), encoding="UTF-8")
# Rows tagged "LT04" are excluded before choosing a row per country-year.
ndvi = ndvi.loc[ndvi["landsat_id"] != "LT04"].copy()
# Assign the column directly instead of using ``.loc[:, col] = ...``: on
# pandas >= 2.0 the ``.loc`` form writes into the existing object column, the
# ordered categorical dtype is lost, and the sort below would silently fall
# back to alphabetical order (LC08 first).
ndvi["landsat_id"] = pd.Categorical(
    ndvi["landsat_id"], categories=["LE07", "LT05", "LC08"], ordered=True
)
# Within each country-year, order rows by the category order above so that
# keep="first" retains the row of the first available sensor in that order.
ndvi = (
    ndvi.sort_values(["wb_adm0_na", "year", "landsat_id"])
    .drop_duplicates(["wb_adm0_na", "year"], keep="first")
    .set_index(["wb_adm0_na", "year"])
)
# Keep only the columns whose name contains "ndvi".
ndvi = ndvi.loc[:, [col for col in ndvi.columns if "ndvi" in col]].copy()

In [7]:
# Load the EVI table and reduce it to one row per (wb_adm0_na, year).
evi = pd.read_csv(os.path.join(out_dir, "evi.csv"), encoding="UTF-8")
# Rows tagged "LT04" are excluded before choosing a row per country-year.
evi = evi.loc[evi["landsat_id"] != "LT04"].copy()
# Assign the column directly instead of using ``.loc[:, col] = ...``: on
# pandas >= 2.0 the ``.loc`` form writes into the existing object column, the
# ordered categorical dtype is lost, and the sort below would silently fall
# back to alphabetical order (LC08 first).
evi["landsat_id"] = pd.Categorical(
    evi["landsat_id"], categories=["LE07", "LT05", "LC08"], ordered=True
)
# Within each country-year, order rows by the category order above so that
# keep="first" retains the row of the first available sensor in that order.
evi = (
    evi.sort_values(["wb_adm0_na", "year", "landsat_id"])
    .drop_duplicates(["wb_adm0_na", "year"], keep="first")
    .set_index(["wb_adm0_na", "year"])
)
# Keep only the columns whose name contains "evi".
evi = evi.loc[:, [col for col in evi.columns if "evi" in col]].copy()

In [8]:
# Load the NDSI table and reduce it to one row per (wb_adm0_na, year).
ndsi = pd.read_csv(os.path.join(out_dir, "ndsi.csv"), encoding="UTF-8")
# Rows tagged "LT04" are excluded before choosing a row per country-year.
ndsi = ndsi.loc[ndsi["landsat_id"] != "LT04"].copy()
# Assign the column directly instead of using ``.loc[:, col] = ...``: on
# pandas >= 2.0 the ``.loc`` form writes into the existing object column, the
# ordered categorical dtype is lost, and the sort below would silently fall
# back to alphabetical order (LC08 first).
ndsi["landsat_id"] = pd.Categorical(
    ndsi["landsat_id"], categories=["LE07", "LT05", "LC08"], ordered=True
)
# Within each country-year, order rows by the category order above so that
# keep="first" retains the row of the first available sensor in that order.
ndsi = (
    ndsi.sort_values(["wb_adm0_na", "year", "landsat_id"])
    .drop_duplicates(["wb_adm0_na", "year"], keep="first")
    .set_index(["wb_adm0_na", "year"])
)
# Keep only the columns whose name contains "ndsi".
ndsi = ndsi.loc[:, [col for col in ndsi.columns if "ndsi" in col]].copy()

In [9]:
# Load the NDWI table and reduce it to one row per (wb_adm0_na, year).
ndwi = pd.read_csv(os.path.join(out_dir, "ndwi.csv"), encoding="UTF-8")
# Rows tagged "LT04" are excluded before choosing a row per country-year.
ndwi = ndwi.loc[ndwi["landsat_id"] != "LT04"].copy()
# Assign the column directly instead of using ``.loc[:, col] = ...``: on
# pandas >= 2.0 the ``.loc`` form writes into the existing object column, the
# ordered categorical dtype is lost, and the sort below would silently fall
# back to alphabetical order (LC08 first).
ndwi["landsat_id"] = pd.Categorical(
    ndwi["landsat_id"], categories=["LE07", "LT05", "LC08"], ordered=True
)
# Within each country-year, order rows by the category order above so that
# keep="first" retains the row of the first available sensor in that order.
ndwi = (
    ndwi.sort_values(["wb_adm0_na", "year", "landsat_id"])
    .drop_duplicates(["wb_adm0_na", "year"], keep="first")
    .set_index(["wb_adm0_na", "year"])
)
# Keep only the columns whose name contains "ndwi".
ndwi = ndwi.loc[:, [col for col in ndwi.columns if "ndwi" in col]].copy()

In [10]:
# CHIRPS table keyed and ordered by (wb_adm0_na, year).
chirps = (
    pd.read_csv(os.path.join(out_dir, "chirps.csv"), encoding="UTF-8")
    .set_index(["wb_adm0_na", "year"])
    .sort_values(["wb_adm0_na", "year"])
)
# Keep only the columns whose name contains "precipitation".
chirps = chirps.loc[
    :, [name for name in chirps.columns if "precipitation" in name]
].copy()

In [11]:
# Temperature table keyed and ordered by (wb_adm0_na, year).
lst = (
    pd.read_csv(os.path.join(out_dir, "temperature.csv"), encoding="UTF-8")
    .set_index(["wb_adm0_na", "year"])
    .sort_values(["wb_adm0_na", "year"])
)
# Keep only the columns whose name contains "temperature".
lst = lst.loc[:, [name for name in lst.columns if "temperature" in name]].copy()

In [12]:
# Outer-join the six indicator tables on their shared (wb_adm0_na, year) index,
# so a country-year that appears in any one table is kept in the result.
full_21 = ndvi
for indicator in (evi, ndsi, ndwi, chirps, lst):
    full_21 = full_21.join(indicator, how="outer")

All of the data is indexed by country name. This final step re-merges some key country attributes that weren't preserved in the earlier steps (ISO code, etc.).

In [14]:
# Order the combined table by country name, then year.
full_21 = full_21.sort_values(by=["wb_adm0_na", "year"])

In [15]:
# Admin-0 attribute table, read from the root data folder.
adm0 = pd.read_csv(
    os.path.join(data_dir, "Admin0_Polys_v3.csv"),
    encoding="UTF-8",
)

In [16]:
# Keep only the rows whose WDI flag equals 1.
adm0 = adm0[adm0["WDI"].eq(1)].copy()

In [17]:
# Discard the attribute columns that are not carried into the merged dataset.
adm0 = adm0.drop(
    columns=["WDI", "Data", "FID_100", "latY", "longX", "Shape_Leng", "Shape_Area"]
)

In [18]:
# Lower-case every column name.
adm0 = adm0.rename(columns=str.lower)

In [19]:
# Turn the (wb_adm0_na, year) index levels back into ordinary columns.
full_21 = full_21.reset_index()

In [20]:
# Rename the country-name column to the label used as the merge key with adm0.
full_21 = full_21.rename(columns={"wb_adm0_na": "wb_adm0_na2"})

In [21]:
# Attach the admin attributes; an inner merge keeps only rows whose key appears in both tables.
full_21_adm = full_21.merge(right=adm0, how="inner", on="wb_adm0_na2")

In [23]:
# The merge key is no longer needed once the attributes are attached.
full_21_adm = full_21_adm.drop(columns="wb_adm0_na2")

In [24]:
# Distinct wb_adm0_na values present in the filtered admin table.
valid_codes = adm0["wb_adm0_na"].unique()

In [25]:
# Keep only rows whose wb_adm0_na value appears in the admin table.
full_21_adm = full_21_adm[full_21_adm["wb_adm0_na"].isin(valid_codes)].copy()

In [43]:
# Target column order: adm0's columns followed by full_21's columns
# (labels present in both appear twice at this point).
cols = adm0.columns.append(full_21.columns)

In [45]:
# "wb_adm0_na2" was only the merge key and has been dropped from full_21_adm,
# so remove it from the column list as well.
cols = cols.drop("wb_adm0_na2")

In [46]:
# Reorder the columns to follow `cols`, using the first occurrence of each repeated label.
full_21_adm = full_21_adm.loc[:, cols[~cols.duplicated()]].copy()

In [47]:
# Order rows by country name, then year.
full_21_adm = full_21_adm.sort_values(by=["wb_adm0_na", "year"])

In [26]:
# Number of distinct wb_adm0_na values in the merged table.
len(full_21_adm["wb_adm0_na"].unique())

214

In [27]:
# Order rows by country name, then year (same ordering as the earlier sort cell).
full_21_adm = full_21_adm.sort_values(by=["wb_adm0_na", "year"])

In [49]:
# Existing country dataset; the first CSV column is used as the row index.
current_data = pd.read_csv(
    os.path.join(out_dir, "Country Dataset June7.csv"),
    index_col=0,
    encoding="UTF-8",
)

In [55]:
# Key both tables by (wb_adm0_na, year) so they can be stacked and compared by key.
full_21_adm = full_21_adm.set_index(["wb_adm0_na", "year"])
current_data = current_data.set_index(["wb_adm0_na", "year"])

In [74]:
# Stack the new 2021 rows under the existing dataset. `DataFrame.append` was
# removed in pandas 2.0; `pd.concat` performs the same row-wise concatenation
# (index preserved, columns aligned by name).
update = pd.concat([current_data, full_21_adm])

In [77]:
# Keep a single row per (wb_adm0_na, year) key, preferring the most recently
# appended one. `DataFrame.drop_duplicates` ignores the index and compares column
# values only, so it would keep both the old and the new row for a key whenever
# their values differ, and would collapse distinct keys whose values happen to
# match. De-duplicating on the index avoids both problems.
update = update.loc[~update.index.duplicated(keep="last")].copy()

In [80]:
# Final ordering: by country name, then year.
update = update.sort_values(by=["wb_adm0_na", "year"])