In [None]:
from pathlib import Path
import requests
from PIL import Image
from io import BytesIO

import pandas as pd
import numpy as np

# Predicting droughts and weather from weather data
The International Crops Research Institute for the Semi-Arid Tropics (ICRISAT) is an international organization which conducts agricultural research for rural development, headquartered in Hyderabad, Telangana, India.

Founded in 1972 by a consortium of organizations convened by the Ford and Rockefeller Foundations, ICRISAT’s charter was signed by the Food and Agriculture Organization of the United Nations (FAO) and the United Nations Development Programme (UNDP).

ICRISAT has been collecting detailed daily weather data in Hyderabad since 1978, and has published a 40-year dataset of daily weather readings. I will use this dataset as the first dataset for my analysis.

## Datasets
1. ICRISAT daily weather dataset for 1978-2018, collected in Hyderabad.
1. Monthly Integrated Drought Index (IDI) dataset for 1951-2016, for Hyderabad.
1. Qualitative indication of drought years for 1901-2009, for Telangana state, India.

## Questions
1. Which, if any, of the weather features collected by ICRISAT are related to each other?
1. Which years had the highest average and median rainfall, humidity and radiation? Which periods had the lowest?
1. Analyse the relationship between FAO56 ET (proxying crop yield) and other weather features.
1. Can any of the weather features collected indicate or predict drought?


Firstly, I will write a function to load data from the Vocareum folder into a pandas DataFrame.

The `load_data()` function also prints `DataFrame.info()`, `DataFrame.describe()` and `DataFrame.head()` to provide a sense of what the dataset contains.

In [None]:
def load_data(file: Path, **kwargs):
    '''
    Load an Excel or CSV file into a DataFrame and print a quick overview.

    The reader is chosen from the file extension, compared case-insensitively:
        1. extensions containing "xls" (e.g. .xls, .xlsx) use pd.read_excel(),
        2. extensions containing "csv" use pd.read_csv().
    After loading, df.info(), df.describe() and df.head() are printed.

    :param file: Path (or path string) of file location
    :param kwargs: kwargs to pass to pd.read_csv() or pd.read_excel()
    :return: pd.DataFrame
    :raises NotImplementedError: if the extension is neither Excel nor CSV
    '''
    file = Path(file)  # also accept plain string paths
    suffix = file.suffix.lower()  # so ".CSV" / ".XLSX" are recognised too

    # pick the pandas reader that matches the file extension
    if 'xls' in suffix:
        df = pd.read_excel(file, **kwargs)
    elif 'csv' in suffix:
        df = pd.read_csv(file, **kwargs)
    else:
        # report the offending suffix in the exception itself
        raise NotImplementedError(
            f"Unsupported file type {file.suffix!r}: expected an Excel or CSV file"
        )

    print("===== df.info output =====")
    df.info()  # info() prints directly, so it is not wrapped in print()
    print("\n\n===== df.describe output =====\n", df.describe())
    print("\n\n===== df.head output =====\n", df.head())
    return df

## Loading ICRISAT weather data

In [None]:
# Resolve data files relative to the directory the notebook is run from.
cwd = Path.cwd()
weather_file = cwd / "data" / "ICRISAT Weather 1978 to 2018.xlsx"
weather_df = load_data(weather_file)

## ICRISAT weather fields
The ICRISAT data includes the following fields:

| #  | Column Name        | Description                                   |
|--- |:-------------------|:----------------------------------------------|
| 0  | Station            | Single value "ICRISAT"                        |
| 1  | Date               | Date of collection of weather data            |
| 2  | MaxT               | Maximum temperature (°C)                      |
| 3  | MinT               | Minimum temperature (°C)                      |
| 4  | RH1                | Relative humidity in the morning (%)          |
| 5  | RH2                | Relative humidity in the afternoon (%)        |
| 6  | Wind               | Wind (km/h)                                   |
| 7  | Rain               | Rain (mm)                                     |
| 8  | SSH                | Bright sunshine (hour)                        |
| 9  | Evap               | Evaporation (mm)                              |
| 10 | Radiation          | Radiation (mm/hour)                           |
| 11 | FAO56<sub>ET</sub> | FAO 56 reference crop evapotranspiration (mm) |
| 12 | Lat                | Latitude of collection point                  |
| 13 | Lon                | Longitude of collection point                 |
| 14 | Cum<sub>Rain</sub> | Cumulative monthly rainfall (mm)              |


In [None]:
# The drought-index file is read without a header row and with
# whitespace-separated columns, so the column names are supplied here.
# sep=r"\s+" replaces delim_whitespace=True, which recent pandas deprecates.
drought_df = load_data(cwd / "data" / "data_17.625_78.375.csv", header=None,
                       sep=r"\s+", names=['year', 'month', 'idi'])

In [None]:
# Flag each row by the sign of its drought index and count how long the
# current same-sign run has lasted.
#   is_pos : True where idi > 0
#   days   : 1-based position of the row within its run of consecutive rows
#            that share the same is_pos value.
#            NOTE(review): the dataset is described as monthly, so this is
#            presumably a count of months; the column name is kept unchanged
#            for any later cells that rely on it.
ddf = (drought_df
       .assign(is_pos=lambda x: x.idi.gt(0),
               # a new run starts wherever is_pos differs from the previous row;
               # the cumulative sum of those breaks gives each run its own id
               days=lambda x: (x.groupby(x.is_pos.astype(int).diff().ne(0).cumsum())
                                .cumcount()
                                .add(1))
               )
       )

In [None]:
# Show the derived frame as the cell output to inspect the new columns.
ddf