In [None]:
from pathlib import Path
import requests
from PIL import Image
from io import BytesIO

import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sn
import matplotlib.pyplot as plt

# Predicting droughts and weather from weather data
The International Crops Research Institute for the Semi-Arid Tropics (ICRISAT) is an international organization which conducts agricultural research for rural development, headquartered in Hyderabad, Telangana, India.

Founded in 1972 by a consortium of organisations convened by the Ford and the Rockefeller foundations, ICRISAT’s charter was signed by The Food and Agriculture Organization of the United Nations (FAO) and The United Nations Development Programme (UNDP).

ICRISAT has been collecting detailed daily weather data in Hyderabad since 1978, and published a 40-year dataset of daily weather readings. I will use this dataset as the first dataset for my analysis.

## Datasets
1. ICRISAT daily weather dataset for 1978-2018, collected in Hyderabad.
1. Integrated drought index data monthly dataset for 1951-2016, of Hyderabad.
1. Qualitative indication of drought years for 1901-2009, of Telangana state, India.

## Questions
1. Which, if any, of the weather features collected by ICRISAT appear to be correlated with each other?
1. Which months had the highest mean and median rainfall, humidity and radiation? Which months had the lowest?
1. Analyse the relationship between FAO56 ET (proxying crop yield) and other weather features.
1. Can any of the weather features collected indicate or predict drought?


Firstly, I will write a function to load data from the Vocareum folder into a pandas DataFrame.

The `load_data()` function also prints `DataFrame.info()`, `DataFrame.describe()` and `DataFrame.head()` to provide a sense of what the dataset contains.

In [None]:
def load_data(file: Path, **kwargs):
    '''
    Wrapper function that
        1. Loads data into a DataFrame, and
        2. Prints df.info(), df.describe() and df.head().

    The reader is chosen from the file extension, compared case-insensitively:
    an extension containing "xls" is read with pd.read_excel(), one containing
    "csv" with pd.read_csv().

    :param file: Path (or path string) of file location
    :param kwargs: kwargs to pass to pd.read_csv() or pd.read_excel()
    :return: pd.DataFrame
    :raises NotImplementedError: if the file extension is not supported
    '''
    file = Path(file)
    # Lower-case the extension so that e.g. "DATA.CSV" or "book.XLSX" also load.
    suffix = file.suffix.lower()
    if 'xls' in suffix:
        df = pd.read_excel(file, **kwargs)
    elif 'csv' in suffix:
        df = pd.read_csv(file, **kwargs)
    else:
        raise NotImplementedError(
            f"Unsupported file type {file.suffix!r}: expected an Excel (.xls*) or .csv file"
        )

    # Print a quick overview of the freshly loaded data.
    print("===== df.info output =====")
    df.info()
    print("\n\n===== df.describe output =====\n", df.describe())
    print("\n\n===== df.head output =====\n", df.head())
    return df

## Dataset: ICRISAT weather observations

In [None]:
# Data files are resolved relative to the directory the notebook is run from.
cwd = Path.cwd()
weather_df = load_data(cwd.joinpath("data", "ICRISAT Weather 1978 to 2018.xlsx"))

### Dataset: ICRISAT - fields
The ICRISAT data is at the day level and includes the following fields:

| #   | Column Name        | Description                                   |
|-----|:-------------------|:----------------------------------------------|
| 0   | Station            | Single value "ICRISAT"                        |
| 1   | Date               | Date of collection of weather data            |
| 2   | MaxT               | Maximum temperature (°C)                      |
| 3   | MinT               | Minimum temperature (°C)                      |
| 4   | RH1                | Relative humidity in the morning (%)          |
| 5   | RH2                | Relative humidity in the afternoon (%)        |
| 6   | Wind               | Wind (km/h)                                   |
| 7   | Rain               | Rain (mm)                                     |
| 8   | SSH                | Bright sunshine (hour)                        |
| 9   | Evap               | Evaporation (mm)                              |
| 10  | Radiation          | Radiation (mm/hour)                           |
| 11  | FAO56<sub>ET</sub> | FAO 56 reference crop evapotranspiration (mm) |
| 12  | Lat                | Latitude of collection point                  |
| 13  | Lon                | Longitude of collection point                 |
| 14  | Cum<sub>Rain</sub> | Cumulative monthly rainfall (mm)              |


### Dataset: ICRISAT - Preview of data

In [None]:
# Reproducible 10-row random sample (fixed seed), displayed in index order.
weather_df.sample(n=10, random_state=42).sort_index()

### Dataset: ICRISAT - location of station
The coordinates of the ICRISAT research station are at:

In [None]:
# (latitude, longitude) read from the row labelled 0.
coordinates = tuple(weather_df.at[0, column] for column in ('Lat', 'Lon'))
coordinates

Let's visualise this location on a map:

In [None]:
# Plot each distinct (Lat, Lon) pair once rather than stacking one marker per
# daily row of the weather table on the same spot.
px.scatter_mapbox(weather_df[['Lat', 'Lon']].drop_duplicates(),
                  lat='Lat', lon='Lon', mapbox_style='open-street-map')

## Dataset: Integrated Drought Index (IDI)

Drought is a multifaceted weather phenomenon and can be characterised as meteorological, agricultural, hydrological, and groundwater droughts.

A drought index can be used to characterize drought, typically using gridded maps at regional and national levels. These indices help to characterise the different types of drought and quantify severity levels as well as onset and termination of drought.

The Integrated Drought Index (IDI) is a drought index developed by Shah & Mishra (2020) that integrates the effects of these types of drought into a single indicator.

Shah & Mishra (2020) have published the monthly IDI values from their study on a 0.25° grid. The grid cell closest to the ICRISAT station was selected and is read below into `drought_df`.

In [None]:
# The IDI file has no header row and its columns are separated by whitespace.
# sep=r'\s+' is the equivalent of delim_whitespace=True, which newer pandas deprecates.
drought_df = load_data(cwd / "data" / "data_17.625_78.375.csv", header=None,
                       sep=r'\s+', names=['year', 'month', 'idi'])

### Dataset: IDI - fields

The IDI data is at the month level and includes the following fields:

| #   | Column Name | Description                            |
|-----|:------------|:---------------------------------------|
| 0   | year        | Year                                   |
| 1   | month       | Month                                  |
| 2   | idi         | Integrated Drought Index - index value |

### Dataset: IDI - Preview of data

In [None]:
# Reproducible 10-row random sample (fixed seed), displayed in index order.
drought_df.sample(n=10, random_state=42).sort_index()

# Question 1
>Which, if any, of the weather features collected by ICRISAT are related to each other?

To answer this, I will show a correlation matrix of the ICRISAT dataset.

This can be done directly in a pandas dataframe with `DataFrame.corr()`. To increase visual understanding, I will plot it with a `seaborn` heatmap.

In [None]:
# Correlate the numeric columns only: the text 'Station' column has no
# correlation, and pandas >= 2.0 raises on it instead of silently dropping it.
sn.heatmap(weather_df.select_dtypes(include='number').corr(), annot=True)
plt.show()

I will also query the correlation matrix directly to obtain descriptive statistics, and the top five correlated pairs of variables.

In [None]:
# Long-format table of pairwise correlations: one row per unordered variable pair.
corr_matrix = (weather_df
               # numeric columns only; pandas >= 2.0 raises on the text 'Station' column
               .select_dtypes(include='number')
               .corr()
               .stack()
               .reset_index()
               .rename(columns={'level_0': 'var1', 'level_1': 'var2', 0: 'value'})
               # var1 < var2 keeps each pair once and also removes the diagonal
               .query("var1 < var2")
               )

corr_matrix.describe()

In [None]:
# Rank the pairs by signed correlation value (largest positive first) and
# show the first five with a fresh 0..4 index.
corr_matrix.sort_values('value', ascending=False).iloc[:5].reset_index(drop=True)

## Question 1 - conclusion

Most variables of the ICRISAT dataset are not highly correlated with each other.

The top 5 pairs of variables that exhibit correlation are:
1. Evaporation (mm) and FAO 56 reference crop evapotranspiration (mm)
2. Evaporation (mm) and Maximum temperature (°C)
3. FAO 56 reference crop evapotranspiration (mm) and Maximum temperature (°C)
4. FAO 56 reference crop evapotranspiration (mm) and Radiation (mm/hour)
5. Radiation (mm/hour) and Bright sunshine (hour)

# Question 2
>Which months had the highest mean and median rainfall, humidity and radiation? Which months had the lowest?

In [None]:
# Monthly (month-end) mean and median of the daily measurements.
# Only numeric columns are aggregated: the text 'Station' column cannot be averaged and
# pandas >= 2.0 raises on it instead of silently dropping it.
# An explicit MonthEnd offset is used instead of the 'M' alias, which newer pandas
# deprecates in favour of 'ME'.
weather_df_monthly_mean = (weather_df
                           .set_index('Date')
                           .select_dtypes(include='number')
                           .resample(pd.offsets.MonthEnd()).mean()
                           .reset_index()
                           .rename(columns={'Date': 'Month'})
                           )

weather_df_monthly_median = (weather_df
                             .set_index('Date')
                             .select_dtypes(include='number')
                             .resample(pd.offsets.MonthEnd()).median()
                             .reset_index()
                             .rename(columns={'Date': 'Month'})
                             )

First, create a convenience function to print the highest and lowest values and the months in which they occurred.

If multiple months exist with the same highest or lowest value, the first month will be printed.

In [None]:
def print_weather_stats(agg_type, weather_var):
    '''
    Print the months with the highest and the lowest monthly value of a weather variable.

    The monthly table is picked by ``agg_type`` from the module-level
    ``weather_df_monthly_mean`` / ``weather_df_monthly_median`` DataFrames.
    One line is printed for the row holding the maximum of ``weather_var`` and
    one for the row holding the minimum.

    :param agg_type: 'mean' or 'median' - which monthly aggregation to inspect
    :param weather_var: name of the column to inspect
    :return: None
    :raises ValueError: if agg_type is not 'mean' or 'median'
    '''
    monthly_tables = {'mean': weather_df_monthly_mean, 'median': weather_df_monthly_median}
    if agg_type not in monthly_tables:
        raise ValueError(
            f"agg_type must be one of {sorted(monthly_tables)}, got {agg_type!r}"
        )
    monthly = monthly_tables[agg_type]
    values = monthly[weather_var]

    # idxmax/idxmin give the row label of the extreme value.
    for label, row in (('highest', values.idxmax()), ('lowest', values.idxmin())):
        month = monthly.at[row, 'Month']
        value = monthly.at[row, weather_var]
        print(f"{month:%B %Y} had the {agg_type} {label} {weather_var} of {value:.2f}.")

## Rainfall

In [None]:
# Extreme months for rainfall, first by monthly median, then by monthly mean.
for agg_type in ('median', 'mean'):
    print_weather_stats(agg_type, 'Rain')

## Humidity

In [None]:
# Extreme months for morning relative humidity (RH1), by monthly median then mean.
for agg_type in ('median', 'mean'):
    print_weather_stats(agg_type, 'RH1')

## Radiation

In [None]:
# Extreme months for radiation, first by monthly median, then by monthly mean.
for agg_type in ('median', 'mean'):
    print_weather_stats(agg_type, 'Radiation')

In [None]:
# For every IDI row, flag whether the index is positive (is_pos) and count how many
# consecutive rows, up to and including this one, share that same flag (days).
# NOTE(review): drought_df is read as monthly data, so 'days' appears to count
# consecutive rows (months) rather than calendar days - confirm the intended unit.
drought_df_daily = (drought_df
                    .assign(is_pos=lambda x: x.idi.gt(0),
                            # +1 where the index is positive, -1 otherwise
                            ds=lambda x: x.is_pos.astype(int).replace(0, -1),
                            # a new run starts whenever ds differs from the previous row;
                            # the running sum of ds inside a run is +/- the run length so far
                            days=lambda x: (x.ds
                                            .groupby(x.ds.ne(x.ds.shift()).cumsum())
                                            .cumsum()
                                            .abs())
                            )
                    # ds is only a helper for computing days
                    .drop(columns='ds')
                    )