# Import

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score

from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm

# Load both CSV inputs into DataFrames: the main data table and the news table.
df = pd.read_csv(filepath_or_buffer="data/food_crises_interpol.csv")
news_df = pd.read_csv(filepath_or_buffer="data/articles_topics_positivity.csv")

# Processing Data

In [3]:
# Build a month-start timestamp for every row. The rendered frame shows
# `year_month` values such as "2009-11-01", which do not match format="%Y-%m"
# exactly; strict format matching in newer pandas releases rejects that input,
# so parse the string as-is and truncate to the first day of the month.
df["date"] = pd.to_datetime(df["year_month"]).dt.to_period("M").dt.to_timestamp()
# Number of distinct districts; dropna=False mirrors len(unique()), which counts NaN.
print(df["district"].nunique(dropna=False))
# Index by (date, county) so the news features can be joined on the same keys.
df = df.set_index(["date", "county"])

77


In [5]:
# Truncate each article date to the first day of its month so it lines up with
# the monthly `date` key used for grouping and joining. Period truncation does
# this directly, without formatting to a "%Y-%m" string and parsing it back.
news_df["date"] = (
    pd.to_datetime(news_df["date"], format="%Y-%m-%d")
    .dt.to_period("M")
    .dt.to_timestamp()
)

In [6]:
def create_news_features(columns, news=None):
    """Average the requested news columns per (date, county) group.

    Parameters
    ----------
    columns : iterable of str
        Names of the numeric columns to aggregate.
    news : pandas.DataFrame, optional
        Frame holding "date", "county" and the requested columns. Defaults to
        the notebook-level ``news_df`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One column per requested name, indexed by (date, county), holding the
        group mean with missing means replaced by 0.
    """
    if news is None:
        news = news_df  # fall back to the notebook-level table
    # Group once and aggregate all requested columns together instead of
    # rebuilding the same groupby for every column.
    grouped = news.groupby(["date", "county"])
    return grouped[list(columns)].mean().fillna(0)

In [7]:
# Aggregate every topic/sentiment score column to (date, county) means.
news_features = create_news_features(
    [
        "hunger_positive",
        "hunger_negative",
        "refugees_positive",
        "refugees_negative",
        "humanitarian_positive",
        "humanitarian_negative",
        "conflict_positive",
        "conflict_negative",
        "vulnerability_positive",
        "vulnerability_negative",
    ]
)

# Merge

In [8]:
# Order the rows by the first index level (date).
df = df.sort_index(level=0)

In [9]:
# Preview the sorted frame before the news features are joined.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,district,region,ipc,year_month,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,...,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct,prev_ipc,2prev_ipc,next_ipc
date,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2009-11-01,Abiemnhom,7940,Abiemnhom,Unity,2.0,2009-11-01,0.452458,97.17758,1.085453,0.415071,1.202695,...,0,0.943680,2408.233,1.742261,20702.00,11004.640,91.48276,2.0,2.0,2.0
2009-11-01,Akobo,5124,Akobo,Jonglei,3.0,2009-11-01,0.450313,79.39274,1.285720,-2.052032,5.713924,...,0,1.049001,9027.626,10.046920,158222.00,2787.772,89.71963,4.0,4.0,3.0
2009-11-01,Aweil Centre,6916,Aweil Center,Northern Bahr el Ghazal,1.0,2009-11-01,0.494648,101.18030,5.601856,3.108650,2.968859,...,0,0.908136,11202.960,0.173833,48376.00,3754.860,81.76336,2.0,2.0,1.0
2009-11-01,Aweil East,8580,Aweil East,Northern Bahr el Ghazal,2.0,2009-11-01,0.418445,97.18519,1.084202,0.599545,1.310700,...,0,0.932502,6400.981,26.804220,345977.00,3544.279,90.16216,3.0,3.0,1.0
2009-11-01,Aweil North,260,Aweil North,Northern Bahr el Ghazal,1.0,2009-11-01,0.422307,98.88503,1.596338,0.852844,1.387875,...,0,0.875988,6369.012,22.906570,146204.00,4808.697,93.45946,2.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01,Wulu,4094,Wulu,Lakes,2.0,2020-01-01,0.390809,115.31650,0.216196,-0.099790,1.810135,...,1,80.615710,11894.780,0.032678,45158.50,40366.550,65.10490,2.0,3.0,2.0
2020-01-01,Yambio,1150,Yambio,Western Equatoria,3.0,2020-01-01,0.480063,113.04510,0.609318,-0.765759,6.120980,...,1,74.486710,8896.165,19.126490,197278.90,60628.160,60.88350,2.0,3.0,3.0
2020-01-01,Yei,1022,Yei,Central Equatoria,3.0,2020-01-01,0.526050,124.30660,1.210738,-0.921322,11.679240,...,1,90.145640,6713.251,16.303490,288875.40,75813.310,62.85185,3.0,3.0,3.0
2020-01-01,Yirol East,4350,Yirol East,Lakes,3.0,2020-01-01,0.466752,115.83420,0.193205,-0.015236,3.880718,...,0,99.849660,5599.893,2.482445,73693.13,18421.020,91.75000,3.0,4.0,4.0


In [10]:
# Left join on the shared index: every row of df is kept, and rows without a
# matching news entry get NaN in the news columns.
df = df.join(other=news_features, how="left")

In [11]:
# Replace the NaN values left by the join with 0 in the news-feature columns.
# The column list is taken from `news_features` itself so it cannot drift from
# the list used to build the features.
columns_to_fill = list(news_features.columns)
df[columns_to_fill] = df[columns_to_fill].fillna(0)


In [12]:
# Preview the frame after joining the news features and zero-filling them.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,district,region,ipc,year_month,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,...,hunger_positive,hunger_negative,refugees_positive,refugees_negative,humanitarian_positive,humanitarian_negative,conflict_positive,conflict_negative,vulnerability_positive,vulnerability_negative
date,county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2009-11-01,Abiemnhom,7940,Abiemnhom,Unity,2.0,2009-11-01,0.452458,97.17758,1.085453,0.415071,1.202695,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-11-01,Akobo,5124,Akobo,Jonglei,3.0,2009-11-01,0.450313,79.39274,1.285720,-2.052032,5.713924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-11-01,Aweil Centre,6916,Aweil Center,Northern Bahr el Ghazal,1.0,2009-11-01,0.494648,101.18030,5.601856,3.108650,2.968859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-11-01,Aweil East,8580,Aweil East,Northern Bahr el Ghazal,2.0,2009-11-01,0.418445,97.18519,1.084202,0.599545,1.310700,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-11-01,Aweil North,260,Aweil North,Northern Bahr el Ghazal,1.0,2009-11-01,0.422307,98.88503,1.596338,0.852844,1.387875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01,Wulu,4094,Wulu,Lakes,2.0,2020-01-01,0.390809,115.31650,0.216196,-0.099790,1.810135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-01,Yambio,1150,Yambio,Western Equatoria,3.0,2020-01-01,0.480063,113.04510,0.609318,-0.765759,6.120980,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-01,Yei,1022,Yei,Central Equatoria,3.0,2020-01-01,0.526050,124.30660,1.210738,-0.921322,11.679240,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-01,Yirol East,4350,Yirol East,Lakes,3.0,2020-01-01,0.466752,115.83420,0.193205,-0.015236,3.880718,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Persist the merged frame (index included) as CSV.
df.to_csv(path_or_buf="data/food_crises_interpol_news.csv")