In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
#import geopandas


In [4]:
# read csv file
#df = pd.read_csv(r'C:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\comprehensive_merge.csv')
df = pd.read_csv(r'C:\Users\MALMANZAR\CGIAR\Shi, Weilun (IFPRI) - Food Crisis and Insecurity\comprehensive_merge.csv')
print("DF" ,df.shape)


DF (388085, 53)


In [5]:

#drop geometry column
df = df.drop(columns=['geometry'])
# drop observation with missing phase_worse_percentage_manual
df = df.dropna(subset=['phase3_worse_percentage_manual'])
# drop columns with 50% more missing values
df = df.dropna(thresh=len(df)*0.5, axis=1)
# drop columns with all missing values
df = df.dropna(axis=1, how='all')

# drop columns with all zero values
df = df.loc[:, (df != 0).any(axis=0)]

# Assuming 'df' is your dataset as a pandas DataFrame

# Select features and target variable
X = df[['estimated_population','event_count_battles', 'event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence','nearest_neighbor_distance',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 
        'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','price_index']].values.astype(np.float32)
y = df['phase3_plus_phase4'].values.astype(np.float32)

print("X" , X.shape)
print("y" ,y.shape)






X (315776, 18)
y (315776,)


In [None]:
X.head()


In [8]:


# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# impute X_train_scaled
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train_scaled)
X_train_scaled = imp.transform(X_train_scaled)



# impute X_test_scaled
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_test_scaled)
X_test_scaled = imp.transform(X_test_scaled)


# Setting up the RF
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_scaled, y_train)
print("Trained")





Trained


In [10]:
y_pred = rf_regressor.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)


Mean Squared Error: 0.0001288981881663356
Root Mean Squared Error: 0.011353333790844678
Mean Absolute Error: 0.0016723832267519678
R-squared: 0.9968214771859986


In [15]:
# Obtain feature importances from the Random Forest model
importances = rf_regressor.feature_importances_

# Get the feature names from your dataset
feature_names = ['estimated_population','event_count_battles', 'event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence','nearest_neighbor_distance',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 
        'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','price_index']

# Print the feature importances
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance}")

estimated_population: 0.1504907034117259
event_count_battles: 0.017185309987697524
event_count_explosions: 0.007049468881585583
event_count_violence: 0.012437698238063465
fatalities_battles: 0.015347084596220216
fatalities_explosions: 0.004626226550170163
fatalities_violence: 0.011241935044376998
nearest_neighbor_distance: 0.05797985285836072
GOSIF_GPP: 0.06998317266666618
rainfall_chirps: 0.04813349859589511
GOSIF_GPP_SD: 0.06975181894096083
temperature_2m_mean: 0.08976427111206167
temperature_2m_mean_sd: 0.17694545845063994
shortwave_radiation_sum: 0.041028453037707475
shortwave_radiation_sum_sd: 0.039202429011621534
precipitation_sum: 0.031772936570108094
precipitation_sum_sd: 0.03182062069957322
price_index: 0.12523906134656526


In [9]:
# save model
#import pickle
#pickle.dump(rf_regressor, open(r'C:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\\3.Random_Forest_Training\rf_regressor.pkl', 'wb'))

# save scaler
#pickle.dump(scaler, open(r'C:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\\3.Random_Forest_Training\scaler.pkl', 'wb'))

# save test set
#np.save(r'C:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\\3.Random_Forest_Training\X_test_scaled.npy', X_test_scaled)

# save test set
#np.save(r'C:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\\3.Random_Forest_Training\y_test.npy', y_test)


In [101]:
# read csv file
df = pd.read_csv(r'C:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\comprehensive_merge.csv')
#drop geometry column
df = df.drop(columns=['geometry'])
# drop columns with all missing values
df = df.dropna(axis=1, how='all')

# drop columns with all zero values
df = df.loc[:, (df != 0).any(axis=0)]
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['area_id', 'date'])

import pandas as pd

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['area_id', 'date'])

df.set_index('date', inplace=True)

# List of columns you want to calculate rolling average for
cols_to_avg = ['event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 'elevation', 'soil',
       'price', 'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','nearest_neighbor_distance','price_index']

df_monthly = df.groupby('area_id')[cols_to_avg].resample('M').mean()

for col in cols_to_avg:
    df_monthly[col + '_12m_avg'] = df_monthly.groupby(level=0)[col].rolling(window=12).mean()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\.venv\Lib\site-packages\pandas\core\frame.py", line 11610, in _reindex_for_setitem
    reindexed_value = value.reindex(index)._values
                      ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\.venv\Lib\site-packages\pandas\core\series.py", line 4919, in reindex
    return super().reindex(
           ^^^^^^^^^^^^^^^^
  File "c:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\.venv\Lib\site-packages\pandas\core\generic.py", line 5360, in reindex
    return self._reindex_axes(
           ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\.venv\Lib\site-packages\pandas\core\generic.py", line 5375, in _reindex_axes
    new_index, indexer = ax.reindex(
                         ^^^^^^^^^^^
  File "c:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisi

In [102]:
df_monthly.reset_index(inplace=True)
df.reset_index(inplace=True)

In [103]:
# convert date to datetime
df_monthly['date'] = pd.to_datetime(df_monthly['date'])
df['date'] = pd.to_datetime(df['date'])
# change date to the first day of the month
df_monthly['date'] = df_monthly['date'].dt.strftime('%Y-%m-01')

# for each column other than area_id and date in df_monthly, rename the column to column + '_12m_avg'
for col in cols_to_avg:
    df_monthly[col+'_12_avg'] = df_monthly[col]
    df_monthly = df_monthly.drop(columns=[col])


# convert date to datetime
df_monthly['date'] = pd.to_datetime(df_monthly['date'])
df['date'] = pd.to_datetime(df['date'])
# merge df and df_monthly
df_new = pd.merge(df, df_monthly, how='left', on=['area_id', 'date'])
# drop original columns and rename the columns with _12_avg to the original column name

for col in cols_to_avg:
    df_new = df_new.drop(columns=[col])
    df_new = df_new.rename(columns={col+'_12_avg': col})
# drop observation with missing phase_worse_percentage_manual
df = df.dropna(subset=['phase3_worse_percentage_manual'])
# drop columns with 50% more missing values
df = df.dropna(thresh=len(df)*0.5, axis=1)
# drop columns with all missing values
df = df.dropna(axis=1, how='all')

# drop columns with all zero values
df = df.loc[:, (df != 0).any(axis=0)]

# Assuming 'df' is your dataset as a pandas DataFrame

# Select features and target variable
X = df[['estimated_population','event_count_battles', 'event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence','nearest_neighbor_distance',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 
        'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','price_index']].values.astype(np.float32)
y = df['phase3_plus_phase4'].values.astype(np.float32)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# impute X_train_scaled
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train_scaled)
X_train_scaled = imp.transform(X_train_scaled)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_scaled, y_train)


# impute X_test_scaled
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_test_scaled)
X_test_scaled = imp.transform(X_test_scaled)

In [109]:
y_pred = rf_regressor.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 0.010643312537536807
Root Mean Squared Error: 0.10316643125327543
Mean Absolute Error: 0.0757753004367654
R-squared: 0.7159731425170588


In [110]:
# Obtain feature importances from the Random Forest model
importances = rf_regressor.feature_importances_

# Get the feature names from your dataset
feature_names = ['estimated_population','event_count_battles', 'event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence','nearest_neighbor_distance',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 
        'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','price_index']

# Print the feature importances
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance}")

estimated_population: 0.15812652127193444
event_count_battles: 0.015005508468965822
event_count_explosions: 0.006834930087063135
event_count_violence: 0.010777222751171923
fatalities_battles: 0.017019351967655173
fatalities_explosions: 0.004183655948107346
fatalities_violence: 0.012355635093489789
nearest_neighbor_distance: 0.05733329083273019
GOSIF_GPP: 0.07619310539468775
rainfall_chirps: 0.051886844902740724
GOSIF_GPP_SD: 0.06938937061676173
temperature_2m_mean: 0.08824400867424732
temperature_2m_mean_sd: 0.16582877873132315
shortwave_radiation_sum: 0.042810250440663306
shortwave_radiation_sum_sd: 0.03692621159195161
precipitation_sum: 0.03007692512843207
precipitation_sum_sd: 0.03184992167797163
price_index: 0.125158466420103


In [163]:
# read csv file
df = pd.read_csv(r'C:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\comprehensive_merge.csv')
#drop geometry column
df = df.drop(columns=['geometry'])
# drop columns with all missing values
df = df.dropna(axis=1, how='all')

# drop columns with all zero values
df = df.loc[:, (df != 0).any(axis=0)]
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['area_id', 'date'])

import pandas as pd

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['area_id', 'date'])

df.set_index('date', inplace=True)

# List of columns you want to calculate rolling average for
cols_to_avg = ['event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 'elevation', 'soil',
       'price', 'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','nearest_neighbor_distance','price_index']

df_monthly = df.groupby('area_id')[cols_to_avg].resample('M').mean()

for col in cols_to_avg:
    df_monthly[col + '_12_avg'] = df_monthly.groupby('area_id')[col].rolling(window=6, min_periods=1).mean().reset_index(level=0, drop=True)


In [164]:
df_monthly.reset_index(inplace=True)
df.reset_index(inplace=True)

In [165]:
# convert date to datetime
df_monthly['date'] = pd.to_datetime(df_monthly['date'])
df['date'] = pd.to_datetime(df['date'])
# change date to the first day of the month
df_monthly['date'] = df_monthly['date'].dt.strftime('%Y-%m-01')


In [166]:

# for each column other than area_id and date in df_monthly, rename the column to column + '_12m_avg'
for col in cols_to_avg:
    df_monthly = df_monthly.drop(columns=[col])


# convert date to datetime
df_monthly['date'] = pd.to_datetime(df_monthly['date'])
df['date'] = pd.to_datetime(df['date'])
# merge df and df_monthly
df_new = pd.merge(df, df_monthly, how='left', on=['area_id', 'date'])
# drop original columns and rename the columns with _12_avg to the original column name

for col in cols_to_avg:
    df_new = df_new.drop(columns=[col])
    df_new = df_new.rename(columns={col+'_12_avg': col})
# drop observation with missing phase_worse_percentage_manual
df = df.dropna(subset=['phase3_worse_percentage_manual'])
# drop columns with 50% more missing values
df = df.dropna(thresh=len(df)*0.5, axis=1)
# drop columns with all missing values
df = df.dropna(axis=1, how='all')

# drop columns with all zero values
df = df.loc[:, (df != 0).any(axis=0)]

# Assuming 'df' is your dataset as a pandas DataFrame

# Select features and target variable
X = df[['estimated_population','event_count_battles', 'event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence','nearest_neighbor_distance',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 
        'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','price_index']].values.astype(np.float32)
y = df['phase3_plus_phase4'].values.astype(np.float32)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# impute X_train_scaled
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train_scaled)
X_train_scaled = imp.transform(X_train_scaled)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_scaled, y_train)


# impute X_test_scaled
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_test_scaled)
X_test_scaled = imp.transform(X_test_scaled)

In [167]:
y_pred = rf_regressor.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 0.011379663593989568
Root Mean Squared Error: 0.10667550606390189
Mean Absolute Error: 0.07561310160731201
R-squared: 0.697040778495385


In [168]:
# Obtain feature importances from the Random Forest model
importances = rf_regressor.feature_importances_

# Get the feature names from your dataset
feature_names = ['estimated_population','event_count_battles', 'event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence','nearest_neighbor_distance',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 
        'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','price_index']

# Print the feature importances
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance}")

estimated_population: 0.16037584887623985
event_count_battles: 0.015292427669536656
event_count_explosions: 0.006868191610703205
event_count_violence: 0.010725680789604172
fatalities_battles: 0.015642792605600055
fatalities_explosions: 0.004386836611664744
fatalities_violence: 0.011628557570826894
nearest_neighbor_distance: 0.05567218937350375
GOSIF_GPP: 0.07335559392120494
rainfall_chirps: 0.051785111305077965
GOSIF_GPP_SD: 0.06653387437862394
temperature_2m_mean: 0.08814856741178646
temperature_2m_mean_sd: 0.17186510848778094
shortwave_radiation_sum: 0.03965378434487858
shortwave_radiation_sum_sd: 0.03797881056280124
precipitation_sum: 0.03033480184036122
precipitation_sum_sd: 0.033098183362711774
price_index: 0.1266536392770937


In [158]:
# read csv file
df = pd.read_csv(r'C:\Users\WeilunShi\OneDrive - CGIAR\Desktop\Food Crisis and Insecurity\comprehensive_merge.csv')
#drop geometry column
df = df.drop(columns=['geometry'])
# drop columns with all missing values
df = df.dropna(axis=1, how='all')

# drop columns with all zero values
df = df.loc[:, (df != 0).any(axis=0)]
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['area_id', 'date'])

import pandas as pd

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['area_id', 'date'])

df.set_index('date', inplace=True)

# List of columns you want to calculate rolling average for
cols_to_avg = ['event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 'elevation', 'soil',
       'price', 'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','nearest_neighbor_distance','price_index']

df_monthly = df.groupby('area_id')[cols_to_avg].resample('M').mean()

for col in cols_to_avg:
    df_monthly[col + '_12_avg'] = df_monthly.groupby('area_id')[col].rolling(window=12, min_periods=1).mean().reset_index(level=0, drop=True)

In [159]:
df_monthly.reset_index(inplace=True)
df.reset_index(inplace=True)

In [160]:
# convert date to datetime
df_monthly['date'] = pd.to_datetime(df_monthly['date'])
df['date'] = pd.to_datetime(df['date'])
# change date to the first day of the month
df_monthly['date'] = df_monthly['date'].dt.strftime('%Y-%m-01')


In [161]:
# for each column other than area_id and date in df_monthly, rename the column to column + '_12m_avg'
for col in cols_to_avg:
    df_monthly = df_monthly.drop(columns=[col])


# convert date to datetime
df_monthly['date'] = pd.to_datetime(df_monthly['date'])
df['date'] = pd.to_datetime(df['date'])
# merge df and df_monthly
df_new = pd.merge(df, df_monthly, how='left', on=['area_id', 'date'])
# drop original columns and rename the columns with _12_avg to the original column name

for col in cols_to_avg:
    df_new = df_new.drop(columns=[col])
    df_new = df_new.rename(columns={col+'_12_avg': col})
# drop observation with missing phase_worse_percentage_manual
df = df.dropna(subset=['phase3_worse_percentage_manual'])
# drop columns with 50% more missing values
df = df.dropna(thresh=len(df)*0.5, axis=1)
# drop columns with all missing values
df = df.dropna(axis=1, how='all')

# drop columns with all zero values
df = df.loc[:, (df != 0).any(axis=0)]

# Assuming 'df' is your dataset as a pandas DataFrame

# Select features and target variable
X = df[['estimated_population','event_count_battles', 'event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence','nearest_neighbor_distance',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 
        'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','price_index']].values.astype(np.float32)
y = df['phase3_plus_phase4'].values.astype(np.float32)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=53)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# impute X_train_scaled
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train_scaled)
X_train_scaled = imp.transform(X_train_scaled)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train_scaled, y_train)


# impute X_test_scaled
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_test_scaled)
X_test_scaled = imp.transform(X_test_scaled)

In [162]:
y_pred = rf_regressor.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Squared Error: 0.011187633965191186
Root Mean Squared Error: 0.10577161228416246
Mean Absolute Error: 0.07834548102890003
R-squared: 0.683700571038484


In [157]:
# Obtain feature importances from the Random Forest model
importances = rf_regressor.feature_importances_

# Get the feature names from your dataset
feature_names = ['estimated_population','event_count_battles', 'event_count_explosions', 'event_count_violence',
       'fatalities_battles', 'fatalities_explosions', 'fatalities_violence','nearest_neighbor_distance',
       'GOSIF_GPP', 'rainfall_chirps', 'GOSIF_GPP_SD', 
        'temperature_2m_mean', 'temperature_2m_mean_sd',
       'shortwave_radiation_sum', 'shortwave_radiation_sum_sd',
       'precipitation_sum', 'precipitation_sum_sd','price_index']

# Print the feature importances
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance}")

estimated_population: 0.16170885746534094
event_count_battles: 0.015539730528102796
event_count_explosions: 0.005961185503245342
event_count_violence: 0.012126790205449
fatalities_battles: 0.015680986178609902
fatalities_explosions: 0.0046098171832989485
fatalities_violence: 0.013381534232797521
nearest_neighbor_distance: 0.05610880748445266
GOSIF_GPP: 0.07344806907332554
rainfall_chirps: 0.04741022597118192
GOSIF_GPP_SD: 0.06639631894808014
temperature_2m_mean: 0.08964808953750435
temperature_2m_mean_sd: 0.1769367730707801
shortwave_radiation_sum: 0.041359801434957155
shortwave_radiation_sum_sd: 0.03875526182246983
precipitation_sum: 0.030892507078744157
precipitation_sum_sd: 0.032593269363811544
price_index: 0.1174419749178481
