In [17]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import shap


import os
for dirname, _, filenames in os.walk('/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/GDP_Cleaned.csv
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/.DS_Store
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/WE_Index_Clean.csv
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/11_13_2023_WEI.csv
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/11_18_2023_WEI_CleanImputed.csv
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/Child_Leave_Qs_Clean.csv
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/Fifty_Year_Change_Clean.csv
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/GDP_Clean.csv
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/All_Questions_Clean.csv
/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/11_10_2023_WEI.csv


In [18]:
df = pd.read_csv('/Users/amritambe/Desktop/Analysis_Project/Women_Empowerment/2 Data/Clean_Data/11_18_2023_WEI_CleanImputed.csv')
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
df.columns.tolist()

['Country_Name',
 'Year',
 'GDP_Growth',
 'Index_1971',
 'Index_2020',
 'Fifty_Year_Change',
 'Region',
 'Income_Group',
 'Question_Category',
 'Question',
 'Index_Score',
 '2020_Data_Rank',
 '2020_1GB_Price(USD)',
 'Life_Exp',
 'Population',
 'GDP_Per_Cap',
 'Avg_WEI_Score']

In [42]:
df_qc = df.groupby(['Country_Name', 'Year', 
                    'Question_Category', 'Region', 
                    'Life_Exp', 'Population', 'GDP_Per_Cap', 'Avg_WEI_Score'])['Index_Score'].agg('sum').reset_index()

pivot_qc = df_qc.pivot_table(index=['Country_Name', 'Year', 
                                    'Region', 'Life_Exp', 
                                    'Population', 'GDP_Per_Cap', 'Avg_WEI_Score'],
                             columns='Question_Category',
                             values='Index_Score',
                             aggfunc='sum').reset_index()
pivot_qc.columns.tolist()

['Country_Name',
 'Year',
 'Region',
 'Life_Exp',
 'Population',
 'GDP_Per_Cap',
 'Avg_WEI_Score',
 'Assets',
 'Entrepreneurship',
 'Marriage',
 'Mobility',
 'Parenthood',
 'Pay',
 'Pension',
 'Workplace']

In [44]:
#organize data for modeling
target = 'Avg_WEI_Score'


X = pivot_qc[[ 
 'Year',
 'Region',
 'Life_Exp',
 'Population',
 'GDP_Per_Cap',
 # 'Avg_WEI_Score',
 'Assets',
 'Entrepreneurship',
 'Marriage',
 'Mobility',
 'Parenthood',
 'Pay',
 'Pension',
 'Workplace'
]]

y = pivot_qc[target]

#One Hot Encode
X = pd.get_dummies(X, columns=['Region'])

#order by year for time series split
X.sort_values(by='Year', inplace=True)

#Time Series Split
tscv = TimeSeriesSplit(n_splits=5, max_train_size=10)

fold=0

shap_values_list = []


for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    

    #GridSearch   ####################################################### GRIDSEARCH ################################################
    model = xgb.XGBRegressor()
    
    #implement gridsearch parameters
    param_grid = {
        'n_estimators':[100, 500],
        'learning_rate':[.15, 0.1, 0.05],
        'max_depth':[ 8, 10],
        'alpha':[.15, .2],
        'early_stopping_rounds':[10]
    }
    
    #instantiate gridsearch cv
    grid_search = GridSearchCV(estimator=model,
                               param_grid=param_grid,
                               scoring='neg_mean_squared_error',
                               cv=5,
                               n_jobs=-1 #setting n_jobs to -1 uses all processors
                               )
    
    
    #implement gridsearch
    grid_search.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    #print results of best_params_
    print(f'The best parameters found by GridSearch are:\n{grid_search.best_params_}')
    
    
    #define the best model with best_params_ ################################# EVALUATE MODEL ############################################
    best_model = grid_search.best_estimator_
    
    #predictions
    y_pred = best_model.predict(X_test)
    
    #scoring using y_test set and predictions
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    print(f'Best model mse: {mse:.4f}\n\
    Best model r2: {r2:.4f}\n\
    Best model mape: {mape:.4f}')

    #SHAP for feature importance and model interpretation ############################### SHAP VALUES ###################################  
    explainer = shap.Explainer(best_model)
    shap_values = explainer.shap_values(X_test)
    shap_values_list.append(shap_values)

    fold += 1

# fig, axs = plt.subplots(5, 1, figsize=(10, 25))

# for i, shap_value in enumerate(shap_values_list):
#         #set axis
#     ax = axs[i]
#     ax.clear()
#     ax.set_title(f'Shap summary {i + 1}')

#     shap = shap.summary_plot(shap_value, X, show=False)
#     ax.set(shap)


# plt.text(-63000,20, f'XGBoost Model\n\nModel mse: {mse:.4f}\nModel r2: {r2:.4f}\nModel mape: {mape:.4f}', fontdict={'fontsize':16})
# plt.text(-35000,20, f"{target}:\n{df[target].agg(['min', 'max', 'mean', 'std']).round(2).to_string(index=True)}", fontdict={'fontsize':16})

# plt.tight_layout()
# plt.show()
# shap_q_gdp_percap = plt.gcf()

The best parameters found by GridSearch are:
{'alpha': 0.15, 'early_stopping_rounds': 10, 'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 100}
Best model mse: 13.6939
    Best model r2: -0.5311
    Best model mape: 0.2188




The best parameters found by GridSearch are:
{'alpha': 0.15, 'early_stopping_rounds': 10, 'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 100}
Best model mse: 16.2393
    Best model r2: -0.2472
    Best model mape: 0.3256




The best parameters found by GridSearch are:
{'alpha': 0.15, 'early_stopping_rounds': 10, 'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 100}
Best model mse: 9.6569
    Best model r2: -0.0001
    Best model mape: 0.2100




The best parameters found by GridSearch are:
{'alpha': 0.15, 'early_stopping_rounds': 10, 'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 100}
Best model mse: 9.7517
    Best model r2: -0.0598
    Best model mape: 0.2311




The best parameters found by GridSearch are:
{'alpha': 0.15, 'early_stopping_rounds': 10, 'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 100}
Best model mse: 19.8016
    Best model r2: -0.4070
    Best model mape: 0.3939




In [48]:
shap_values_list

AttributeError: 'list' object has no attribute 'describe'