# Natural Language Entity Extraction

In [None]:
!pip install bllipparser

In [None]:
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import tensorflow as tf
from transformers import *
from assignments.C3_W2.util import *
import assignments.C3_W2.util as util

In [None]:
import itertools
from io import StringIO  # stdlib replacement for the removed sklearn.externals.six.StringIO

from IPython.display import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must precede the IterativeImputer import)
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# NOTE(review): `sns`, `shap`, `pydotplus`, `cindex` and `load_data` are used below but
# never imported explicitly; presumably they arrive via the star import of
# assignments.C3_W2.util -- confirm, and prefer explicit imports.

## Extracting Labels

In [None]:
# NOTE(review): `test_df` is never assigned in the visible notebook, so on a fresh
# kernel this cell raises NameError unless one of the star imports provides it --
# confirm where it comes from, or remove this cell.
print(test_df.shape)
test_df.head()

In [None]:
# Load the development and test features/labels.
# NOTE(review): `load_data` is not defined in this notebook (presumably it comes from
# the util star import), and the argument 10 is presumably the mortality horizon in
# years (a later cell prints "Died within 10 years?") -- confirm both.
X_dev, X_test, y_dev, y_test = load_data(10)

In [None]:
# Hold out 25% of the development set for validation. The split is seeded so that
# Restart & Run All selects the same rows (and therefore reproduces the same scores).
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=0.25, random_state=10)

In [None]:
# Peek at the first 20 training labels.
y_train.head(20)

In [None]:
# Inspect one training example (positional row 10) together with its outcome label.
i = 10
print(X_train.iloc[i, :])
# The label is fetched via the index label found at position i of y_train.
print(f"\n Died within 10 years? {y_train.loc[y_train.index[i]]}")

### Dealing with Missing Data

In [None]:
# Draw one missingness heatmap per split: each cell marks whether a value is null.
for split_name, split_frame in (("Training", X_train), ("Validation", X_val)):
    sns.heatmap(split_frame.isnull(), cbar=False)
    plt.title(split_name)
    plt.show()

In [None]:
def fraction_rows_missing(df):
    """Return the fraction of rows in `df` that contain at least one null value.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        Number of rows with any null entry divided by the total number of rows.
    """
    has_gap = df.isnull().any(axis=1)  # one boolean per row
    return has_gap.sum() / len(df)

In [None]:
# Complete-case analysis: keep only the rows with no missing feature values, then
# select the labels with the same index so features and labels stay aligned.
X_train_dropped = X_train.dropna(axis="rows")
y_train_dropped = y_train.loc[X_train_dropped.index]
X_val_dropped = X_val.dropna(axis="rows")
y_val_dropped = y_val.loc[X_val_dropped.index]

### Decision Trees

In [None]:
# Fit a decision tree with no depth limit (max_depth=None). The classifier must be
# bound to `dt`, the name the following cells use; it is seeded for reproducibility.
dt = DecisionTreeClassifier(max_depth=None, random_state=10)
dt.fit(X_train_dropped, y_train_dropped)

In [None]:
# Score the unconstrained tree on both complete-case splits. Column 1 of
# predict_proba is the second class in dt.classes_ (presumably the positive label).
y_train_preds = dt.predict_proba(X_train_dropped)[:, 1]
y_val_preds = dt.predict_proba(X_val_dropped)[:, 1]

print(f"Train C-index: {cindex(y_train_dropped.values, y_train_preds)}")
print(f"Val C-index: {cindex(y_val_dropped.values, y_val_preds)}")

In [None]:
# Hyperparameters for the depth-limited decision tree.
dt_param = dict(max_depth=3)

In [None]:
# Fit the depth-limited tree using the hyperparameters defined in `dt_param`
# and compare its train/validation C-index. Seeded for reproducibility.
dt_reg = DecisionTreeClassifier(random_state=10, **dt_param)
dt_reg.fit(X_train_dropped, y_train_dropped)

y_train_preds = dt_reg.predict_proba(X_train_dropped)[:, 1]
y_val_preds = dt_reg.predict_proba(X_val_dropped)[:, 1]
print(f"Train C-index: {cindex(y_train_dropped.values, y_train_preds)}")
print(f"Val C-index (expected > 0.6): {cindex(y_val_dropped.values, y_val_preds)}")

In [None]:
# Export the fitted depth-limited tree to DOT text held in memory, then render it
# as a PNG and display it inline.
dot_data = StringIO()
export_graphviz(
    dt_reg,
    out_file=dot_data,
    feature_names=X_train_dropped.columns,
    class_names=["neg", "pos"],
    filled=True,
    rounded=True,
    proportion=True,
    special_characters=True,
    impurity=False,
    precision=2,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

### Random Forests

In [None]:
# Baseline random forest with 100 trees, seeded so repeated runs give the same forest.
rf = RandomForestClassifier(n_estimators=100, random_state=10)
rf.fit(X_train_dropped, y_train_dropped)

In [None]:
# Score the baseline forest `rf` (fitted in the previous cell) on both splits.
y_train_rf_preds = rf.predict_proba(X_train_dropped)[:, 1]
print(f"Train C-index: {cindex(y_train_dropped.values, y_train_rf_preds)}")

y_val_rf_preds = rf.predict_proba(X_val_dropped)[:, 1]
print(f"Val C-index: {cindex(y_val_dropped.values, y_val_rf_preds)}")

In [None]:
def holdout_grid_search(clf, X_train_hp, y_train_hp, X_val_hp, y_val_hp, hyperparams, fixed_hyperparams=None):
    """Exhaustive grid search scored on a single hold-out validation set.

    For every combination of the candidate values in `hyperparams`, an estimator
    is built as ``clf(**combination, **fixed_hyperparams)``, fitted on the
    training split and scored with ``cindex`` on the validation split.

    Args:
        clf: Estimator class (not an instance). Instances must provide ``fit``
            and ``predict_proba``.
        X_train_hp: Training features.
        y_train_hp: Training labels.
        X_val_hp: Validation features.
        y_val_hp: Validation labels.
        hyperparams: Mapping of parameter name -> iterable of candidate values.
        fixed_hyperparams: Optional mapping of parameters passed unchanged to
            every estimator. Defaults to no fixed parameters.

    Returns:
        Tuple ``(best_estimator, best_hyperparams)``. ``best_hyperparams``
        contains the winning combination plus the fixed parameters. If no
        combination scores at least 0, ``best_estimator`` is ``None``.
    """
    import itertools  # local import keeps the function self-contained

    # Avoid a shared mutable default argument.
    if fixed_hyperparams is None:
        fixed_hyperparams = {}

    best_estimator = None
    best_hyperparams = {}
    best_score = 0.

    param_names = list(hyperparams)
    param_combinations = list(itertools.product(*hyperparams.values()))
    total_param_combinations = len(param_combinations)

    for i, params in enumerate(param_combinations, 1):
        # Pair every parameter name with its value in this combination.
        param_dict = dict(zip(param_names, params))

        estimator = clf(**param_dict, **fixed_hyperparams)
        estimator.fit(X_train_hp, y_train_hp)

        # Column 1 of predict_proba is used as the risk score.
        preds = estimator.predict_proba(X_val_hp)
        estimator_score = cindex(y_val_hp, preds[:, 1])

        print(f"[{i}/{total_param_combinations}] {param_dict}")
        print(f"Val C-index: {estimator_score}\n")

        # `>=` means that on ties the later combination wins.
        if estimator_score >= best_score:
            best_score = estimator_score
            best_estimator = estimator
            best_hyperparams = param_dict

    # Report the fixed parameters alongside the searched ones.
    best_hyperparams = {**best_hyperparams, **fixed_hyperparams}
    return best_estimator, best_hyperparams
            

In [None]:
def random_forst_grid_search(X_train_dropped, y_train_dropped, X_val_dropped, y_val_dropped):
    """Grid-search random-forest hyperparameters on a hold-out validation set.

    Delegates the search to ``holdout_grid_search`` and prints the train and
    validation C-index of the winning forest.

    NOTE(review): the function name is misspelled ("forst"); it is kept as-is so
    the public name does not change.

    Args:
        X_train_dropped: Training features (rows with missing values removed).
        y_train_dropped: Training labels matching ``X_train_dropped``.
        X_val_dropped: Validation features (rows with missing values removed).
        y_val_dropped: Validation labels matching ``X_val_dropped``.

    Returns:
        Tuple ``(best_rf, best_hyperparams)``: the best fitted forest and the
        hyperparameters (searched plus fixed) that produced it.
    """
    hyperparams = {
        # how many trees should be in the forest
        'n_estimators': [50, 200],

        # the maximum depth of trees in the forest
        'max_depth': [3, 5, 10],

        # the minimum number of samples required at a leaf; integer values
        # are absolute sample counts (floats would be read as fractions)
        'min_samples_leaf': [1, 2, 3],
    }

    # Parameters held constant for every candidate; the seed makes the search reproducible.
    fixed_hyperparams = {
        'random_state': 10,
    }

    rf = RandomForestClassifier

    best_rf, best_hyperparams = holdout_grid_search(rf, X_train_dropped, y_train_dropped,
                                                    X_val_dropped, y_val_dropped,
                                                    hyperparams, fixed_hyperparams)

    print(f"Best hyperparameters: \n {best_hyperparams}")

    y_train_best = best_rf.predict_proba(X_train_dropped)[:, 1]
    print(f"Train C-index: {cindex(y_train_dropped, y_train_best)}")

    y_val_best = best_rf.predict_proba(X_val_dropped)[:, 1]
    print(f"Val C-index: {cindex(y_val_dropped, y_val_best)}")

    # Make sure the fixed parameters are part of the reported configuration.
    best_hyperparams.update(fixed_hyperparams)

    return best_rf, best_hyperparams

In [None]:
# Run the grid search on the complete-case data. The helper is defined in this
# notebook as `random_forst_grid_search` (sic), so the call uses that exact name.
best_rf, best_hyperparams = random_forst_grid_search(X_train_dropped, y_train_dropped, X_val_dropped, y_val_dropped)

### Imputation

There was a drop in the test C-index. This might be because we threw away more than half of our data due to missing values for systolic blood pressure. Instead, we can try filling in, or imputing, these values.

In [None]:
# Compare each covariate's distribution in the full training set with its
# distribution among the rows that contain a missing value (the rows dropna()
# discards), to see whether missingness depends on that covariate.
dropped_rows = X_train[X_train.isnull().any(axis=1)]

columns_except_Systolic_BP = [col for col in X_train.columns if col not in ["Systolic BP"]]

for col in columns_except_Systolic_BP:
    # Both histograms are normalised (norm_hist=True) so groups of different size are comparable.
    sns.distplot(X_train.loc[:, col], norm_hist=True, kde=False, label="full data")
    # `dropped_rows` holds the rows WITH missing values, so label it accordingly.
    sns.distplot(dropped_rows.loc[:, col], norm_hist=True, kde=False, label="rows with missing values")
    plt.legend()

    plt.show()

Most of the covariates are distributed similarly whether or not we have discarded rows with missing data. In other words, the missingness of the data is independent of these covariates.

If this had been true across all covariates, then the data would have been said to be **missing completely at random (MCAR)**.

But when considering the age covariate, we see that much more data tends to be missing for patients over 65. The reason could be that blood pressure was measured less frequently for old people to avoid placing additional burden on them.

As missingness is related to one or more covariates, this missing data is said to be **missing at random (MAR)**.

Based on the information we have, there is however no reason to believe that the values of the missing data, or specifically the values of the missing systolic blood pressures, are related to the age of the patients.

If this were the case, then this data would be said to be **missing not at random (MNAR)**.

### Error Analysis

In [None]:
def bad_subset(forest, X_test, y_test):
    """Evaluate `forest` on the subgroup of patients older than 67.

    Args:
        forest: Fitted classifier exposing ``predict_proba``.
        X_test: Feature DataFrame containing an "Age" column.
        y_test: Labels that can be filtered with the same boolean mask as
            ``X_test`` and expose ``.values``.

    Returns:
        Tuple ``(performance, subgroup_size)``: the C-index computed on the
        subgroup and the number of rows in it.
    """
    # Boolean mask selecting the subgroup (Age strictly greater than 67).
    mask = X_test["Age"] > 67

    X_subgroup = X_test[mask]
    y_subgroup = y_test[mask]
    subgroup_size = len(y_subgroup)

    # Column 1 of predict_proba is used as the risk score.
    y_subgroup_preds = forest.predict_proba(X_subgroup)[:, 1]
    performance = cindex(y_subgroup.values, y_subgroup_preds)

    return performance, subgroup_size

In [None]:
# Evaluate the grid-searched forest on the subgroup selected by bad_subset.
performance, subgroup_size = bad_subset(best_rf, X_test, y_test)
print(f"Subgroup size: {subgroup_size}, C-index: {performance}")

### Imputation Approaches
Seeing that our data is not missing completely at random, we can handle the missing values by replacing them with substituted values based on the other values that we have. This is known as **imputation**.

The first imputation that we will use is **mean substitution**: we will replace the missing values for each feature with the mean of the available values.

In [None]:
# Mean substitution: the per-feature means are learned from the training set only
# and then applied to both the training and validation sets.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train)
# transform() returns a plain array, so rebuild DataFrames with the original column names.
X_train_mean_imputed = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)
X_val_mean_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

In [None]:
# Grid-search a random forest on the mean-imputed data and report its C-index
# on the train, validation and test sets.
hyperparams = {
    "n_estimators": [200, 500],
    "max_depth": [3, 5],
    "min_samples_leaf": [1, 2]
}

rf = RandomForestClassifier

# The last argument fixes the seed for every candidate so the search is reproducible.
rf_mean_imputed, best_hyperparams_mean_imputed = holdout_grid_search(rf, X_train_mean_imputed, y_train,
                                                                     X_val_mean_imputed, y_val,
                                                                     hyperparams, {"random_state": 10})
print("Performance for best hyperparameters:")

y_train_best = rf_mean_imputed.predict_proba(X_train_mean_imputed)[:, 1]
print(f"Train C-index: {cindex(y_train, y_train_best)}")

y_val_best = rf_mean_imputed.predict_proba(X_val_mean_imputed)[:, 1]
print(f"Val C-index: {cindex(y_val, y_val_best)}")

# X_test is passed without imputation; this assumes it has no missing values -- TODO confirm.
y_test_imp = rf_mean_imputed.predict_proba(X_test)[:, 1]
print(f"Test C-index: {cindex(y_test, y_test_imp)}")

In [None]:
# Multivariate imputation: the imputer is fitted on the training set only and then
# applied to both the training and validation sets.
imputer = IterativeImputer(
    max_iter=1,
    min_value=0,
    sample_posterior=False,
    random_state=0,
).fit(X_train)

# transform() returns a plain array, so rebuild DataFrames with the original column names.
X_train_imputed = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)
X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

In [None]:
# Grid-search a random forest on the iteratively imputed data and report its
# C-index on the train, validation and test sets.
hyperparams = {
    "n_estimators": [200, 500],
    "max_depth": [3, 5, 7],
    "min_samples_leaf": [1, 2, 3]
}

rf = RandomForestClassifier

# The last argument fixes the seed for every candidate so the search is reproducible.
rf_imputed, best_hyperparams_imputed = holdout_grid_search(rf, X_train_imputed, y_train,
                                                           X_val_imputed, y_val,
                                                           hyperparams, {"random_state": 10})
print("Performance for best hyperparameters:")

y_train_best = rf_imputed.predict_proba(X_train_imputed)[:, 1]
print(f"Train C-index: {cindex(y_train, y_train_best)}")

y_val_best = rf_imputed.predict_proba(X_val_imputed)[:, 1]
print(f"Val C-index: {cindex(y_val, y_val_best)}")

# X_test is passed without imputation; this assumes it has no missing values -- TODO confirm.
y_test_imp = rf_imputed.predict_proba(X_test)[:, 1]
print(f"Test C-index: {cindex(y_test, y_test_imp)}")

### Comparison

In [None]:
# Compare the three forests on the same subgroup returned by bad_subset.
# `subgroup_size` is overwritten on every call and is not printed here.
performance, subgroup_size = bad_subset(best_rf, X_test, y_test)
print(f"C-index (no imputation): {performance}")

performance, subgroup_size = bad_subset(rf_mean_imputed, X_test, y_test)
print(f"C-index (mean imputation): {performance}")

performance, subgroup_size = bad_subset(rf_imputed, X_test, y_test)
print(f"C-index (multivariate feature imputation): {performance}")

### Explanations: SHAP

**SHAP (SHapley Additive exPlanations)** is a cutting-edge method that explains predictions made by black-box machine learning models.

In [None]:
# Work on a deep copy so that adding the 'risk' column leaves X_test untouched.
X_test_risk = X_test.copy(deep=True)
# The risk is predicted before the 'risk' column exists, so the model only sees the original features.
X_test_risk.loc[:, 'risk'] = rf_imputed.predict_proba(X_test_risk)[:, 1]
# Highest predicted risk first.
X_test_risk = X_test_risk.sort_values(by="risk", ascending=False)
X_test_risk.head()

In [None]:
# Explain a single prediction: i indexes into X_test_risk, which is sorted by
# descending risk, so i = 0 is the highest-risk patient.
explainer = shap.TreeExplainer(rf_imputed)
i = 0
# NOTE(review): the trailing [1] assumes shap_values returns one entry per class
# (index 1 = second class) -- confirm against the installed shap version.
shap_value = explainer.shap_values(X_test.loc[X_test_risk.index[i], :])[1]
shap.force_plot(explainer.expected_value[1], shap_value, feature_names=X_test.columns, matplotlib=True)

- The red sections on the left are features which push the model towards the final prediction in the positive direction (i.e. a higher Age increases the predicted risk)
- The blue sections on the right are features that push the model towards the final prediction in the negative direction (if an increase in a feature leads to a lower risk, it will be shown in blue)
- Note that the exact output of your chart will differ depending on the hyper-parameters that you choose for your model

In [None]:
# SHAP values for the whole test set.
# NOTE(review): this builds a new TreeExplainer rather than reusing `explainer`, and the
# trailing [1] assumes shap_values returns one entry per class -- confirm against the
# installed shap version.
shap_values = shap.TreeExplainer(rf_imputed).shap_values(X_test)[1]

In [None]:
# Summary plot of the SHAP values computed for X_test.
shap.summary_plot(shap_values, X_test)

Clearly we see that being a woman (`sex=2.0`, as opposed to men, for which `sex=1.0`), has a negative SHAP value, meaning that it reduces the risk of dying within 10 years. High age and high systolic blood pressure have positive SHAP values, and are therefore related to increased mortality.



In [None]:
# Dependence plot for Age, with Sex as the interaction feature.
shap.dependence_plot('Age', shap_values, X_test, interaction_index="Sex")

We see that while Age > 50 is generally bad (positive SHAP value), being a woman generally reduces the impact of age. This makes sense since we know that women generally live longer than men.

In [None]:
# Dependence plot for Poverty index, with Age as the interaction feature.
shap.dependence_plot("Poverty index", shap_values, X_test, interaction_index="Age")

We see that the impact of poverty index drops off quickly, and for higher-income individuals age begins to explain much of the variation in the impact of poverty index.