# A Basic Model As A Starting Point
<!--- @wandbcode{decisionopt-nb4a} -->

Model building isn't our focus, so I won't go into great depth on it. But it gives us a basic model to use as a starting point for the adjustments that follow.

In [None]:
%pip install shap -qqq

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import wandb
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.compose import make_column_selector as selector
from wandb.xgboost import WandbCallback
from xgboost import XGBClassifier

# Use the "fivethirtyeight" matplotlib style for any plots drawn in this notebook.
plt.style.use("fivethirtyeight")
# Set the notebook name explicitly through the WANDB_NOTEBOOK_NAME variable.
os.environ["WANDB_NOTEBOOK_NAME"] = "ad_hoc_adjustments.ipynb"

In [None]:
# Load the dataset from a W&B Artifact. The run is only needed for the
# download, so the context manager closes it as soon as the block exits.
with wandb.init(project="ad_hoc_adjustments") as run:
    artifact = run.use_artifact(
        'wandb_course/decision_opt/telco-customer-churn:latest',
        type='dataset',
    )
    artifact_dir = artifact.download()
    # Keep the download location as a Path so files can be joined with `/`.
    path = Path(artifact_dir)

In [None]:
# Read the churn CSV from the downloaded artifact directory and preview it.
csv_file = path / "WA_Fn-UseC_-Telco-Customer-Churn.csv"
data = pd.read_csv(csv_file)
data.head()

In [None]:
# Start the W&B run for the modelling part of the notebook; it stays open
# until `run.finish()` is called.
run = wandb.init(project="ad_hoc_adjustments")

target = "Churn"
# Hold out 20% of the rows for evaluation. The label is True where the
# target column equals "Yes".
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(target, axis=1), data[target] == "Yes", test_size=0.2, random_state=0
)
# Feature subset given to the model; other columns of `data` are not used.
cols_to_use = [
    "tenure",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "MonthlyCharges",
]

# One-hot encode the object-dtype columns. Note that ColumnTransformer places
# the passthrough (numerical) columns AFTER the one-hot block in its output,
# so the encoded column order differs from `cols_to_use`.
preprocessor = ColumnTransformer(
    transformers=[("one_hot", OneHotEncoder(), selector(dtype_include="object"))],
    remainder="passthrough",  # Leave numerical variables unchanged
)

# Create pipeline
pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", XGBClassifier())])
pipeline.fit(X_train[cols_to_use], y_train)
# `y_pred` holds column 1 of predict_proba (the churn probability), not hard labels.
y_pred = pipeline.predict_proba(X_test[cols_to_use])[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
log_loss_val = log_loss(y_test, y_pred)
# Record the hold-out metrics on the run; previously they were computed and
# then discarded without being shown or logged anywhere.
run.log({"roc_auc": roc_auc, "log_loss": log_loss_val})

In [None]:
# Preview the first rows of the feature columns the pipeline was fitted on.
X_train.loc[:, cols_to_use].head()

In [None]:
def baseline_prediction(data):
    """Return the fitted pipeline's churn probability for each row of `data`.

    Only the `cols_to_use` columns are passed to the pipeline; the result is
    column 1 of its `predict_proba` output.
    """
    features = data[cols_to_use]
    class_probabilities = pipeline.predict_proba(features)
    return class_probabilities[:, 1]

def prediction_adjust_DSL(data, adjustment=0.1):
    """Return baseline churn probabilities with a flat shift for DSL customers.

    Rows whose "InternetService" equals "DSL" get `adjustment` added to their
    baseline prediction; every other row keeps its baseline value.

    Args:
        data: DataFrame holding "InternetService" and the `cols_to_use` columns.
        adjustment: Amount added to the prediction of DSL rows. Defaults to
            0.1, so existing single-argument calls behave as before.

    Returns:
        The adjusted predictions, clipped to [0, 1] so that they remain valid
        probabilities. Because the DSL mask is a pandas Series, the result is
        a Series carrying the index of `data`.
    """
    baseline = baseline_prediction(data)
    has_DSL = data["InternetService"] == "DSL"
    out = baseline + has_DSL * adjustment
    # A flat additive shift can push a high baseline above 1 (or, with a
    # negative adjustment, below 0); keep the output inside the valid range.
    return out.clip(0.0, 1.0)

# Difference between the DSL-adjusted and the baseline predictions on the test set.
dsl_adjusted = prediction_adjust_DSL(X_test)
dsl_adjusted - baseline_prediction(X_test)

In [None]:
import shap

# Encode the training features with the pipeline's fitted preprocessor; the
# encoded matrix is handed to the explainer as its background `data`.
fitted_preprocessor = pipeline.named_steps["preprocessor"]
fitted_classifier = pipeline.named_steps["classifier"]
encoded_value = fitted_preprocessor.transform(X_train[cols_to_use])
# NOTE(review): no `model_output` is passed, so the SHAP values presumably come
# back on the model's raw (log-odds) scale rather than as probabilities — confirm.
explainer = shap.TreeExplainer(fitted_classifier, data=encoded_value)

In [None]:
def prediction_with_less_effect_for_tenure(data, effect_reduction_size=0.5):
    """Return churn probabilities with tenure's contribution partially removed.

    The SHAP value of the tenure feature is computed for every row and a
    fraction of it is subtracted from the model's raw score before converting
    back to a probability.

    Args:
        data: DataFrame containing the `cols_to_use` columns.
        effect_reduction_size: Fraction of tenure's SHAP contribution to
            remove (0 leaves the prediction unchanged, 1 removes the whole
            contribution). Defaults to 0.5.

    Returns:
        NumPy array of adjusted probabilities, one per row of `data`.

    Raises:
        ValueError: If the tenure column cannot be identified unambiguously
            in the preprocessor's output.
    """
    import numpy as np  # local import: numpy is not imported at file level

    preprocessor_step = pipeline.named_steps["preprocessor"]
    classifier_step = pipeline.named_steps["classifier"]

    # The ColumnTransformer emits the one-hot columns first and appends the
    # passthrough (numerical) columns after them, so tenure is NOT column 0 of
    # the encoded matrix. Look its position up by name instead of hard-coding.
    feature_names = preprocessor_step.get_feature_names_out()
    tenure_matches = [
        idx
        for idx, name in enumerate(feature_names)
        if name == "tenure" or name.endswith("__tenure")
    ]
    if len(tenure_matches) != 1:
        raise ValueError(
            f"Expected exactly one encoded 'tenure' column, found {len(tenure_matches)}"
        )
    tenure_column = tenure_matches[0]

    encoded_data = preprocessor_step.transform(data[cols_to_use])
    shap_values = explainer.shap_values(encoded_data)
    effect_for_tenure = shap_values[:, tenure_column]

    # The explainer is built without `model_output`, so its SHAP values are
    # contributions to the raw margin (log-odds), not to the probability.
    # Apply the reduction on that scale, then map back through the sigmoid;
    # this also keeps the result inside (0, 1).
    raw_margin = classifier_step.predict(encoded_data, output_margin=True)
    adjusted_margin = raw_margin - effect_for_tenure * effect_reduction_size
    adjusted_predictions = 1.0 / (1.0 + np.exp(-adjusted_margin))
    return adjusted_predictions

# Difference between the tenure-adjusted and the baseline predictions on the test set.
tenure_adjusted = prediction_with_less_effect_for_tenure(X_test)
tenure_adjusted - baseline_prediction(X_test)

In [None]:
# Close the W&B run that was opened by the earlier `wandb.init` call.
run.finish()