In [None]:
# Uncomment to install DoWhy into the running kernel's environment:
# %pip install dowhy

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from dowhy import CausalModel

%matplotlib inline

In [None]:
# (1) What is the causal effect of AveRooms on MedHouseVal?
# (2) How does it compare with the linear coefficient found in part1_regression_sklearn.ipynb?

In [None]:
# Step 0 - Load the California housing data
ds = datasets.fetch_california_housing()
X, y = ds.data, ds.target

# keep only homes whose value lies strictly between 0.15 and 5
# (the very cheap / very expensive ones saturate at those bounds)
ind = np.logical_and(y > 0.15, y < 5)
X, y = X[ind, :], y[ind]

# log-transform the target so it looks more Gaussian
y = np.log(y)

# scale the attributes with a StandardScaler fitted on the filtered rows
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# single frame: scaled attributes first, transformed target as the last column
df = pd.DataFrame(X_scaled, columns=ds.feature_names).assign(
    **{ds.target_names[0]: y}
)
df

In [None]:
# Step 1 - Build the causal graph and model the problem
# The graph is assembled as GML text in four stages: the header, one node per
# column of df, the assumed cause -> effect edges, and the closing bracket.
causal_graph = """
graph
[
  directed 1
"""

# add all the nodes (one per column of df; the column name is both id and label)
for c in df.columns:
    causal_graph += f"""
      node
      [
       id {c}
       label "{c}"
      ]    
    """

# add edges from all the features to the target
t = ds.target_names[0]
for f in ds.feature_names:
    causal_graph += f"""
      edge
      [
       source {f}
       target {t}
       label "{f} -> {t}"
      ]  
    """

# add location as a common source
# NOTE: the inner loop variable is deliberately not named `t`: a `for` loop
# target is an ordinary assignment, so reusing `t` would leave it pointing at
# 'Population' instead of the target name once this loop has finished.
for s in ['Latitude', 'Longitude']:
    for child in ['MedInc', 'HouseAge', 'Population']:
        causal_graph += f"""
          edge
          [
           source {s}
           target {child}
           label "{s} -> {child}"
          ]  
        """

# HouseAge is additionally modelled as a cause of AveRooms and AveBedrms
# (plain string: nothing is interpolated here, so no f-prefix is needed)
causal_graph += """
  edge
  [
   source HouseAge
   target AveRooms
   label "HouseAge -> AveRooms"
  ]
  edge
  [
   source HouseAge
   target AveBedrms
   label "HouseAge -> AveBedrms"
  ]  
"""

# close the top-level `graph [` list opened in the header
causal_graph += """  
]
"""

print(causal_graph)

In [None]:
# Wrap the data frame and the hand-written GML graph in a DoWhy causal model,
# with AveRooms as the treatment and MedHouseVal as the outcome.
model = CausalModel(
    df,
    treatment=["AveRooms"],
    outcome="MedHouseVal",
    graph=causal_graph,
    # per the DoWhy docs this flag controls whether data columns absent from
    # the graph are added as confounders; it is switched off here
    missing_nodes_as_confounders=False,
)

# draw the graph held by the model
model.view_model()

In [None]:
# Step 2 - Identify the effect to estimate (the estimand) for the model's
# treatment/outcome pair; identification is told to proceed even when the
# effect is flagged as unidentifiable.
identified_estimand = model.identify_effect(
    proceed_when_unidentifiable=True,
)
print(identified_estimand)

In [None]:
# Step 3 - Estimate the effect with the "backdoor.linear_regression" method,
# comparing a treatment value of 1 against a control value of 0.
linear_estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression",
    control_value=0,
    treatment_value=1,
)
print(f"Causal Estimate is {linear_estimate.value!s}")

In [None]:
# Step 4 - Challenge the obtained estimate with a robustness check
# (only the "random_common_cause" refuter is run here).
refute_results = model.refute_estimate(
    identified_estimand,
    linear_estimate,
    method_name="random_common_cause",
)
print(refute_results)