# TTE R TO PYTHON

## 1. Setup 

A sequence of target trials analysis starts by specifying which estimand will be used:

### Python Equivalent:

In [216]:
import os
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
import pickle
import statsmodels.genmod.generalized_linear_model as sm
sm.SET_USE_BIC_LLF(True)

# Create directories
trial_pp_dir = os.path.join(os.getcwd(), "trial_pp")
trial_itt_dir = os.path.join(os.getcwd(), "trial_itt")
os.makedirs(trial_pp_dir, exist_ok=True)
os.makedirs(trial_itt_dir, exist_ok=True)

# Setup trial sequences as dictionaries
trial_pp = {"estimand": "PP", "dir": trial_pp_dir}
trial_itt = {"estimand": "ITT", "dir": trial_itt_dir}

Used dictionaries to store trial settings and os to create directories. 
Since there's no direct equivalent to trial_sequence(), so I mimiced the structure at 

<div class="alert alert-block alert-warning">
os.makedirs(..., exist_ok=True) ensures directories are created without raising an error if they already exist. <br>

The trial objects (trial_pp, trial_itt) are dictionaries that will hold all trial-related data and settings.
</div>


___


## 2. Data Preparation

Next the user must specify the observational input data that will be used for the target trial emulation. Here we need to specify which columns contain which values and how they should be used. In the original R code, a parameter called ``` time_on_regime``` is shown on the other steps, however it is only retrieved through ``` switch_n_cov```: 

<div class="alert alert-block alert-info">
switch_n_cov<br>
A RHS formula to specify the logistic models for estimating the numerator terms of the inverse probability of treatment weights. A derived variable named time_on_regime containing the duration of time that the individual has been on the current treatment/non-treatment is available for use in these models.
</div>

Upon checking further, time_on_regime isn't used in any significant functions, so I decided to omit it. 

### Python Equivalent:

In [217]:
# Load data (assuming it's saved as a CSV)
data_censored = pd.read_csv("data_censored.csv")

columns = {
    "id": "id",
    "period": "period",
    "treatment": "treatment",
    "outcome": "outcome",
    "eligible": "eligible"
}

# Assign data and columns to trials
trial_pp["data"] = data_censored.copy()
trial_pp["columns"] = columns

trial_itt["data"] = data_censored.copy()
trial_itt["columns"] = columns

# Display the first 6 rows (equivalent to head() in R showing 6 rows)
print(data_censored.head(6))

   id  period  treatment  x1        x2  x3        x4  age     age_s  outcome  \
0   1       0          1   1  1.146148   0  0.734203   36  0.083333        0   
1   1       1          1   1  0.002200   0  0.734203   37  0.166667        0   
2   1       2          1   0 -0.481762   0  0.734203   38  0.250000        0   
3   1       3          1   0  0.007872   0  0.734203   39  0.333333        0   
4   1       4          1   1  0.216054   0  0.734203   40  0.416667        0   
5   1       5          1   0 -0.057482   0  0.734203   41  0.500000        0   

   censored  eligible  
0         0         1  
1         0         0  
2         0         0  
3         0         0  
4         0         0  
5         1         0  


Loaded the data from the CSV file and store it in the trial dictionaries. 

Used ```python pd.read_csv()``` instead of ```R data("data_censored")```

In [218]:
def set_data(trial, data, id, period, treatment, outcome, eligible):
    trial["data"] = data.copy()
    trial["columns"] = {
        "id": id,
        "period": period,
        "treatment": treatment,
        "outcome": outcome,
        "eligible": eligible
    }
    trial["n_observations"] = len(data)
    trial["n_patients"] = data[id].nunique()
    
    # Initialize missing keys with default status
    trial["censor_weights"] = {"status": "Not specified"}
    trial["expansion"] = {"status": "Not specified"}
    trial["outcome_model"] = {"status": "Not specified"}
    
    return trial

# Function to display trial object summary
def print_trial_summary(trial):
    print("Trial Sequence Object")
    print(f"Estimand: {trial['estimand']}")
    print("")
    print("Data:")
    print(f" - N: {trial['n_observations']} observations from {trial['n_patients']} patients")
    
    # Select the desired columns
    display_cols = ["id", "period", "treatment", "x1", "x2", "x3", "x4", "age", "age_s", 
                    "outcome", "censored", "eligible"]
    data_display = pd.concat([trial["data"][display_cols].head(2), trial["data"][display_cols].tail(2)])
    
    # Define column types
    col_types = {col: "<int>" if trial["data"][col].dtype == "int64" else "<num>" for col in display_cols}
    
    # Print column headers and types
    print("        " + " ".join(f"{col:>11}" for col in display_cols))
    print("        " + " ".join(f"{col_types[col]:>11}" for col in display_cols))
    
    # Print rows dynamically with their actual indices
    for i, (idx, row) in enumerate(data_display.iterrows()):
        # Right-align the index with a colon, adjusting width to fit larger numbers
        label = f"{idx:>4}:"
        print(label, end=" ")
        formatted_row = [f"{row[col]:>11.6f}" if col_types[col] == "<num>" else f"{int(row[col]):>11}" for col in display_cols]
        print(" ".join(formatted_row))
        # Add ellipsis after the second row (before the last two)
        if i == 1:
            print("    ...")
    
    print("")
    print("IPW for informative censoring:")
    print(f" - {trial['censor_weights']['status']}")
    print("")
    print("Sequence of Trials Data:")
    print(f" - {trial['expansion']['status']}")
    print("")
    print("Outcome model:")
    print(f" - {trial['outcome_model']['status']}")

# Per-protocol
trial_pp = set_data(
    trial_pp,
    data=data_censored,
    id="id",
    period="period",
    treatment="treatment",
    outcome="outcome",
    eligible="eligible"
)

# ITT
trial_itt = set_data(
    trial_itt,
    data=data_censored,
    id="id",
    period="period",
    treatment="treatment",
    outcome="outcome",
    eligible="eligible"
)

# Display the first 6 rows
print("First 6 rows of data_censored:")
print(data_censored.head(6))

# Display trial_itt summary
print("\nTrial ITT Summary:")
print_trial_summary(trial_itt)

First 6 rows of data_censored:
   id  period  treatment  x1        x2  x3        x4  age     age_s  outcome  \
0   1       0          1   1  1.146148   0  0.734203   36  0.083333        0   
1   1       1          1   1  0.002200   0  0.734203   37  0.166667        0   
2   1       2          1   0 -0.481762   0  0.734203   38  0.250000        0   
3   1       3          1   0  0.007872   0  0.734203   39  0.333333        0   
4   1       4          1   1  0.216054   0  0.734203   40  0.416667        0   
5   1       5          1   0 -0.057482   0  0.734203   41  0.500000        0   

   censored  eligible  
0         0         1  
1         0         0  
2         0         0  
3         0         0  
4         0         0  
5         1         0  

Trial ITT Summary:
Trial Sequence Object
Estimand: ITT

Data:
 - N: 725 observations from 89 patients
                 id      period   treatment          x1          x2          x3          x4         age       age_s     outcome    censor

---

## 3. Weight models and censoring

In [219]:
# --- Section 3: Weight Models and Censoring ---
def set_weight_models(trial, weight_type, censor_event=None, numerator_vars=None, denominator_vars=None, pool_models="none", save_path=None):
    # Store configuration without fitting models yet
    trial[f"{weight_type}_config"] = {
        "censor_event": censor_event,
        "numerator_vars": numerator_vars,
        "denominator_vars": denominator_vars,
        "pool_models": pool_models,
        "save_path": save_path,
        "fitted": False
    }

def show_weight_config(trial, weight_type):
    config = trial.get(f"{weight_type}_config", {})
    if not config:
        return
    print(f"Numerator formula: 1 - censored ~ {' + '.join(config['numerator_vars']) if weight_type == 'censor' else 'treatment ~ ' + ' + '.join(config['numerator_vars'])}")
    print(f"Denominator formula: 1 - censored ~ {' + '.join(config['denominator_vars']) if weight_type == 'censor' else 'treatment ~ ' + ' + '.join(config['denominator_vars'])}")
    if weight_type == "censor" and config["pool_models"] == "numerator":
        print("Numerator model is pooled across treatment arms. Denominator model is not pooled.")
    print("Model fitter type: te_stats_glm_logit")
    print("Weight models not fitted. Use calculate_weights()")

# Set and show switch weight models (PP only)
set_weight_models(
    trial_pp,
    weight_type="switch",
    numerator_vars=["age"],
    denominator_vars=["age", "x1", "x3"],
    save_path=os.path.join(trial_pp["dir"], "switch_models")
)
print("Switch Weight Config for trial_pp:")
show_weight_config(trial_pp, "switch")

# Set and show censor weight models (PP)
set_weight_models(
    trial_pp,
    weight_type="censor",
    censor_event="censored",
    numerator_vars=["x2"],
    denominator_vars=["x2", "x1"],
    pool_models="none",
    save_path=os.path.join(trial_pp["dir"], "switch_models")
)
print("\nCensor Weight Config for trial_pp:")
show_weight_config(trial_pp, "censor")

# Set and show censor weight models (ITT)
set_weight_models(
    trial_itt,
    weight_type="censor",
    censor_event="censored",
    numerator_vars=["x2"],
    denominator_vars=["x2", "x1"],
    pool_models="numerator",
    save_path=os.path.join(trial_itt["dir"], "switch_models")
)
print("\nCensor Weight Config for trial_itt:")
show_weight_config(trial_itt, "censor")

Switch Weight Config for trial_pp:
Numerator formula: 1 - censored ~ treatment ~ age
Denominator formula: 1 - censored ~ treatment ~ age + x1 + x3
Model fitter type: te_stats_glm_logit
Weight models not fitted. Use calculate_weights()

Censor Weight Config for trial_pp:
Numerator formula: 1 - censored ~ x2
Denominator formula: 1 - censored ~ x2 + x1
Model fitter type: te_stats_glm_logit
Weight models not fitted. Use calculate_weights()

Censor Weight Config for trial_itt:
Numerator formula: 1 - censored ~ x2
Denominator formula: 1 - censored ~ x2 + x1
Numerator model is pooled across treatment arms. Denominator model is not pooled.
Model fitter type: te_stats_glm_logit
Weight models not fitted. Use calculate_weights()


---

## 4. Calculate Weights

In [220]:
def fit_weight_models(trial, weight_type):
    """
    Fit numerator and denominator models for switch or censor weights based on trial configuration.
    """
    config = trial[f"{weight_type}_config"]
    data = trial["data"].copy()
    models = {}
    
    if weight_type == "switch":
        # Compute previous treatment for switch weights
        data['prev_treatment'] = data.groupby('id')['treatment'].shift(1).fillna(0).astype(int)
        groups = data.groupby('prev_treatment')
        outcome = 'treatment'
        formula_num = f"{outcome} ~ {' + '.join(config['numerator_vars'])}"
        formula_den = f"{outcome} ~ {' + '.join(config['denominator_vars'])}"
        for group_name, group_data in groups:
            model_num = smf.glm(formula_num, data=group_data, family=sm.families.Binomial()).fit()
            model_den = smf.glm(formula_den, data=group_data, family=sm.families.Binomial()).fit()
            models[f"n{group_name}"] = model_num
            models[f"d{group_name}"] = model_den
    
    elif weight_type == "censor":
        outcome = config["censor_event"]
        # Define outcome as not censored (1 - censor_event)
        data['not_censored'] = 1 - data[outcome]
        # Compute previous treatment for censor weights
        data['prev_treatment'] = data.groupby('id')['treatment'].shift(1).fillna(0).astype(int)
        if config["pool_models"] == "numerator":  # ITT case
            # Pooled numerator model across all data
            formula_num = f"not_censored ~ {' + '.join(config['numerator_vars'])}"
            model_num = smf.glm(formula_num, data=data, family=sm.families.Binomial()).fit()
            models["n"] = model_num
            # Separate denominator models by prev_treatment
            formula_den = f"not_censored ~ {' + '.join(config['denominator_vars'])}"
            for group_name, group_data in data.groupby('prev_treatment'):
                model_den = smf.glm(formula_den, data=group_data, family=sm.families.Binomial()).fit()
                models[f"d{group_name}"] = model_den
        else:  # PP case, no pooling
            formula_num = f"not_censored ~ {' + '.join(config['numerator_vars'])}"
            formula_den = f"not_censored ~ {' + '.join(config['denominator_vars'])}"
            for group_name, group_data in data.groupby('prev_treatment'):
                model_num = smf.glm(formula_num, data=group_data, family=sm.families.Binomial()).fit()
                model_den = smf.glm(formula_den, data=group_data, family=sm.families.Binomial()).fit()
                models[f"n{group_name}"] = model_num
                models[f"d{group_name}"] = model_den
    
    trial[f"{weight_type}_models"] = models
    config["fitted"] = True

def calculate_weights(trial, weight_type):
    """
    Calculate weights using fitted models. Fit models if not already fitted.
    """
    if not trial[f"{weight_type}_config"]["fitted"]:
        fit_weight_models(trial, weight_type)
    
    data = trial["data"].copy()
    models = trial[f"{weight_type}_models"]
    
    if weight_type == "switch":
        if 'prev_treatment' not in data.columns:
            data['prev_treatment'] = data.groupby('id')['treatment'].shift(1).fillna(0).astype(int)
        weights = pd.Series(index=data.index, dtype=float)
        for group_name in [0, 1]:
            group_data = data[data['prev_treatment'] == group_name]
            idx = group_data.index
            prob_num = models[f"n{group_name}"].predict(group_data)
            prob_den = models[f"d{group_name}"].predict(group_data)
            weights.loc[idx] = np.where(group_data['treatment'] == 1, prob_num / prob_den, (1 - prob_num) / (1 - prob_den))
        data["wt"] = weights
    
    elif weight_type == "censor":
        if 'prev_treatment' not in data.columns:
            data['prev_treatment'] = data.groupby('id')['treatment'].shift(1).fillna(0).astype(int)
        weights = pd.Series(index=data.index, dtype=float)
        if "n" in models:  # ITT with pooled numerator
            prob_num = models["n"].predict(data)
            for group_name in [0, 1]:
                group_data = data[data['prev_treatment'] == group_name]
                idx = group_data.index
                prob_den = models[f"d{group_name}"].predict(group_data)
                weights.loc[idx] = prob_num.loc[idx] / prob_den
        else:  # PP with separate models
            for group_name in [0, 1]:
                group_data = data[data['prev_treatment'] == group_name]
                idx = group_data.index
                prob_num = models[f"n{group_name}"].predict(group_data)
                prob_den = models[f"d{group_name}"].predict(group_data)
                weights.loc[idx] = prob_num / prob_den
        data["wtC"] = weights
    
    trial["data"] = data

def show_weight_models(trial):
    """
    Display summaries of fitted weight models with specified descriptions and order.
    """
    weight_types = ["censor", "switch"] if trial["estimand"] == "PP" else ["censor"]
    for weight_type in weight_types:
        if f"{weight_type}_models" not in trial:
            continue
        print(f"Weight Models for {'Informative Censoring' if weight_type == 'censor' else 'Treatment Switching'}")
        print("---------------------------------------\n")
        models = trial[f"{weight_type}_models"]
        
        # Set display order based on estimand
        if weight_type == "censor":
            if trial[f"{weight_type}_config"]["pool_models"] == "numerator":
                order = ["n", "d0", "d1"]  # ITT order
            else:
                order = ["n0", "d0", "n1", "d1"]  # PP order
        else:  # switch weights for PP
            order = ["n1", "d1", "n0", "d0"]
        
        for model_key in order:
            if model_key in models:
                model = models[model_key]
                is_numerator = model_key.startswith('n')
                group_name = model_key[1:] if model_key != "n" else ""
                if weight_type == "switch":
                    desc = f"P(treatment = 1 | previous treatment = {group_name}) for {'numerator' if is_numerator else 'denominator'}"
                else:
                    if model_key == "n":
                        desc = "P(censor_event = 0 | X) for numerator"  # ITT pooled numerator
                    else:
                        desc = f"P(censor_event = 0 | X, previous treatment = {group_name}) for {'numerator' if is_numerator else 'denominator'}"
                print(f"[{model_key}]")
                print(f"Model: {desc}")
                print(" ")
                print("term        estimate   std.error statistic p.value")
                params = model.params
                std_err = model.bse
                tvalues = model.tvalues
                pvalues = model.pvalues
                for term in params.index:
                    print(f"{term:<12} {params[term]:>9.7f} {std_err[term]:>9.7f} {tvalues[term]:>9.6f} {pvalues[term]:.6e}")
                print(" ")
                print(f"null.deviance df.null logLik    AIC      BIC      deviance df.residual nobs")
                print(f"{model.null_deviance:.7f} {int(model.nobs-1):>7} {model.llf:>9.4f} {model.aic:>7.4f} {model.bic:>9.4f} {model.deviance:>7.4f} {int(model.df_resid):>7} {int(model.nobs):>7}")
                print(" \n")
# Execute calculations
calculate_weights(trial_pp, "switch")
calculate_weights(trial_pp, "censor")
calculate_weights(trial_itt, "censor")

# Display results
print("Weight Models for trial_itt:")
show_weight_models(trial_itt)
print("\nWeight Models for trial_pp:")
show_weight_models(trial_pp)

Weight Models for trial_itt:
Weight Models for Informative Censoring
---------------------------------------

[n]
Model: P(censor_event = 0 | X) for numerator
 
term        estimate   std.error statistic p.value
Intercept    2.4480907 0.1405747 17.414876 6.362614e-68
x2           -0.4486482 0.1368779 -3.277724 1.046476e-03
 
null.deviance df.null logLik    AIC      BIC      deviance df.residual nobs
404.2155891     724 -196.7002 397.4004  406.5727 393.4004     723     725
 

[d0]
Model: P(censor_event = 0 | X, previous treatment = 0) for denominator
 
term        estimate   std.error statistic p.value
Intercept    1.8941961 0.2071136  9.145686 5.925282e-20
x2           -0.5898292 0.1693423 -3.483059 4.957189e-04
x1           0.8552603 0.3452990  2.476869 1.325407e-02
 
null.deviance df.null logLik    AIC      BIC      deviance df.residual nobs
283.0722903     425 -132.1655 270.3309  282.4943 264.3309     423     426
 

[d1]
Model: P(censor_event = 0 | X, previous treatment = 1) for den

---

In [221]:
# Step 5: Specify Outcome Model
def set_outcome_model(trial, adjustment_terms=None):
    base_formula = "outcome ~ assigned_treatment"
    
    if trial["estimand"] == "PP" and "switch_config" in trial:
        numerator_vars = trial["switch_config"]["numerator_vars"]
        additional_terms = " + ".join(numerator_vars)
        trial["outcome_model"] = f"{base_formula} + {additional_terms}"
    elif adjustment_terms:
        trial["outcome_model"] = f"{base_formula} + {adjustment_terms}"
    else:
        trial["outcome_model"] = base_formula
    
    print(f"Outcome model for {trial['estimand']}: {trial['outcome_model']}")
    return trial

trial_pp = set_outcome_model(trial_pp)
trial_itt = set_outcome_model(trial_itt, adjustment_terms="x2")

# Step 6: Expand Trials
def set_expansion_options(trial, chunk_size=500, output_format="dataframe"):
    trial["expansion_options"] = {
        "chunk_size": chunk_size,
        "output_format": output_format
    }
    print(f"Expansion options set for {trial['estimand']}: chunk_size={chunk_size}, output_format={output_format}")
    return trial

def expand_trials(trial):
    data = trial["data"]
    chunk_size = trial["expansion_options"]["chunk_size"]
    expanded_data = []
    
    for start in range(0, len(data), chunk_size):
        chunk = data.iloc[start:start + chunk_size].copy()
        for treatment in [0, 1]:
            chunk_copy = chunk.copy()
            chunk_copy["assigned_treatment"] = treatment
            # Ensure trial_period is present
            if "period" in chunk_copy.columns:
                chunk_copy["trial_period"] = chunk_copy["period"]
            else:
                chunk_copy["trial_period"] = start // chunk_size  # Example: period based on chunk index
            expanded_data.append(chunk_copy)
    
    trial["expanded_data"] = pd.concat(expanded_data, ignore_index=True)
    print(f"Expanded data for {trial['estimand']}: {len(trial['expanded_data'])} rows")
    return trial

trial_pp = set_expansion_options(trial_pp, chunk_size=500)
trial_itt = set_expansion_options(trial_itt, chunk_size=500)

trial_pp = expand_trials(trial_pp)
trial_itt = expand_trials(trial_itt)

# Initialize trial_itt
trial_itt = {"estimand": "ITT", "dir": trial_itt_dir}

# Ensure all functions return the updated trial
trial_itt = set_data(trial_itt, data_censored, "id", "period", "treatment", "outcome", "eligible")
# Add print to debug
print("trial_itt before set_outcome_model:", trial_itt)

# Call set_outcome_model
trial_itt = set_outcome_model(trial_itt, adjustment_terms="x2")

def display_expanded_data(trial, chunk_size):
    """
    Display the expanded trial data in a format similar to the R output.
    
    Parameters:
    - trial: Dictionary containing trial settings and expanded data (as a DataFrame)
    - chunk_size: The chunk size used for expansion
    """
    # Extract the expanded data DataFrame
    expanded_data = trial["expanded_data"]
    
    # Define the columns to display (based on the R output)
    display_columns = ["id", "trial_period", "followup_time", "outcome", "weight", 
                       "treatment", "x2", "age", "assigned_treatment"]
    
    # Check if all required columns exist in the data
    for col in display_columns:
        if col not in expanded_data.columns:
            print(f"Warning: Column '{col}' not found in expanded data.")
            return
    
    # Create a subset of the data with the selected columns
    display_data = expanded_data[display_columns]
    
    # Display summary information
    print("## Sequence of Trials Data: ")
    print(f"## - Chunk size: {chunk_size} ")
    print("## - Censor at switch: TRUE ")
    print("## - First period: 0 | Last period: Inf ")
    print("##  ")
    print("## A TE Datastore Datatable object ")
    print(f"## N: {len(display_data)} observations ")
    
    # Display the first two and last two rows
    head = display_data.head(2)
    tail = display_data.tail(2)
    display_df = pd.concat([head, tail])
    
    # Print column headers with fixed width
    print("## ", end="")
    for col in display_columns:
        print(f"{col:<15}", end="")
    print()
    
    # Print data types based on column content
    print("## ", end="")
    for col in display_columns:
        if col in ["id", "trial_period", "followup_time", "outcome", "treatment", "assigned_treatment"]:
            print(f"{'<int>':<15}", end="")
        else:
            print(f"{'<num>':<15}", end="")
    print()
    
    # Print the data rows
    for idx, row in enumerate(display_df.itertuples(index=False), 1):
        # Adjust index for tail rows
        if idx > 2:
            idx = len(display_data) - (4 - idx)
        print(f"## {idx:>3}: ", end="")
        for i, col in enumerate(display_columns):
            value = getattr(row, col)
            if col in ["id", "trial_period", "followup_time", "outcome", "treatment", "assigned_treatment"]:
                print(f"{int(value):>10}", end=" ")
            else:
                print(f"{value:>10.7f}", end=" ")
        print()
        # Add separator after the second row
        if idx == 2:
            print("## ---")
            
print("PEEPEE")

display_expanded_data(trial_pp, chunk_size=500)

Outcome model for PP: outcome ~ assigned_treatment + age
Outcome model for ITT: outcome ~ assigned_treatment + x2
Expansion options set for PP: chunk_size=500, output_format=dataframe
Expansion options set for ITT: chunk_size=500, output_format=dataframe
Expanded data for PP: 1450 rows
Expanded data for ITT: 1450 rows
trial_itt before set_outcome_model: {'estimand': 'ITT', 'dir': 'c:\\Users\\User\\Documents\\GitHub\\Clustering-Assignment-1\\trial_itt', 'data':      id  period  treatment  x1        x2  x3        x4  age     age_s  \
0     1       0          1   1  1.146148   0  0.734203   36  0.083333   
1     1       1          1   1  0.002200   0  0.734203   37  0.166667   
2     1       2          1   0 -0.481762   0  0.734203   38  0.250000   
3     1       3          1   0  0.007872   0  0.734203   39  0.333333   
4     1       4          1   1  0.216054   0  0.734203   40  0.416667   
..   ..     ...        ...  ..       ...  ..       ...  ...       ...   
720  99       3         

---