# Using Cohort Generator

This notebook follows the example from the CohortGenerator repository: https://github.com/OHDSI/CohortGenerator

by Anja van Gestel

In [None]:
## in R:
# remotes::install_github("OHDSI/CohortGenerator")

%load_ext autoreload
%autoreload 2

import os
os.environ["R_HOME"] = r"C:\Program Files\R\R-4.3.1" # change as needed
from pathlib import Path
import pandas as pd

from ohdsi import circe
from ohdsi import common
from ohdsi import cohort_generator
from ohdsi import database_connector

# Introduction

CohortGenerator is an R package, called here from Python through `ohdsi.cohort_generator`, that contains functions for generating cohorts using data in the CDM.

# Features

- Create a cohort table and generate cohorts against an OMOP CDM.
- Get the count of subjects and events in a cohort.
- Provides functions for performing incremental tasks. This is used by CohortGenerator to skip any cohorts that were successfully generated in a previous run. This functionality is generic enough for other packages to use for performing their own incremental tasks.

# Your own parameters

For these tutorials I am using a PostgreSQL database, which I created using Synthea (https://github.com/synthetichealth/synthea). My database is called 'synthea10'; it has a schema 'cdm_synthea10', which contains the OMOP CDM tables, and another schema 'results', which I use for results, cohorts, etc.

For the code in this tutorial to work, you should of course substitute your own database, schemas, user, and password.

In [None]:
# Settings for the tutorial database; replace them with your own values.
cdm_database = "synthea10"      # name of the PostgreSQL database
cdm_schema = "cdm_synthea10"    # schema that contains the OMOP CDM tables
results_schema = "results"      # schema used for results / cohort tables
user = "postgres"               # database login
password = "password"           # database password

# Connect to your database

In [None]:
# Describe how to reach the database. The database name is appended to the
# host in the server string ("localhost/<database>").
connection_details = database_connector.create_connection_details(
    dbms="postgresql",
    server=f"localhost/{cdm_database}",
    # user and password are already strings, so they are passed as-is
    # instead of being wrapped in redundant f-strings.
    user=user,
    password=password,
)

# Open the connection; it is reused when creating the cohort tables and is
# closed again in the final cell.
conn = database_connector.connect(connection_details)

# Get cohort definition and query

In [None]:
# Read the cohort definition from disk.
# The encoding is passed explicitly: JSON files are UTF-8, whereas
# read_text() without an encoding uses the platform's locale encoding
# (e.g. cp1252 on Windows), which can garble or reject non-ASCII characters.
cohort_json_filename = r"./input/death_cohort_with_concept_sets.json"
cohort_json = Path(cohort_json_filename).read_text(encoding="utf-8")

# Turn the JSON text into a cohort expression object.
cohort_expression = circe.cohort_expression_from_json(cohort_json)

# Build the cohort query from the expression, using default generate options.
options = circe.create_generate_options()
cohort_sql = circe.build_cohort_query(cohort_expression, options)

# Empty cohort definition set

In [None]:
# Ask CohortGenerator for an empty cohort definition set.
cohorts_to_create = cohort_generator.create_empty_cohort_definition_set()
# Convert the R result for Python; as the last expression of the cell its
# value is shown as the cell output (presumably to inspect the expected
# columns before filling the set below).
common.convert_from_r(cohorts_to_create)

# Fill the cohort definition set

In [None]:
# Build a cohort definition set that contains exactly one cohort (one row).
# An explicit one-element index is required: when every value in the dict is
# a scalar (which is the case if cohort_sql is a plain string), pandas raises
# "ValueError: If using all scalar values, you must pass an index".
cohorts_to_create = pd.DataFrame(
    {
        'cohortId': 55,
        'cohortName': 'Death cohort',
        'sql': cohort_sql,
    },
    index=[0],
)

print(cohorts_to_create)

# Create cohort tables

In [None]:
# Fetch the cohort table names, then create those tables in the results
# schema using the already opened connection.
table_names = cohort_generator.get_cohort_table_names()
res = cohort_generator.create_cohort_tables(
    connection=conn,
    cohort_database_schema=results_schema,
    cohort_table_names=table_names,
)

# Generating cohorts

In [None]:
# The cohort definition set has to be handed over in its R form.
cohorts_to_create = common.convert_to_r(cohorts_to_create)

# Generate the cohorts of the definition set: the CDM schema is the data
# source, the results schema holds the cohort tables.
cohort_generator.generate_cohort_set(
    connection_details=connection_details,
    cdm_database_schema=cdm_schema,
    cohort_database_schema=results_schema,
    cohort_table_names=table_names,
    cohort_definition_set=cohorts_to_create,
)

# Get cohort counts

In [None]:
# Retrieve the counts for the generated cohorts from the results schema.
# NOTE(review): the table name 'cohort' is hard-coded; presumably it matches
# the cohort table in table_names - confirm.
cohort_counts = cohort_generator.get_cohort_counts(
    connection_details=connection_details,
    cohort_database_schema=results_schema,
    cohort_table='cohort',
)

# Convert the R result for Python; shown as the cell output.
common.convert_from_r(cohort_counts)

# Disconnect

In [None]:
# Close the connection that was opened at the start of the notebook.
database_connector.disconnect(conn)

# Signal that the notebook ran to the end.
print("Done")