In [5]:
import pandas as pd

# Leer los archivos 'drug_atc' (CID - ATC) y 'drug_names' (ATC - name), y combinar ambos ordenando con base en el CID

drug_atc = pd.read_csv("drug_atc.tsv", sep="\t")
drug_names = pd.read_csv("drug_names.tsv", sep="\t")

combined_df = pd.merge(drug_atc, drug_names, on="cid")

# Mantener aquellas filas donde el ATC inicie con N06A (antidepressants)

antidepressants = combined_df[combined_df["atc"].str.startswith("N06A")]

display(antidepressants)

# Guardar archivo

antidepressants.to_csv("antidepressants.tsv", sep="\t", index=False)

Unnamed: 0,cid,atc,pert_iname
51,CID100000444,N06AX12,bupropion
159,CID100002160,N06AA09,amitriptyline
165,CID100002170,N06AA17,amoxapine
307,CID100002771,N06AB04,citalopram
308,CID100002771,N06AB10,citalopram
319,CID100002801,N06AA04,clomipramine
355,CID100002995,N06AA01,desipramine
419,CID100003155,N06AA16,dothiepin
422,CID100003158,N06AA12,doxepin
493,CID100003386,N06AB03,fluoxetine


In [6]:
# Definir una función para combinar los dfs de los archivos de perturbaciones, experimentos y líneas celulares

def add_LINCS_info(antidepressants, pert_info, inst_info, cell_info):

    # Añadir el pert_id de los antidepresivos al archivo anterior, basado en el nombre de la perturbación (pert_iname = name)
    antidepressants = pd.merge(antidepressants, pert_info[["pert_id", "pert_iname"]], on="pert_iname")

    # Contar el número de perturbaciones por cada pert_id (NOTA: un pert_iname puede tener múltiples pert_ids asociados), y añadir la columna
    name_counts = pert_info.groupby("pert_iname")["pert_id"].count().reset_index()
    name_counts.rename(columns={"pert_id": "pert_count"}, inplace=True)
    antidepressants = pd.merge(antidepressants, name_counts, on="pert_iname")

    # Contar el número de líneas celulares por cada pert_id (NOTA: un pert_id puede tener múltiples líneas celulares asociadas), y añadir la columna
    cell_line_counts = inst_info.groupby("pert_id")["cell_id"].nunique().reset_index()
    cell_line_counts.rename(columns={"cell_id": "cell_id_count"}, inplace=True)
    antidepressants = pd.merge(antidepressants, cell_line_counts, on="pert_id")

    # Añadir la información del archivo de experimentos, basado en el id de la perturbación (pert_id)
    antidepressants = pd.merge(antidepressants, inst_info.drop(columns=["pert_iname"]), on="pert_id")

    # Añadir la información del archivo de células, basado en el id de la línea celular (cell_id)
    antidepressants = pd.merge(antidepressants, cell_info, on="cell_id")

    return antidepressants

# Fase 1 (GSE92742)

In [7]:
# Leer los archivos necesarios para la fase 1

pert_info = pd.read_csv("GSE92742_Broad_LINCS_pert_info.txt", sep="\t")
inst_info = pd.read_csv("GSE92742_Broad_LINCS_inst_info.txt", sep="\t")
cell_info = pd.read_csv("GSE92742_Broad_LINCS_cell_info.txt", sep="\t")

# Emplear la función definida para crear un df con los datos de la fase 1

antidepressants_1 = add_LINCS_info(antidepressants, pert_info, inst_info, cell_info)

display(antidepressants_1)

# Guardar el archivo con los datos de la fase 1

antidepressants_1.to_csv("LINCS_ad_phase1.tsv", sep="\t", index=False)

  inst_info = pd.read_csv("GSE92742_Broad_LINCS_inst_info.txt", sep="\t")


Unnamed: 0,cid,atc,pert_iname,pert_id,pert_count,cell_id_count,inst_id,rna_plate,rna_well,pert_type,...,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
0,CID100000444,N06AX12,bupropion,BRD-A05186015,1,13,CPC004_A375_6H_X1_B3_DUO52HI53LO:J08,CPC004_A375_6H_X1,J08,trt_cp,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
1,CID100000444,N06AX12,bupropion,BRD-A05186015,1,13,CPC004_A375_6H_X2_B3_DUO52HI53LO:J08,CPC004_A375_6H_X2,J08,trt_cp,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
2,CID100000444,N06AX12,bupropion,BRD-A05186015,1,13,CPC004_A375_6H_X3_B3_DUO52HI53LO:J08,CPC004_A375_6H_X3,J08,trt_cp,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
3,CID100000444,N06AX12,bupropion,BRD-A05186015,1,13,CPC015_A375_6H_X1_B4_DUO52HI53LO:L09,CPC015_A375_6H_X1,L09,trt_cp,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
4,CID100000444,N06AX12,bupropion,BRD-A05186015,1,13,CPC015_A375_6H_X2_B4_DUO52HI53LO:L09,CPC015_A375_6H_X2,L09,trt_cp,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3936,CID100005530,N06AF04,tranylcypromine,BRD-K88809146,3,5,NMH002_FIBRNPC_6H_X3_B6_DUO52HI53LO:H08,NMH002_FIBRNPC_6H_X3,H08,trt_cp,...,-666,normal,skin,-666,adherent,-666,-666,-666,-666,-666
3937,CID100065856,N06AX18,reboxetine,BRD-K32814891,3,3,NMH001_FIBRNPC_6H_X1_B6_DUO52HI53LO:N11,NMH001_FIBRNPC_6H_X1,N11,trt_cp,...,-666,normal,skin,-666,adherent,-666,-666,-666,-666,-666
3938,CID100065856,N06AX18,reboxetine,BRD-K32814891,3,3,NMH001_FIBRNPC_24H_X1_B6_DUO52HI53LO:N11,NMH001_FIBRNPC_24H_X1,N11,trt_cp,...,-666,normal,skin,-666,adherent,-666,-666,-666,-666,-666
3939,CID100065856,N06AX18,reboxetine,BRD-K32814891,3,3,NMH001_FIBRNPC_24H_X3_B6_DUO52HI53LO:N11,NMH001_FIBRNPC_24H_X3,N11,trt_cp,...,-666,normal,skin,-666,adherent,-666,-666,-666,-666,-666


# Fase 2 (GSE70138)

In [8]:
# Leer los archivos necesarios para la fase 2

pert_info = pd.read_csv("GSE70138_Broad_LINCS_pert_info.txt", sep="\t")
inst_info = pd.read_csv("GSE70138_Broad_LINCS_inst_info.txt", sep="\t")
cell_info = pd.read_csv("GSE70138_Broad_LINCS_cell_info.txt", sep="\t")

# Emplear la función definida para crear un df con los datos de la fase 2

antidepressants_2 = add_LINCS_info(antidepressants, pert_info, inst_info, cell_info)

display(antidepressants_2)

# Guardar el archivo con los datos de la fase 2

antidepressants_2.to_csv("LINCS_ad_phase2.tsv", sep="\t", index=False)

Unnamed: 0,cid,atc,pert_iname,pert_id,pert_count,cell_id_count,inst_id,cell_id,det_plate,det_well,...,modification,sample_type,primary_site,subtype,original_growth_pattern,provider_catalog_id,original_source_vendor,donor_age,donor_sex,donor_ethnicity
0,CID100000444,N06AX12,bupropion,BRD-A05186015,1,7,REP.A011_A375_24H_X1_B24:F01,A375,REP.A011_A375_24H_X1_B24,F01,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
1,CID100000444,N06AX12,bupropion,BRD-A05186015,1,7,REP.A011_A375_24H_X1_B24:F02,A375,REP.A011_A375_24H_X1_B24,F02,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
2,CID100000444,N06AX12,bupropion,BRD-A05186015,1,7,REP.A011_A375_24H_X1_B24:F03,A375,REP.A011_A375_24H_X1_B24,F03,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
3,CID100000444,N06AX12,bupropion,BRD-A05186015,1,7,REP.A011_A375_24H_X1_B24:F04,A375,REP.A011_A375_24H_X1_B24,F04,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
4,CID100000444,N06AX12,bupropion,BRD-A05186015,1,7,REP.A011_A375_24H_X1_B24:F05,A375,REP.A011_A375_24H_X1_B24,F05,...,-666,tumor,skin,malignant melanoma,adherent,CRL-1619,ATCC,54,F,-666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3649,CID100005530,N06AF04,tranylcypromine,BRD-K47029922,1,19,LJP008_SKL_24H_X3_B21:N08,SKL,LJP008_SKL_24H_X3_B21,N08,...,-666,primary,muscle,normal primary skeletal muscle cells,adherent,CC-2561,LONZA,-666,-666,-666
3650,CID100005530,N06AF04,tranylcypromine,BRD-K47029922,1,19,LJP008_SKL_24H_X3_B21:N09,SKL,LJP008_SKL_24H_X3_B21,N09,...,-666,primary,muscle,normal primary skeletal muscle cells,adherent,CC-2561,LONZA,-666,-666,-666
3651,CID100005530,N06AF04,tranylcypromine,BRD-K47029922,1,19,LJP008_SKL_24H_X3_B21:N10,SKL,LJP008_SKL_24H_X3_B21,N10,...,-666,primary,muscle,normal primary skeletal muscle cells,adherent,CC-2561,LONZA,-666,-666,-666
3652,CID100005530,N06AF04,tranylcypromine,BRD-K47029922,1,19,LJP008_SKL_24H_X3_B21:N11,SKL,LJP008_SKL_24H_X3_B21,N11,...,-666,primary,muscle,normal primary skeletal muscle cells,adherent,CC-2561,LONZA,-666,-666,-666
