In [107]:
import pandas as pd

In [108]:
data = pd.read_csv("isolates.csv")
data.head(10)

Unnamed: 0,#Organism group,Isolate,AMR genotypes,AST phenotypes
0,Listeria monocytogenes,PDT000077416.3,"fosX=COMPLETE,lin=COMPLETE","chloramphenicol=S,clindamycin=R,erythromycin=S..."
1,Listeria monocytogenes,PDT000095192.3,"fosX=COMPLETE,lin=COMPLETE","ampicillin=S,penicillin=S"
2,Salmonella enterica,PDT000003687.3,"mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
3,Salmonella enterica,PDT000003688.4,"mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
4,Salmonella enterica,PDT000003689.4,"mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
5,Salmonella enterica,PDT000003690.3,"aph(3'')-Ib=COMPLETE,aph(6)-Id=COMPLETE,mdsA=C...","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
6,Salmonella enterica,PDT000003691.3,"mdsA=COMPLETE,mdsB=COMPLETE,tet(B)=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
7,Salmonella enterica,PDT000003692.3,"mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
8,Salmonella enterica,PDT000003693.3,"aph(3'')-Ib=COMPLETE,aph(6)-Id=COMPLETE,mdsA=C...","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
9,Salmonella enterica,PDT000003694.4,"fosA7=COMPLETE,mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."


In [109]:

def transform_dataframe(df):
    new_data = []

    for idx, row in df.iterrows():
        # split AMR genotypes and remove "=COMPLETE"
        amr_genotypes = [i.split('=')[0] for i in row['AMR genotypes'].split(',')]

        # split AST phenotypes
        ast_phenotypes = row['AST phenotypes'].split(',')

        # process each phenotype
        for pheno in ast_phenotypes:
            drug, resistance = pheno.split('=')
            if resistance == 'R':
                resistance_score = 1
            elif resistance == 'S':
                resistance_score = 0
            else:  # assuming 'I' as per your description
                resistance_score = 0.5

            # create a new row
            new_row = {'#Organism group': row['#Organism group'],
                       'Isolate': row['Isolate'],
                       'AMR genotypes': ', '.join(amr_genotypes),
                       'drug': drug,
                       'resistance': resistance_score}
            new_data.append(new_row)

    # create a new dataframe
    new_df = pd.DataFrame(new_data)

    return new_df


In [110]:
new_dataframe = transform_dataframe(data)


In [111]:
new_dataframe.head(10)

Unnamed: 0,#Organism group,Isolate,AMR genotypes,drug,resistance
0,Listeria monocytogenes,PDT000077416.3,"fosX, lin",chloramphenicol,0.0
1,Listeria monocytogenes,PDT000077416.3,"fosX, lin",clindamycin,1.0
2,Listeria monocytogenes,PDT000077416.3,"fosX, lin",erythromycin,0.0
3,Listeria monocytogenes,PDT000077416.3,"fosX, lin",gentamicin,0.0
4,Listeria monocytogenes,PDT000077416.3,"fosX, lin",levofloxacin,0.0
5,Listeria monocytogenes,PDT000077416.3,"fosX, lin",oxacillin,1.0
6,Listeria monocytogenes,PDT000077416.3,"fosX, lin",penicillin,0.0
7,Listeria monocytogenes,PDT000077416.3,"fosX, lin",rifampin,0.0
8,Listeria monocytogenes,PDT000077416.3,"fosX, lin",tetracycline,0.0
9,Listeria monocytogenes,PDT000077416.3,"fosX, lin",trimethoprim-sulfamethoxazole,0.0


In [112]:
new_dataframe.shape

(316071, 5)

In [113]:
new_dataframe.to_csv("BasicData.csv",index=False)

In [114]:
def count_unique_genotypes(df):
    unique_genotypes = set()

    for idx, row in df.iterrows():
        genotypes = row['AMR genotypes'].split(', ')
        unique_genotypes.update(genotypes)

    return len(unique_genotypes)
unique_genotype_count = count_unique_genotypes(new_dataframe)
print("Number of unique genotypes:", unique_genotype_count)

Number of unique genotypes: 1213


In [118]:
def gene_per_drug(drug, df):
    drug_df = df[df['drug'] == drug]
    unique_genes = set()
    for genes in drug_df['AMR genotypes'].str.split(', '):
        unique_genes.update(genes)
    return drug_df, unique_genes

In [126]:
import numpy as np

def gene_1_hot_encoder(df, unique_genes):
    df['Gene Array'] = None
    # Add the 'Count Ones' column to the DataFrame
    df['Count Ones'] = None
    for idx, row in df.iterrows():
        gene_array = np.zeros(len(unique_genes), dtype=int)
        genes = row['AMR genotypes'].split(', ')
        for gene in genes:
            # Find the index of each gene
            gene_index = list(unique_genes).index(gene)
            # Set the corresponding index in the array to 1
            gene_array[gene_index] = 1

        # Add the gene array as a new column to the current row in the DataFrame
        df.at[idx, 'Gene Array'] = gene_array.tolist()

        # Count the number of '1' values in the gene_array
        count_ones = np.count_nonzero(gene_array)

        # Add the count as a new column to the current row in the DataFrame
        df.at[idx, 'Count Ones'] = count_ones

    return df.reset_index()

In [127]:
oxacillin_df,oxacillin_genes =   gene_per_drug('oxacillin', new_dataframe)

In [128]:
gene_1_hot_encoder(oxacillin_df, oxacillin_genes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Gene Array'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Count Ones'] = None


Unnamed: 0,#Organism group,Isolate,AMR genotypes,drug,resistance,Gene Array,Count Ones
5,Listeria monocytogenes,PDT000077416.3,"fosX, lin",oxacillin,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
202266,E.coli and Shigella,PDT001126241.1,"acrF, aph(3'')-Ib, aph(6)-Id, blaCTX-M-55, bla...",oxacillin,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",17
202279,E.coli and Shigella,PDT001126242.1,"aac(3)-IIe, aac(6')-Ib-cr5, aadA5, acrF, blaCT...",oxacillin,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",22
202292,E.coli and Shigella,PDT001126243.1,"aadA5, acrF, blaEC, blaTEM-1, catA1, cyaA_S352...",oxacillin,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",12
233973,E.coli and Shigella,PDT001398459.1,"acrF, blaCTX-M-27, blaEC, glpT_E448K, gyrA_D87...",oxacillin,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",13
...,...,...,...,...,...,...,...
310860,Staphylococcus aureus,PDT001785232.1,"abc-f, erm(C), fosB, gyrA_S84L, mecA, mecR1, m...",oxacillin,1.0,"[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",11
314382,Streptococcus pneumoniae,PDT001128122.1,"pbp2b, pbp2x, pmrA",oxacillin,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3
314420,Pluralibacter gergoviae,PDT001767548.1,"aac(6')-Ib-cr5, aac(6')-Ib, aadA1, aph(3')-VI,...",oxacillin,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",25
314504,Stenotrophomonas maltophilia,PDT001398931.1,"aph(3')-IIc, aph(6), arr, blaL1, blaL2, emrA, ...",oxacillin,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9


In [135]:
oxacillin_df= oxacillin_df.reset_index()

In [144]:
gene_array = oxacillin_df[oxacillin_df['Isolate'] == 'PDT001128216.2']['Gene Array'].values[0]
length = len(gene_array)
print(length)


211
