In [4]:
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

2023-07-17 17:17:07.603220: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
def clean_gene(df, column):
    # Copy the dataframe to avoid modifying the original
    df = df.copy()

    # Replace hyphen '-' and '(' ')' with nothing
    df[column] = df[column].str.replace(r'[-\(\)\'\?\_]', '', regex=True)

    # Convert all upper case to lower case
    df[column] = df[column].str.lower()

    return df


In [6]:
val_data = pd.read_csv("Referece_gene_catalogue_resistance_amr_validation.csv")
val_data.head(10)

Unnamed: 0,Gene family,Class,Subclass
0,aac(2')-Ia,AMINOGLYCOSIDE,GENTAMICIN/TOBRAMCYIN
1,aac(2')-Ib,AMINOGLYCOSIDE,GENTAMICIN/TOBRAMCYIN
2,aac(2')-Ic,AMINOGLYCOSIDE,GENTAMICIN/TOBRAMCYIN
3,aac(2')-Id,AMINOGLYCOSIDE,GENTAMICIN/TOBRAMCYIN
4,aac(2')-Ie,AMINOGLYCOSIDE,GENTAMICIN/TOBRAMCYIN
5,aac(2')-IIa,AMINOGLYCOSIDE,KASUGAMYCIN
6,aac(2')-IIb,AMINOGLYCOSIDE,KASUGAMYCIN
7,aac(3)-I,AMINOGLYCOSIDE,GENTAMICIN
8,aac(3)-I,AMINOGLYCOSIDE,GENTAMICIN
9,aac(3)-I,AMINOGLYCOSIDE,GENTAMICIN


In [7]:
val_data.drop('Class', axis=1, inplace=True)
val_data.drop_duplicates(inplace=True)
val_data['Subclass'] = val_data['Subclass'].str.lower()
val_data['Subclass'] = val_data['Subclass'].str.split('/')
val_data = val_data.explode('Subclass')
val_data['Gene family'] = val_data['Gene family'].str.split('/')
val_data = val_data.explode('Gene family')
val_data.drop_duplicates(inplace=True)
val_data.dropna(inplace=True)
val_data = val_data.reset_index()
val_data = clean_gene(val_data,'Gene family')
val_data

Unnamed: 0,index,Gene family,Subclass
0,0,aac2ia,gentamicin
1,0,aac2ia,tobramcyin
2,1,aac2ib,gentamicin
3,1,aac2ib,tobramcyin
4,2,aac2ic,gentamicin
...,...,...,...
1247,6668,vmlr,lincosamide
1248,6668,vmlr,streptogramin
1249,6668,vmlr,tiamulin
1250,6669,vph,viomycin


In [8]:
val_data.to_csv("val_data.csv", index=False)

In [9]:
data = pd.read_csv(r'isolates.csv')
data.head(10)

Unnamed: 0,#Organism group,Isolate,AMR genotypes,AST phenotypes
0,Listeria monocytogenes,PDT000077416.3,"fosX=COMPLETE,lin=COMPLETE","chloramphenicol=S,clindamycin=R,erythromycin=S..."
1,Listeria monocytogenes,PDT000095192.3,"fosX=COMPLETE,lin=COMPLETE","ampicillin=S,penicillin=S"
2,Salmonella enterica,PDT000003687.3,"mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
3,Salmonella enterica,PDT000003688.4,"mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
4,Salmonella enterica,PDT000003689.4,"mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
5,Salmonella enterica,PDT000003690.3,"aph(3'')-Ib=COMPLETE,aph(6)-Id=COMPLETE,mdsA=C...","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
6,Salmonella enterica,PDT000003691.3,"mdsA=COMPLETE,mdsB=COMPLETE,tet(B)=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
7,Salmonella enterica,PDT000003692.3,"mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
8,Salmonella enterica,PDT000003693.3,"aph(3'')-Ib=COMPLETE,aph(6)-Id=COMPLETE,mdsA=C...","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."
9,Salmonella enterica,PDT000003694.4,"fosA7=COMPLETE,mdsA=COMPLETE,mdsB=COMPLETE","amikacin=S,amoxicillin-clavulanic acid=S,ampic..."


In [10]:
anti_family = pd.read_csv("gene_anti_family.csv")
anti_family.head(10)

  anti_family = pd.read_csv("gene_anti_family.csv")


Unnamed: 0,gene family,gene,antibiotic,drug_class,S,R
0,ANT(2''),ANT(2'')-Ia,spectinomycin,aminoglycoside,,1.0
1,ANT(2''),ANT(2'')-Ia,trimethoprim-sulfamethoxazole,sulfonamide,,
2,ANT(2''),ANT(2'')-Ia,azithromycin,macrolide,,
3,ANT(2''),ANT(2'')-Ia,ceftazidime-avibactam,beta-lactamase,,
4,ANT(2''),ANT(2'')-Ia,piperacillin,Penicillin,,
5,ANT(2''),ANT(2'')-Ia,ticarcillin,Penicillin,,
6,ANT(2''),ANT(2'')-Ia,kanamycin,Aminoglycoside,,1.0
7,ANT(2''),ANT(2'')-Ia,oxacillin,Penicillin,,
8,ANT(2''),ANT(2'')-Ia,tedizolid,Oxazolidinone,,
9,ANT(2''),ANT(2'')-Ia,cefiderocol,beta-lactamase,,


In [11]:
anti_family = clean_gene(anti_family,'gene')
anti_family

Unnamed: 0,gene family,gene,antibiotic,drug_class,S,R
0,ANT(2''),ant2ia,spectinomycin,aminoglycoside,,1.0
1,ANT(2''),ant2ia,trimethoprim-sulfamethoxazole,sulfonamide,,
2,ANT(2''),ant2ia,azithromycin,macrolide,,
3,ANT(2''),ant2ia,ceftazidime-avibactam,beta-lactamase,,
4,ANT(2''),ant2ia,piperacillin,Penicillin,,
...,...,...,...,...,...,...
264379,,mcr2.3,ceftazidime,beta-lactamase,,
264380,,mcr2.3,amoxicillin-clavulanic acid,beta-lactamase,,
264381,,mcr2.3,streptomycin,Aminoglycoside,,
264382,,mcr2.3,moxifloxacin,Quinolone,,


In [12]:
def transform_dataframe(df):
    new_data = []

    for idx, row in df.iterrows():
        # split AMR genotypes and remove "=COMPLETE"
        amr_genotypes = [i.split('=')[0] for i in row['AMR genotypes'].split(',')]

        # split AST phenotypes
        ast_phenotypes = row['AST phenotypes'].split(',')

        # process each phenotype
        for pheno in ast_phenotypes:
            drug, resistance = pheno.split('=')
            if resistance == 'R':
                resistance_score = 1
            elif resistance == 'S':
                resistance_score = 0
            else:  # assuming 'I' as per your description
                resistance_score = 0.5

            # create a new row
            new_row = {'#Organism group': row['#Organism group'],
                       'Isolate': row['Isolate'],
                       'AMR genotypes': ', '.join(amr_genotypes),
                       'drug': drug,
                       'resistance': resistance_score}
            new_data.append(new_row)

    # create a new dataframe
    new_df = pd.DataFrame(new_data)

    return new_df


In [13]:
df = transform_dataframe(data)


In [14]:
df.to_csv("BasicData.csv",index=False)
df.head(10)

Unnamed: 0,#Organism group,Isolate,AMR genotypes,drug,resistance
0,Listeria monocytogenes,PDT000077416.3,"fosX, lin",chloramphenicol,0.0
1,Listeria monocytogenes,PDT000077416.3,"fosX, lin",clindamycin,1.0
2,Listeria monocytogenes,PDT000077416.3,"fosX, lin",erythromycin,0.0
3,Listeria monocytogenes,PDT000077416.3,"fosX, lin",gentamicin,0.0
4,Listeria monocytogenes,PDT000077416.3,"fosX, lin",levofloxacin,0.0
5,Listeria monocytogenes,PDT000077416.3,"fosX, lin",oxacillin,1.0
6,Listeria monocytogenes,PDT000077416.3,"fosX, lin",penicillin,0.0
7,Listeria monocytogenes,PDT000077416.3,"fosX, lin",rifampin,0.0
8,Listeria monocytogenes,PDT000077416.3,"fosX, lin",tetracycline,0.0
9,Listeria monocytogenes,PDT000077416.3,"fosX, lin",trimethoprim-sulfamethoxazole,0.0


In [15]:
df = clean_gene(df, 'AMR genotypes')
df

Unnamed: 0,#Organism group,Isolate,AMR genotypes,drug,resistance
0,Listeria monocytogenes,PDT000077416.3,"fosx, lin",chloramphenicol,0.0
1,Listeria monocytogenes,PDT000077416.3,"fosx, lin",clindamycin,1.0
2,Listeria monocytogenes,PDT000077416.3,"fosx, lin",erythromycin,0.0
3,Listeria monocytogenes,PDT000077416.3,"fosx, lin",gentamicin,0.0
4,Listeria monocytogenes,PDT000077416.3,"fosx, lin",levofloxacin,0.0
...,...,...,...,...,...
316066,Enterobacter roggenkampii,PDT000898827.2,"blamir, cata, fosa, mcr10.1, oqxa, oqxb",tetracycline,0.0
316067,Enterobacter roggenkampii,PDT000898827.2,"blamir, cata, fosa, mcr10.1, oqxa, oqxb",trimethoprim-sulfamethoxazole,0.0
316068,Enterobacter roggenkampii,PDT001161812.2,"blamir16, cata, fosa, oqxb",ertapenem,0.0
316069,Enterobacter roggenkampii,PDT001161812.2,"blamir16, cata, fosa, oqxb",imipenem,0.5


In [16]:
df.to_csv("BasicData_clean.csv",index=False)

In [17]:
def create_empty_gene_antibiotic_df(df,unique_all_genes,unique_all_antibiotics):
    for idx, row in df.iterrows():
        genotypes = row['AMR genotypes'].split(', ')
        antibiotic = row['drug']

        unique_all_genes.extend(genotypes)
        unique_all_antibiotics.append(antibiotic)

    unique_all_genes = list(set(unique_all_genes))  # Convert to list
    unique_all_antibiotics = list(set(unique_all_antibiotics))  # Convert to list

    gene_antibiotic_df = pd.DataFrame(index=unique_all_genes, columns=unique_all_antibiotics)
    gene_antibiotic_df = gene_antibiotic_df.fillna(-1)

    return gene_antibiotic_df

# Assuming your DataFrame is named 'new_dataframe'
unique_all_genes = []
unique_all_antibiotics = []
gene_antibiotic_df = create_empty_gene_antibiotic_df(df,unique_all_genes,unique_all_antibiotics)
gene_antibiotic_df


Unnamed: 0,ceftizoxime,oxytetracycline,cefpodoxime,cefamandole,piperacillin,mecillinam,ticarcillin-clavulanic acid,nalidixic acid,cefazolin,gentamicin,...,linezolid,meropenem-vaborbactam,ertapenem,ceftazidime-avibactam,trimethoprim-sulfamethoxazole,florfenicol,chlortetracycline,benzylpenicillin,doxycycline,norfloxacin
qnrb10,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
cml,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaoxy26,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaadc159,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blalen9,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
blaoxa129,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
lin,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaoxa64,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaokpb3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [18]:
gene_antibiotic_df.to_csv("Empty_Gene_Antibiotic.csv", index=True, index_label=False)

***********************************************

### run from here

In [34]:
# def gene_per_drug(drug):
#     drug_df = df[df['drug'] == drug]
#     unique_genes = set()
#     for genes in drug_df['AMR genotypes'].str.split(', '):
#         unique_genes.update(genes)
#     return drug_df, unique_genes

In [108]:
df = pd.read_csv('BasicData_clean.csv')
val_data = pd.read_csv('val_data.csv')
gene_antibiotic_df = pd.read_csv('Empty_Gene_Antibiotic.csv')
anti_family = clean_gene(anti_family,'gene')

In [109]:
gene_antibiotic_df

Unnamed: 0,ticarcillin,ceftizoxime,netilmicin,ceftazidime,tobramycin,meropenem,levofloxacin,amoxicillin-clavulanic acid,polymyxin B,benzylpenicillin,...,colistin,amoxicillin,spectinomycin,oxacillin,cefotetan,plazomicin,enrofloxacin,doripenem,cefalexin,nitrofurantoin
50sl22g86e,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaadc99,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaact70,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blatem214,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaoxy27,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mefh,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaadc216,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaoxa421,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
blaoxa374,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [134]:
def gene_per_drug(drug):
    drug_df = df[df['drug'] == drug]
    unique_genes = set()
    for genes in drug_df['AMR genotypes'].str.split(', '):
        unique_genes.update(genes)

    # Get unique #Organism group values
    organism_groups = drug_df['#Organism group'].unique()

    # Initialize empty lists for train_df and test_df
    train_dfs = []
    test_dfs = []

    # Iterate over #Organism groups
    for group in organism_groups:
        group_df = drug_df[drug_df['#Organism group'] == group]

        if len(group_df) > 1:
            group_train_df, group_test_df = train_test_split(group_df, test_size=0.2, random_state=42)
        else:
            group_train_df = group_df
            group_test_df = group_df

        train_dfs.append(group_train_df)
        test_dfs.append(group_test_df)

    # Concatenate the train and test dataframes for all groups
    train_df = pd.concat(train_dfs)
    test_df = pd.concat(test_dfs)

    return train_df, test_df, unique_genes

# Rest of the code remains the same


In [135]:
def relevant_val_df(drug,unique_genes):
    relevant_df = val_data[(val_data['Subclass'] == drug) & (val_data['Gene family'].isin(unique_genes))]
    if relevant_df.shape[0]>1 :
        val_train, val_test = train_test_split(relevant_df, test_size=0.2, random_state=42)
        return val_train, val_test
    return None, None

In [19]:
# def create_gene_df(drug_df,val_df, unique_genes):
#     gene_arrays = []
#     labels = []
#
#     for idx, row in drug_df.iterrows():
#         gene_array = np.zeros(len(unique_genes), dtype=int)
#         genes = row['AMR genotypes'].split(', ')
#         for gene in genes:
#             gene_index = list(unique_genes).index(gene)
#             gene_array[gene_index] = 1
#
#         gene_arrays.append(gene_array.tolist())
#         labels.append(row['resistance'])
#
#     gene_df = pd.DataFrame(gene_arrays, columns=unique_genes)
#     gene_df['label'] = labels
#
#     if val_df is not None:
#         for idx, row in val_df.iterrows():
#             gene_array = np.zeros(len(unique_genes), dtype=int)
#             gene = row['Gene family']
#             gene_index = list(unique_genes).index(gene)
#             gene_array[gene_index] = 1
#
#             gene_arrays.append(gene_array.tolist())
#             labels.append(1)
#
#         gene_df = pd.DataFrame(gene_arrays, columns=unique_genes)
#         gene_df['label'] = labels
#
#     return gene_df.sample(frac=1).reset_index(drop=True)

In [209]:

def create_update_gene_df(drug_df, val_df, unique_genes, drug, zero_col):
    gene_arrays = []
    labels = []
    drug_r = set(anti_family.loc[(anti_family['antibiotic'] == drug) & (anti_family['R'] == 1), 'gene'].tolist())
    for _, row in drug_df.iterrows():
        genes = row['AMR genotypes'].split(', ')
        label = row['resistance']
        gene_array = np.zeros(len(unique_genes), dtype=int)

        for gene in genes:
            gene_index = list(unique_genes).index(gene)
            if gene in drug_r:
                gene_array[gene_index] = 1

        gene_arrays.append(gene_array.tolist())
        labels.append(label)

    gene_df = pd.DataFrame(gene_arrays, columns=unique_genes)
    gene_df['label'] = labels

    if val_df is not None:
        for idx, row in val_df.iterrows():
            gene_array = np.zeros(len(unique_genes), dtype=int)
            gene = row['Gene family']
            gene_index = list(unique_genes).index(gene)
            gene_array[gene_index] = 1

            gene_arrays.append(gene_array.tolist())
            labels.append(1)

        gene_df = pd.DataFrame(gene_arrays, columns=unique_genes)
        gene_df['label'] = labels


    if len(zero_col) == 0:
        zero_col = [col for col in gene_df.columns if col not in drug_r and col != 'label']
    else:
        zero_col = [col for col in zero_col if col not in drug_r and col != 'label']
    gene_df = gene_df.drop(zero_col, axis=1)

    return gene_df.sample(frac=1).reset_index(drop=True), zero_col

In [138]:
# def update_df(df, drug, anti_family, zero_col):
#     # Get the list of genes for which the antibiotic is the specified drug
#     drug_r = anti_family.loc[(anti_family['antibiotic'] == drug) & (anti_family['R'] == 1), 'gene'].tolist()
#     drug_s = anti_family.loc[(anti_family['antibiotic'] == drug) & (anti_family['R'] != 1) & (anti_family['S'] == 's'), 'gene'].tolist()
#
#     # Iterate over each row
#     for index, row in df.iterrows():
#         # Iterate over each column
#         for column in df.columns:
#             gene = str(column)
#             if gene == "label":
#                 continue
#             # Check if the gene is not in the list of drug genes and label is 1 or 0.5
#             if ((gene not in drug_r) or (gene in drug_s)) and (row['label'] == 1):
#                 # Set the value to 0 if the condition is met
#                 df.loc[index, column] = 0
#
#             # Check if the gene is in the list of drug genes and label is 0
#             if (gene in drug_r) and (row['label'] == 0):
#                 df.loc[index, column] = 0
#
#             if (gene in drug_s) and (row['label'] == 0):
#                 df.loc[index, column] = 1
#
#
#
#     if len(zero_col)==0:
#         zero_col = df.columns[df.eq(0).all(axis=0)]
#     df = df.drop(zero_col, axis=1)
#     return df, zero_col


In [139]:
def Model_deep(drug_train_for_model, drug_test_for_model, epochs, batch_size):
    # Assuming your DataFrame with gene features and labels is named `oxacillin_df_for_model`
    # Split the data into training features (genes) and labels
    X_train = drug_train_for_model.drop('label', axis=1).values
    y_train = drug_train_for_model['label'].values

    # Split the data into testing features (genes) and labels
    X_test = drug_test_for_model.drop('label', axis=1).values
    y_test = drug_test_for_model['label'].values

    # Define the neural network model
    model = keras.Sequential()
    model.add(keras.layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dense(16, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))  # Sigmoid activation for probability between 0 and 1

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mse'])

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    # Evaluate the model
    loss, mse = model.evaluate(X_test, y_test)
    pred = model.predict(X_test)
    print('Loss:', loss)
    print('Mean Squared Error:', mse)
    print('len of pred:', pred.shape, 'y_test:', y_test.shape)

    # Make predictions
    return y_test, pred, model


In [140]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def Model(drug_train_for_model, drug_test_for_model):
    # Split the data into training features (genes) and labels
    X_train = drug_train_for_model.drop('label', axis=1).values
    y_train = drug_train_for_model['label'].values

    # Split the data into testing features (genes) and labels
    X_test = drug_test_for_model.drop('label', axis=1).values
    y_test = drug_test_for_model['label'].values

    # Initialize the model
    model = RandomForestRegressor(n_estimators=100, random_state=0)

    # Train the model
    model.fit(X_train, y_train)

    # Predict the resistance score
    y_pred = model.predict(X_test)

    # Compute mean squared error
    mse = mean_squared_error(y_test, y_pred)

    print('Mean Squared Error:', mse)
    print('len of pred:', y_pred.shape, 'y_test:', y_test.shape)

    # Make predictions
    return y_test, y_pred, model

In [141]:
def evaluation(y_test, predictions, drug):
    # Calculate mean squared error
    mse = mean_squared_error(y_test, predictions)

    # Calculate mean absolute error
    mae = mean_absolute_error(y_test, predictions)

    # Calculate R^2 score
    r2 = r2_score(y_test, predictions)

    
    print('Mean Squared Error:', mse)
    print('Mean Absolute Error:', mae)
    print('R^2 Score:', r2)
    return mse, mae, r2


In [142]:
def Create_df_one_gene(drug_df_for_model):
    # Extract the column names (excluding the label column)
    column_names = drug_df_for_model.columns[:-1]

    # Create a new DataFrame with zeros
    df_one_gene = pd.DataFrame(0, index=np.arange(len(column_names)), columns=column_names)

    # Set the value at the corresponding index position in each row
    for i in range(len(column_names)):
        df_one_gene.iloc[i, i] = 1

    # Display the new DataFrame
    return df_one_gene

In [204]:
# it will save the drugs with empty df_gene
problematic_drugs = []

In [212]:
def PredR_Antibiotic(drug):
    print('************************\n', drug)
    zero_lst = []
    drug_r = anti_family.loc[(anti_family['antibiotic'] == drug) & (anti_family['R'] == 1), 'gene'].tolist()
    train_df, test_df,drug_genes =   gene_per_drug(drug)
    val_train, val_test= relevant_val_df(drug, drug_genes)
    drug_train_for_model, zero_lst = create_update_gene_df(train_df, val_train, list(drug_genes), drug, zero_lst)
    drug_test_for_model, zero_lst = create_update_gene_df(test_df, val_test, list(drug_genes), drug, zero_lst)
    # Combine train and test dataframes
    combined_df = pd.concat([drug_train_for_model, drug_test_for_model], ignore_index=True)

    y_test, predictions, model = Model_deep(drug_train_for_model, drug_test_for_model, 40, 10)
    mse, mae, r2 = evaluation(y_test, predictions, drug)
    df_gene = Create_df_one_gene(combined_df)
    if df_gene.shape[1]>1:
     pred = model.predict(df_gene)
    else:
        problematic_drugs.append(drug)

    for col in drug_genes:
        if (df_gene.shape[1]>1) and (col in drug_r):
         gene_antibiotic_df.loc[col, drug] = pred[df_gene.columns.get_loc(col)]
        else:
          gene_antibiotic_df.loc[col, drug] = 0.0

In [2]:
# print('************************\n', 'amoxicillin-clavulanic acid')
# drug = 'amoxicillin-clavulanic acid'
# drug_s = anti_family.loc[(anti_family['antibiotic'] == drug) & (anti_family['R'] != 1) & (anti_family['S'] == 'S'), 'gene'].tolist()
# drug_r = anti_family.loc[(anti_family['antibiotic'] == drug) & (anti_family['R'] == 1), 'gene'].tolist()
# zero_lst = []
# train_df, test_df,drug_genes =   gene_per_drug(drug)
# val_train, val_test= relevant_val_df(drug, drug_genes)
# drug_train_for_model, zero_lst= create_update_gene_df(train_df, val_train, list(drug_genes), drug, zero_lst)
# drug_test_for_model, zero_lst= create_update_gene_df(test_df, val_test, list(drug_genes), drug, zero_lst)
#
# combined_df = pd.concat([drug_train_for_model, drug_test_for_model], ignore_index=True)
#
# y_test, predictions, model = Model_deep(drug_train_for_model, drug_test_for_model,30,10)
# mse, mae, r2 = evaluation(y_test, predictions, drug)
# df_gene = Create_df_one_gene(combined_df)
# if df_gene.shape[1]>1:
#  pred = model.predict(df_gene)
#
# for col in drug_genes:
#     if ( df_gene.shape[1] >1) and (col in drug_r):
#      gene_antibiotic_df.loc[col, drug] = pred[df_gene.columns.get_loc(col)]
#     else:
#      gene_antibiotic_df.loc[col, drug] = 0.0

In [3]:
# print('************************\n', 'ciprofloxacin')
# drug = 'ciprofloxacin'
# drug_s = anti_family.loc[(anti_family['antibiotic'] == drug) & (anti_family['R'] != 1) & (anti_family['S'] == 'S'), 'gene'].tolist()
# drug_r = anti_family.loc[(anti_family['antibiotic'] == drug) & (anti_family['R'] == 1), 'gene'].tolist()
# zero_lst = []
# train_df, test_df,drug_genes =   gene_per_drug(drug)
# val_train, val_test= relevant_val_df(drug, drug_genes)
# drug_train_for_model, zero_lst, r1 = create_update_gene_df(train_df, val_train, list(drug_genes), drug, zero_lst)
# drug_test_for_model, zero_lst, r2 = create_update_gene_df(test_df, val_test, list(drug_genes), drug, zero_lst)
#
# combined_df = pd.concat([drug_train_for_model, drug_test_for_model], ignore_index=True)
#
# y_test, predictions, model = Model_deep(drug_train_for_model, drug_test_for_model,30,10)
# mse, mae, r2 = evaluation(y_test, predictions, drug)
# df_gene = Create_df_one_gene(combined_df)
# if len(drug_r) !=0:
#  pred = model.predict(df_gene)
#
# for col in drug_genes:
#     if (len(drug_r) !=0) and (col in drug_r):
#      gene_antibiotic_df.loc[col, drug] = pred[df_gene.columns.get_loc(col)]
#     else:
#      gene_antibiotic_df.loc[col, drug] = 0.0

In [213]:
antibiotic_few_samples = []
antibiotic_done = []
for antibiotic in unique_all_antibiotics:
    if len(df[df['drug'] == antibiotic]) < 50:
        antibiotic_few_samples.append(antibiotic)
        continue   
    PredR_Antibiotic(antibiotic)
    antibiotic_done.append(antibiotic)
gene_antibiotic_df.to_csv("Gene_Antibiotic_17/07.csv", index=True, index_label=False)

************************
 chloramphenicol
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Loss: 0.08215504884719849
Mean Squared Error: 0.013582312501966953
len of pred: (2467, 1) y_test: (2467,)
Mean Squared Error: 0.013582314756362412
Mean Absolute Error: 0.03435232680642779
R^2 Score: 0.7973743101450266
************************
 clindamycin
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
