Compute the predicted rating matrix. This matrix will be used to make the predictions.

In [None]:
import numpy as np
import pandas as pd

# Build the full matrix of predicted ratings from the trained model parameters
def compute_predicted_rating_matrix(P, Q, T, D, item_biases, user_biases, global_bias, user_features):
    """Return the predicted rating for every row of ``P`` against every column of ``Q``.

    The result is ``np.dot(P + user_features @ T, Q)`` to which the following
    terms are added, in this order: ``item_biases``, ``user_biases`` as a
    column vector, ``global_bias``, and ``user_features @ D`` as a column
    vector.

    No shapes are validated here; the arguments must be mutually compatible
    for the matrix products and the NumPy broadcasting used below.
    """
    # Feature-driven contributions: one to the latent factors, one to the bias
    feature_factors = user_features @ T
    feature_offsets = user_features @ D

    # Accumulate the terms one at a time, keeping the summation order fixed
    scores = np.dot(P + feature_factors, Q)
    scores = scores + item_biases
    scores = scores + user_biases[:, np.newaxis]
    scores = scores + global_bias
    scores = scores + feature_offsets[:, np.newaxis]

    return scores

# Example usage:
# Assumes P, Q, T, D, item_biases, user_biases, global_bias, and user_features
# were produced by earlier (training) cells and are still in the kernel.
predicted_rating_matrix = compute_predicted_rating_matrix(P, Q, T, D, item_biases, user_biases, global_bias, user_features)

# Convert the matrix to a DataFrame for better visualization.
# The positional labels are derived from the matrix shape instead of a
# hard-coded protein count, so the cell keeps working when the number of
# rows or columns changes.
molecules_indices = range(predicted_rating_matrix.shape[0])
proteins_indices = range(predicted_rating_matrix.shape[1])
predicted_rating_df = pd.DataFrame(predicted_rating_matrix, index=molecules_indices, columns=proteins_indices)

# Show the DataFrame
display(predicted_rating_df)
# TODO use dataframe to lookup pairs of the test.csv => first need to add column and row identifiers

There are two groups of molecules whose activity for a specific protein we want to predict: molecules that were already present in the training data, and molecules that were never seen in the training data and are therefore not yet in the "system" (new users).

Predicting the activity of a molecule that is already in the "system" for a given protein is easy: we simply look up the value in our predicted rating matrix.

In [None]:
# Pivot long-format activity records into a molecule x protein table
def create_interaction_matrix(df, molecule_col, protein_col, activity_col):
    """Return a table with one row per molecule and one column per protein.

    Cells hold the mean of ``activity_col`` over all records of that
    (molecule, protein) pair; pairs without any record are filled with 0.
    """
    return df.pivot_table(
        values=activity_col,
        index=molecule_col,
        columns=protein_col,
        aggfunc='mean',
    ).fillna(0)


# Recompute the predicted rating matrix, passing molecule_features as the
# per-row feature matrix
predicted_rating_matrix = compute_predicted_rating_matrix(
    P, Q, T, D, item_biases, user_biases, global_bias, molecule_features
)

# Pivot the activity data; only its row and column labels are used below
interactions = create_interaction_matrix(
    df_activity,
    molecule_col='ChEMBL_IDs',
    protein_col='Uniprot_IDs',
    activity_col='Activity',
)

# Row labels are molecule IDs, column labels are protein IDs
molecule_ids = interactions.index
protein_ids = interactions.columns

# Attach those labels to the predictions.
# NOTE(review): this assumes the rows/columns of predicted_rating_matrix are
# in the same order as the pivot's index/columns — confirm against the
# training code, otherwise predictions are attached to the wrong IDs.
predicted_rating_df = pd.DataFrame(
    data=predicted_rating_matrix,
    index=molecule_ids,
    columns=protein_ids,
)

# Show the labelled predictions in the notebook
display(predicted_rating_df)

In [None]:
def get_predicted_rating(uniprot_id, chembl_id):
    """Look up the predicted rating of a (protein, molecule) pair.

    Reads the module-level ``predicted_rating_df``, whose rows are addressed
    by molecule ID and whose columns are addressed by protein ID. Returns
    ``None`` when either label is not present.
    """
    if chembl_id not in predicted_rating_df.index:
        return None  # unknown molecule: no prediction available
    if uniprot_id not in predicted_rating_df.columns:
        return None  # unknown protein: no prediction available
    return predicted_rating_df.at[chembl_id, uniprot_id]

# Overwrite the 'Activity' column of df_activity_validation in place: each row
# receives the value get_predicted_rating returns for its
# (Uniprot_IDs, ChEMBL_IDs) pair, which is None when the pair cannot be
# looked up.
df_activity_validation['Activity'] = df_activity_validation.apply(
    lambda row: get_predicted_rating(row['Uniprot_IDs'], row['ChEMBL_IDs']), axis=1
)

# Show the updated df_activity_validation as the cell output
df_activity_validation

After predicting these activities, we round each value to the nearest integer and clip negative results to 0; pairs without a prediction stay empty (NaN).

In [None]:
# Round every predicted activity with Python's built-in round() and clip
# negative results to 0; missing predictions are kept as NaN so they can be
# filled in by a later step. The column is overwritten in place.
df_activity_validation['Activity'] = df_activity_validation['Activity'].apply(lambda x: np.nan if pd.isna(x) else max(0, round(x)))

# Show the updated DataFrame as the cell output
df_activity_validation

The rows whose activity is still empty were saved to a CSV file (`df_empty_activity.csv`); in the next step we read that file back in.

In [None]:
# Rows that still have no predicted activity.
# NOTE(review): no visible cell writes 'df_empty_activity.csv'; the file must
# already exist on disk — presumably exported from the df_activity_validation
# rows with a missing 'Activity'. Confirm, and add the export if it is missing.
df_empty_activity  = pd.read_csv('df_empty_activity.csv')

# The activity files are read without a header row, so the column names are
# supplied explicitly. Each file is read exactly once.
column_headers = ['Uniprot_IDs', 'ChEMBL_IDs', 'Activity']
activity_train = pd.read_csv('activity_train.csv', names=column_headers)
activity_test_blanked = pd.read_csv('activity_test_blanked.csv', names=column_headers)

# SECURITY: unpickling can execute arbitrary code; only load 'mol_bits.pkl'
# when it comes from a trusted source.
mol_bits = pd.read_pickle('mol_bits.pkl')

def create_sparse_matrix(data):
    """Build a molecule-by-feature CSR matrix from per-molecule feature indices.

    Parameters
    ----------
    data : dict
        Maps each molecule ID to a sized collection of integer feature
        (column) indices that are set for that molecule. A molecule may have
        no set features at all.

    Returns
    -------
    sparse_matrix : scipy.sparse.csr_matrix
        One row per key of ``data`` (in the dict's iteration order) and
        ``largest index + 1`` columns, with a 1 stored for every listed index.
    ChEMBL_IDs : list
        The keys of ``data``, in row order.
    """
    # Imported locally so the function does not depend on an earlier cell
    # having brought csr_matrix into the kernel namespace.
    from scipy.sparse import csr_matrix

    ChEMBL_IDs = list(data.keys())
    structural_features = list(data.values())

    # Number of columns = largest index seen + 1. Molecules without any set
    # feature are skipped here (max() of an empty collection would raise), and
    # default=-1 yields 0 columns when no molecule has any feature.
    num_features = max(
        (max(indices) for indices in structural_features if len(indices) > 0),
        default=-1,
    ) + 1

    # Coordinate lists for the non-zero entries
    rows = []
    cols = []
    for row, indices in enumerate(structural_features):
        rows.extend([row] * len(indices))
        cols.extend(indices)
    matrix_data = [1] * len(cols)

    # Create the sparse matrix
    sparse_matrix = csr_matrix((matrix_data, (rows, cols)), shape=(len(ChEMBL_IDs), num_features))

    return sparse_matrix, ChEMBL_IDs

# Build the sparse fingerprint matrix and the matching list of molecule IDs
sparse_matrix, ChEMBL_IDs = create_sparse_matrix(mol_bits)
# Densify so the matrix can be wrapped in a DataFrame.
# NOTE(review): the dense copy may be large — confirm it fits in memory.
dense_matrix_full = sparse_matrix.toarray()
# Bare expression: only rendered if it is the last statement of its cell
dense_matrix_full

# One row per molecule, labelled by its ID; one column per feature index
df_sparse_matrix = pd.DataFrame(dense_matrix_full, index=ChEMBL_IDs)

def find_most_similar_molecules_above_threshold(df_empty_activity, df_sparse_matrix, threshold=0.8):
    """List, for every queried molecule, the molecules whose Jaccard similarity reaches ``threshold``.

    Parameters
    ----------
    df_empty_activity : pandas.DataFrame
        Must contain a 'ChEMBL_IDs' column; one result row is produced per
        row of this frame (repeated IDs produce repeated result rows).
    df_sparse_matrix : pandas.DataFrame
        Binary fingerprint table indexed by molecule ID (one row per
        molecule, one column per feature). Any non-zero entry is treated as
        a set bit.
    threshold : float, default 0.8
        Minimum Jaccard similarity (inclusive) for a molecule to be listed.

    Returns
    -------
    pandas.DataFrame
        Columns 'Original Molecule' and 'Similar Molecules Above Threshold'.
        The latter holds a list of dicts with the keys 'Most Similar Molecule'
        and 'Jaccard Similarity', in the row order of ``df_sparse_matrix``.
        The queried molecule itself is never listed. A molecule that has no
        fingerprint gets the single placeholder entry
        ``{"Most Similar Molecule": "NA", "Jaccard Similarity": 0}``.
    """
    fingerprint_ids = df_sparse_matrix.index
    bits = df_sparse_matrix.to_numpy().astype(bool)
    bits_per_molecule = bits.sum(axis=1)
    row_of = {mol_id: pos for pos, mol_id in enumerate(fingerprint_ids)}

    # The similarity list depends only on the molecule, so compute it once
    # per distinct ID even when the ID occurs in several rows.
    cache = {}
    results = []
    for chemble_id in df_empty_activity['ChEMBL_IDs']:
        similar_molecules = cache.get(chemble_id)
        if similar_molecules is None:
            query_pos = row_of.get(chemble_id)
            if query_pos is None:
                # No fingerprint available for this molecule
                similar_molecules = [{
                    "Most Similar Molecule": "NA",
                    "Jaccard Similarity": 0
                }]
            else:
                # |A & B| only needs the columns where the query has a bit set;
                # |A | B| follows from |A| + |B| - |A & B|.
                query_cols = np.flatnonzero(bits[query_pos])
                intersection = bits[:, query_cols].sum(axis=1)
                union = bits_per_molecule + query_cols.size - intersection
                # Similarity is defined as 0 when both fingerprints are empty
                scores = np.divide(
                    intersection,
                    union,
                    out=np.zeros(len(union), dtype=float),
                    where=union > 0,
                )
                keep = scores >= threshold
                keep[query_pos] = False  # never compare a molecule to itself
                similar_molecules = [
                    {
                        "Most Similar Molecule": fingerprint_ids[pos],
                        "Jaccard Similarity": scores[pos]
                    }
                    for pos in np.flatnonzero(keep)
                ]
            cache[chemble_id] = similar_molecules

        results.append({
            "Original Molecule": chemble_id,
            # Copy so that repeated IDs do not share one list object
            "Similar Molecules Above Threshold": list(similar_molecules)
        })

    return pd.DataFrame(results)

# Run the similarity search for the molecules without a prediction, using a
# threshold of 0.6 instead of the function's default of 0.8
result_df = find_most_similar_molecules_above_threshold(df_empty_activity, df_sparse_matrix, threshold=0.6)

# Serialise one 'Similar Molecules Above Threshold' entry to a structured string
def format_similar_molecules(similar_molecules):
    """Encode a list of similarity dicts as ``"<id>:<score>; <id>:<score>; ..."``.

    Anything that is not a non-empty list is encoded as ``"NA"``.
    """
    if not isinstance(similar_molecules, list) or not similar_molecules:
        return "NA"

    encoded = []
    for entry in similar_molecules:
        encoded.append(f"{entry['Most Similar Molecule']}:{entry['Jaccard Similarity']}")
    return "; ".join(encoded)

# Replace the list column by its string encoding, in place.
# WARNING: not idempotent — running this a second time feeds the strings back
# into format_similar_molecules, which returns "NA" for anything that is not a
# list, so every entry would be overwritten with "NA".
result_df['Similar Molecules Above Threshold'] = result_df['Similar Molecules Above Threshold'].apply(format_similar_molecules)

# Save to CSV (without the row index)
result_df.to_csv('most_similar_molecules.csv', index=False)

Predicting for the "new users" is harder. Our solution is to compute the Jaccard similarity between each of these remaining molecules and all other molecules, which is feasible because there are not many of them (1250 molecules in total). The pickle file contains the fingerprints of the molecules, which lets us perform this similarity calculation. After running that code, we save the result to a CSV for easier reuse. This CSV lists every molecule that has not been seen before together with its most similar molecules, i.e. those whose Jaccard similarity is at least the threshold of 0.6.

In [None]:
# Decode the structured string stored in the CSV back into a list of dicts
def parse_similar_molecules(formatted_str):
    """Parse ``"<id>:<score>; <id>:<score>; ..."`` into a list of dicts.

    A missing value or the literal ``"NA"`` yields an empty list. Each dict
    has the keys 'Most Similar Molecule' (str) and 'Jaccard Similarity'
    (float).
    """
    if pd.isna(formatted_str) or formatted_str == "NA":
        return []

    parsed = []
    for chunk in formatted_str.split("; "):
        fields = chunk.split(":")
        parsed.append({"Most Similar Molecule": fields[0], "Jaccard Similarity": float(fields[1])})
    return parsed

# Load a saved similarity table and decode its list column
def read_results_from_csv(file_path):
    """Read the CSV at ``file_path`` and parse its similarity column.

    The 'Similar Molecules Above Threshold' column is converted from its
    string encoding to lists of dicts with ``parse_similar_molecules``; all
    other columns are returned as read.
    """
    column = 'Similar Molecules Above Threshold'
    loaded = pd.read_csv(file_path)
    loaded[column] = loaded[column].apply(parse_similar_molecules)
    return loaded

# Read the saved similarity table back in. With pandas' default settings a
# bare "NA" cell is loaded as a missing value, which parse_similar_molecules
# turns into an empty list.
similar_molecules_df = read_results_from_csv('most_similar_molecules.csv')

In [None]:
# Display the parsed similarity table as the cell output
similar_molecules_df

Lastly, for each "new" molecule we take its similar molecules and average their predicted values for the protein in question; this average is used as the prediction.

In [None]:
# Function to predict the value for a given molecule and protein using similar molecules
def predict_value(molecule, protein, similar_molecules_df, predicted_rating_df):
    """Predict an activity as the rounded mean rating of the molecule's similar molecules.

    Parameters
    ----------
    molecule : hashable
        ID looked up in the 'Original Molecule' column of
        ``similar_molecules_df``.
    protein : hashable
        Column label of ``predicted_rating_df`` to predict for.
    similar_molecules_df : pandas.DataFrame
        Must have the columns 'Original Molecule' and
        'Similar Molecules Above Threshold'; the latter holds lists of dicts
        with a 'Most Similar Molecule' key. Only the first row matching
        ``molecule`` is used.
    predicted_rating_df : pandas.DataFrame
        Predicted ratings, rows labelled by molecule ID and columns by
        protein ID.

    Returns
    -------
    int or None
        The mean of the available ratings, rounded to an integer and clipped
        to be at least 0 (the same rule the notebook applies to directly
        looked-up predictions). ``None`` when the molecule has no entry, has
        no similar molecules, the protein is unknown, or none of the similar
        molecules has a usable rating.
    """
    similar_molecules_entry = similar_molecules_df[similar_molecules_df['Original Molecule'] == molecule]
    if similar_molecules_entry.empty:
        return None  # Return None if no similar molecules are found

    similar_molecules = similar_molecules_entry.iloc[0]['Similar Molecules Above Threshold']
    if not similar_molecules:
        return None  # Return None if the similar molecules list is empty

    # The protein does not change inside the loop, so check it only once
    if protein not in predicted_rating_df.columns:
        return None

    # Collect the ratings of the similar molecules for this protein
    values = []
    for similar in similar_molecules:
        similar_molecule = similar['Most Similar Molecule']
        if similar_molecule in predicted_rating_df.index:
            value = predicted_rating_df.loc[similar_molecule, protein]
            # A missing rating would turn the mean into NaN and make round() fail
            if not pd.isna(value):
                values.append(value)

    if not values:
        return None  # Return None if no values are found

    # Average, round to an integer and never return a negative activity
    return max(0, round(float(np.mean(values))))

# Select the validation rows that still lack an activity and whose molecule
# has an entry in the similar-molecules table
missing_activity_mask = df_activity_validation['Activity'].isna()
original_molecules = similar_molecules_df['Original Molecule'].unique()
mask = missing_activity_mask & df_activity_validation['ChEMBL_IDs'].isin(original_molecules)

# Keep only those rows
missing_activity_rows = df_activity_validation[mask]

# Fill each of those rows from the average over its similar molecules; rows
# for which no prediction can be made are left untouched
for index, row in missing_activity_rows.iterrows():
    molecule = row['ChEMBL_IDs']
    if molecule not in original_molecules:
        continue  # molecule has no entry in the similar-molecules table
    protein = row['Uniprot_IDs']
    predicted_activity = predict_value(molecule, protein, similar_molecules_df, predicted_rating_df)
    if predicted_activity is None:
        continue  # nothing to write for this pair
    df_activity_validation.at[index, 'Activity'] = predicted_activity

In [None]:
# Show the rows that still have no activity after both prediction steps
df_activity_validation[df_activity_validation["Activity"].isna()]

In [None]:
# Write the completed validation table to disk (without the row index)
df_activity_validation.to_csv('updated_activity_validation.csv', index=False)