In [None]:
import os
import random
import subprocess
import warnings

# Report the active conda environment. `.get()` keeps the notebook usable from a
# kernel that was not started by conda (plain virtualenv, CI), where
# CONDA_DEFAULT_ENV is not defined and a direct lookup would raise KeyError.
print('Current conda environment:', os.environ.get('CONDA_DEFAULT_ENV', '<not set>'))
# Set before the model libraries are imported in the next cell.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

cwd = os.getcwd()
print(cwd)

# NOTE(review): this silences *all* warnings for the rest of the session,
# including pandas chained-assignment and library deprecation warnings.
warnings.filterwarnings('ignore')

# Seeds Python's `random` module only.
# NOTE(review): numpy / torch generators are not seeded here — confirm whether
# the sampling code relies on them.
random.seed(42)

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc = {'figure.figsize':(15,8)})

from crem.crem import grow_mol, mutate_mol
crem_db = '../crem_db/crem_db2.5.db'

import mols2grid

from rdkit import Chem
from rdkit.Chem import AllChem, rdFingerprintGenerator, CanonSmiles, Draw, MolFromSmiles, PandasTools
from rdkit.Chem.rdmolops import RDKFingerprint
from rdkit.Chem.Draw import MolsToGridImage
from rdkit import DataStructs
from rdkit.Chem.rdFMCS import FindMCS
from rdkit.DataStructs.cDataStructs import BulkTanimotoSimilarity
import useful_rdkit_utils as uru

import prolif as plf

import safe as sf
import datamol as dm

import mols2grid

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, classification_report
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import torch

from coati.generative.coati_purifications import embed_smiles
from coati.models.io.coati import load_e3gnn_smiles_clip_e2e
from coati.models.simple_coati2.io import load_coati2

In [None]:
pdb = '2zdt'

In [None]:
initial_mol = Chem.MolFromMolFile(f"data/docking/{pdb}_ligand.sdf")
initial = Chem.MolToSmiles(initial_mol)

MolsToGridImage([Chem.MolFromSmiles(initial)], subImgSize=(600, 600))

In [None]:
initial

### Interaction fingerprint for reference molecule

In [None]:
REF_MOL_FILEPATH = f"data/docking/{pdb}_ligand.sdf"
PDB_FILEPATH = f"data/docking/{pdb}.pdb"

fp = plf.Fingerprint()

mol = Chem.MolFromPDBFile(PDB_FILEPATH, removeHs=False)
prot = plf.Molecule(mol)
suppl = plf.sdf_supplier(REF_MOL_FILEPATH)
fp.run_from_iterable(suppl, prot, progress=True)
df_ifp = fp.to_dataframe()
df_ifp.columns = df_ifp.columns.droplevel(0)

In [None]:
df_ifp

In [None]:
def ifp_similarity(ref_mol_ifp, df_ifp, df):
    """Score how many interactions each molecule shares with the reference ligand.

    For every row of ``df_ifp`` and every column that also exists in
    ``ref_mol_ifp``, the cell is compared with the first row of the reference
    fingerprint. Equal cells are counted, and weighted by interaction type:
    VdWContact = 1, Hydrophobic = 2, HBAcceptor = 3, Anionic / Cationic = 4.
    Columns whose name contains none of these types are ignored.

    Parameters
    ----------
    ref_mol_ifp : pandas.DataFrame
        Fingerprint of the reference ligand; only its first row is used.
    df_ifp : pandas.DataFrame
        Fingerprints of the molecules to score, one row per molecule.
    df : pandas.DataFrame
        Frame that receives the two result columns; must have as many rows
        as ``df_ifp``. It is modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns ``'IFP Intersection'`` and
        ``'Weighted IFP Intersection'`` added.

    Raises
    ------
    ValueError
        If ``ref_mol_ifp`` has no rows.

    Notes
    -----
    Column labels may be ``(residue, interaction)`` tuples or already flattened
    ``"residue interaction"`` strings. They are flattened on local copies only:
    the column indexes of ``ref_mol_ifp`` and ``df_ifp`` are left untouched, so
    code that still expects tuple labels keeps working after this call.

    NOTE(review): the comparison is equality, so a cell that is absent in both
    fingerprints would also count if the reference frame contained False
    entries — confirm the reference only holds observed interactions.
    """
    # (substring, weight) pairs, tested in this order; the first hit wins.
    # Anionic and Cationic are separate entries so each one is subject to the
    # same "present in reference and equal" test as the other types (the
    # previous `... and 'Anionic' in name or 'Cationic' in name` counted every
    # Cationic column unconditionally because `and` binds tighter than `or`).
    interaction_weights = (
        ('VdWContact', 1),
        ('Hydrophobic', 2),
        ('HBAcceptor', 3),
        ('Anionic', 4),
        ('Cationic', 4),
    )

    def _flat_names(columns):
        # (residue, interaction) -> "residue interaction"; other labels pass through.
        return [' '.join(col) if isinstance(col, tuple) else col for col in columns]

    if len(ref_mol_ifp) == 0:
        raise ValueError('ref_mol_ifp must contain the reference fingerprint row')

    reference = dict(zip(_flat_names(ref_mol_ifp.columns), ref_mol_ifp.iloc[0]))

    # Resolve once which candidate columns can contribute, with their reference
    # value and weight, instead of repeating the lookups for every molecule.
    scored_columns = []
    for position, name in enumerate(_flat_names(df_ifp.columns)):
        if name not in reference:
            continue
        weight = next((w for key, w in interaction_weights if key in name), None)
        if weight is not None:
            scored_columns.append((position, reference[name], weight))

    intersections = []
    weighted_intersections = []
    for row in df_ifp.itertuples(index=False, name=None):
        count = 0
        weighted_count = 0
        for position, ref_value, weight in scored_columns:
            if row[position] == ref_value:
                count += 1
                weighted_count += weight
        intersections.append(count)
        weighted_intersections.append(weighted_count)

    df['IFP Intersection'] = intersections
    df['Weighted IFP Intersection'] = weighted_intersections

    return df


In [None]:
def compute_features(df, ifp):
    """Add per-molecule protein-interaction counts to a copy of ``df``.

    Each row of ``ifp`` is the interaction fingerprint of the molecule with the
    same index label in ``df``. Cells are summed per interaction type, and a
    weighted total is built with VdWContact = 1, Hydrophobic = 2,
    HBAcceptor = 3 and Anionic / Cationic = 4. Columns of any other
    interaction type are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per molecule. Not modified.
    ifp : pandas.DataFrame
        Fingerprint table whose column labels are ``(residue, interaction)``
        tuples, or flattened ``"residue interaction"`` strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the columns ``num_interactions``,
        ``weighted_interactions``, ``num_VdW``, ``num_hydrophobic``,
        ``num_HBAcceptor`` and ``num_ionic``. Rows of ``df`` without a matching
        label in ``ifp`` keep ``pd.NA``; rows of ``ifp`` without a matching
        label in ``df`` are skipped.
    """
    # interaction type -> (count column it feeds, weight in the weighted total)
    interaction_kinds = {
        'VdWContact': ('num_VdW', 1),
        'Hydrophobic': ('num_hydrophobic', 2),
        'HBAcceptor': ('num_HBAcceptor', 3),
        'Anionic': ('num_ionic', 4),
        'Cationic': ('num_ionic', 4),
    }
    count_columns = ['num_VdW', 'num_hydrophobic', 'num_HBAcceptor', 'num_ionic']
    new_columns = ['num_interactions', 'weighted_interactions'] + count_columns

    # `assign` returns a new frame, so the caller's `df` is left untouched.
    df = df.assign(**{col: pd.NA for col in new_columns})

    def _interaction_type(col):
        # Tuple labels carry the type in position 1; flattened labels
        # ("residue interaction") carry it as the last space-separated token.
        if isinstance(col, tuple):
            return col[1]
        return str(col).rsplit(' ', 1)[-1]

    column_kinds = [_interaction_type(col) for col in ifp.columns]

    for index, row in ifp.iterrows():
        if index not in df.index:
            # Writing through .loc would append a new row; skip instead.
            continue

        counts = dict.fromkeys(count_columns, 0)
        weighted_interactions = 0
        for kind, value in zip(column_kinds, row):
            if kind not in interaction_kinds:
                continue
            target, weight = interaction_kinds[kind]
            counts[target] += 1 * value
            weighted_interactions += weight * value

        # Single-step .loc writes: `df[col][index] = ...` is chained assignment
        # and does not reach the frame when pandas copy-on-write is active.
        df.loc[index, 'weighted_interactions'] = weighted_interactions
        for target in count_columns:
            df.loc[index, target] = counts[target]
        # Computed after the column loop so it is defined even when `ifp`
        # has no columns.
        df.loc[index, 'num_interactions'] = sum(counts.values())

    return df

    ### OLD CODE IN CASE FUNCTION DOESN'T WORK
    
    # data['weighted_interactions'].append(weighted_interactions)
    # data['num_VdW'].append(num_VdW)
    # data['num_hydrophobic'].append(num_hydrophobic)
    # data['num_HBAcceptor'].append(num_HBAcceptor)
    # data['num_ionic'].append(num_ionic) 

    # features = pd.DataFrame(data)
    
    # df = df.append(features[['mol_id', 'num_interactions', 'weighted_interactions', 'num_VdW', 'num_hydrophobic', 'num_HBAcceptor', 'num_ionic']], left_on='ID', right_on='mol_id', how='left')

    # df = df.drop(['mol_id'], axis=1).sort_values(['Docking score'], ascending=True)

    # df.dropna(axis=0, subset=['Docking score'], inplace=True)
    # df['num_interactions'].fillna(0, inplace=True)
    # df['weighted_interactions'].fillna(0, inplace=True)

In [None]:
def visualize_fingerprint(ifp):
    """Draw ``ifp`` as a seaborn heatmap and return the resulting Axes.

    Rows are labelled "Molecule" and columns "Protein Interaction". The global
    seaborn figure size is set to 15 x 8 as a side effect.
    """
    figure_settings = {'figure.figsize': (15, 8)}
    sns.set(rc=figure_settings)

    heatmap_ax = sns.heatmap(ifp, cmap=sns.cm.rocket_r)
    heatmap_ax.set_xlabel("Protein Interaction")
    heatmap_ax.set_ylabel("Molecule")
    return heatmap_ax

### Reinvent

In [None]:
model = 'reinvent'

# Command line forwarded to generate_analogs.py
args = ['python3', 'generate_analogs.py',
        '--model', model,
        '--sample', '200',
        '--dock',
        '--pdb', pdb]

# The script is run from the parent directory. Passing cwd='..' replaces the
# `%cd ..` / `%cd experiments` pair, so the kernel's own working directory never
# changes and cannot be left in the parent folder if the run is interrupted.
result = subprocess.run(args,
                        cwd='..',
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT,
                        text=True)

# Output stays hidden on success, but a failed run has to stop the notebook:
# otherwise the next cell would silently load CSV files from an earlier run.
if result.returncode != 0:
    raise RuntimeError(f'generate_analogs.py failed for model {model!r} '
                       f'(exit code {result.returncode}):\n{result.stdout[-2000:]}')

In [None]:
model = 'reinvent'

DF_FILEPATH = f'data/{model}_dataframe.csv'
IFP_FILEPATH = f'data/{model}_ifp.csv'

df_reinvent = pd.read_csv(DF_FILEPATH, index_col=0)

ifp_reinvent = pd.read_csv(IFP_FILEPATH, header=[0, 1], index_col=0)

Compute metrics

In [None]:
#Compute length to check that no molecules are being filtered by metric computation
len(df_reinvent)

In [None]:
# Per-molecule interaction counts (number of IMFs)
df_reinvent = compute_features(df_reinvent, ifp_reinvent)
# Compare IMFs to initial fragment
df_reinvent = ifp_similarity(df_ifp, ifp_reinvent, df_reinvent)

# errors='ignore' keeps the cell re-runnable: on a second execution these
# columns are already gone and a plain drop would raise KeyError.
df_reinvent = df_reinvent.drop(columns=['Input_SMILES', 'Prior', 'Tanimoto'], errors='ignore')
# Literal label instead of the global `model`, which later cells reassign.
# NOTE(review): the crem / coati / safe cells never set 'Model' themselves —
# presumably their CSV files already contain it; confirm.
df_reinvent['Model'] = 'reinvent'


In [None]:
#Check that number of rows matches and visualize data_frame
df_reinvent

### CReM

In [None]:
model = 'crem'

# Command line forwarded to generate_analogs.py
args = ['python3', 'generate_analogs.py',
        '--model', model,
        '--sample', '200',
        '--dock',
        '--pdb', pdb]

# The script is run from the parent directory. Passing cwd='..' replaces the
# `%cd ..` / `%cd experiments` pair, so the kernel's own working directory never
# changes and cannot be left in the parent folder if the run is interrupted.
result = subprocess.run(args,
                        cwd='..',
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT,
                        text=True)

# Output stays hidden on success, but a failed run has to stop the notebook:
# otherwise the next cell would silently load CSV files from an earlier run.
if result.returncode != 0:
    raise RuntimeError(f'generate_analogs.py failed for model {model!r} '
                       f'(exit code {result.returncode}):\n{result.stdout[-2000:]}')

In [None]:
model = 'crem'

DF_FILEPATH = f'data/{model}_dataframe.csv'
IFP_FILEPATH = f'data/{model}_ifp.csv'

df_crem = pd.read_csv(DF_FILEPATH, index_col=0)

ifp_crem = pd.read_csv(IFP_FILEPATH, header=[0, 1], index_col=0)

In [None]:
len(ifp_crem)

In [None]:
# Compute length to check that no molecules are being filtered by metric computation
# (this is the CReM section, so the CReM frame is the one to inspect)
len(df_crem)

Compute metrics

In [None]:
#number of IMFs
df_crem = compute_features(df_crem, ifp_crem)
len(df_crem)

In [None]:
# Compare IMFs to initial fragment
df_crem = ifp_similarity(df_ifp, ifp_crem, df_crem)

In [None]:
df_crem

In [None]:
model_df = pd.concat((df_reinvent, df_crem))

### Coati

In [None]:
model = 'coati'

# Command line forwarded to generate_analogs.py
args = ['python3', 'generate_analogs.py',
        '--model', model,
        '--sample', '200',
        '--dock',
        '--pdb', pdb]

# The script is run from the parent directory. Passing cwd='..' replaces the
# `%cd ..` / `%cd experiments` pair, so the kernel's own working directory never
# changes and cannot be left in the parent folder if the run is interrupted.
result = subprocess.run(args,
                        cwd='..',
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT,
                        text=True)

# Output stays hidden on success, but a failed run has to stop the notebook:
# otherwise the next cell would silently load CSV files from an earlier run.
if result.returncode != 0:
    raise RuntimeError(f'generate_analogs.py failed for model {model!r} '
                       f'(exit code {result.returncode}):\n{result.stdout[-2000:]}')

In [None]:
model = 'coati'

DF_FILEPATH = f'data/{model}_dataframe.csv'
IFP_FILEPATH = f'data/{model}_ifp.csv'

df_coati = pd.read_csv(DF_FILEPATH, index_col=0)

ifp_coati = pd.read_csv(IFP_FILEPATH, header=[0, 1], index_col=0)

In [None]:
ifp_coati

In [None]:
df_coati = compute_features(df_coati, ifp_coati)

In [None]:
df_coati = ifp_similarity(df_ifp, ifp_coati, df_coati)

In [None]:
model_df = pd.concat((model_df, df_coati))

### SAFE

In [None]:
model = 'safe'

# Command line forwarded to generate_analogs.py
args = ['python3', 'generate_analogs.py',
        '--model', model,
        '--sample', '200',
        '--dock',
        '--pdb', pdb]

# The script is run from the parent directory. Passing cwd='..' replaces the
# `%cd ..` / `%cd experiments` pair, so the kernel's own working directory never
# changes and cannot be left in the parent folder if the run is interrupted.
result = subprocess.run(args,
                        cwd='..',
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT,
                        text=True)

# Output stays hidden on success, but a failed run has to stop the notebook:
# otherwise the next cell would silently load CSV files from an earlier run.
if result.returncode != 0:
    raise RuntimeError(f'generate_analogs.py failed for model {model!r} '
                       f'(exit code {result.returncode}):\n{result.stdout[-2000:]}')

In [None]:
model = 'safe'

DF_FILEPATH = f'data/{model}_dataframe.csv'
IFP_FILEPATH = f'data/{model}_ifp.csv'

df_safe = pd.read_csv(DF_FILEPATH, index_col=0)

ifp_safe = pd.read_csv(IFP_FILEPATH, header=[0, 1], index_col=0)

In [None]:
df_safe = compute_features(df_safe, ifp_safe)

In [None]:
df_safe = ifp_similarity(df_ifp, ifp_safe, df_safe)

In [None]:
# Build the combined table in one step from the four per-model frames. Unlike
# `model_df = pd.concat((model_df, df_safe))`, re-running this cell cannot append
# the same rows twice. ignore_index gives every row a unique label; without it
# each per-model frame contributes its own index labels again.
model_df = pd.concat((df_reinvent, df_crem, df_coati, df_safe), ignore_index=True)

In [None]:
model_df.head()

In [None]:
smiles = model_df['SMILES'].to_list()

In [None]:
y = model_df['Model'].to_numpy().reshape(-1, 1)

### Evaluating Metrics w/ MolScore

In [None]:
from molscore import MolScore

In [None]:
ms = MolScore(model_name='mol2mol', task_config='molscore/feature_selection.json')
scores = ms.score(smiles)

In [None]:
# Once finished
metrics = ms.compute_metrics(
    endpoints=None, # Optional list: by default will use the running final score/reward value
    thresholds=None,  # Optional list: if specified will calculate the yield of molecules above that threshold 
    # chemistry_filters_basic=False,  # Optional, bool: Additionally re-calculate metrics after filtering out unreasonable chemistry
    budget=10000,  # Optional, int: Calculate metrics only with molecules within this budget
    n_jobs=1,  # Optional, int: Multiprocessing
    benchmark=None,  # Optional, str: Name of benchmark, this may specify additional metrics to compute
)

In [None]:
# Load a per-molecule score table from the MolScore output directory.
# NOTE(review): the run folder name is hard-coded with a date (2024_07_23);
# presumably MolScore names its output folder after the day of the run, so this
# path has to be updated whenever `ms.score(...)` is executed again — confirm.
df = pd.read_csv('molscore/2024_07_23_mol2mol_feature_selection/iterations/000001_scores.csv', index_col=0)

In [None]:
df

In [None]:
# df.drop(['desc_MolecularFormula', 'dice_Cmpd1_Sim', 'tanimoto_Cmpd1_Sim', 'desc_SAscore', 'desc_PenLogP'], axis=1, inplace=True)

In [None]:
# Columns of the score table that are not used as model features
non_feature_columns = [
    'smiles', 'model', 'task', 'step',
    'batch_idx', 'absolute_time',
    'valid', 'valid_score', 'unique',
    'occurrences', 'desc_MolecularFormula',
    'dice_Sim', 'dice_Cmpd1_Sim',
    'tanimoto_Sim', 'tanimoto_Cmpd1_Sim',
    'desc_SAscore', 'desc_PenLogP',
    'desc_MolWt', 'desc_NumHAcceptors',
    'desc_NumHDonors', 'desc_CLogP',
    'desc_TPSA', 'desc_NumRotatableBonds',
    'desc_MaxConsecutiveRotatableBonds',
    'desc_NumAromaticRings', 'desc_FlourineCount',
    'desc_FormalCharge', 'desc_RingCount',
    'desc_NumAliphaticRings', 'desc_HeavyAtomCount',
    'desc_HeavyAtomMolWt', 'amean', 'filter',
    'score_time', 'raw_valid_score',
]

# Feature matrix: every remaining column of the score table
X = df.drop(columns=non_feature_columns)

In [None]:
model_df['num_interactions'][:200].nunique(), model_df['num_interactions'][200:400].nunique(), model_df['num_interactions'][400:600].nunique(), model_df['num_interactions'][600:].nunique()

Adding new features

In [None]:
def _as_float(column):
    # The interaction columns may be object dtype with pd.NA placeholders;
    # coerce them to a float array (missing -> NaN) before doing arithmetic.
    return pd.to_numeric(model_df[column], errors='coerce').to_numpy(dtype=float)


def _safe_ratio(numerator, denominator):
    # Element-wise numerator / denominator. Where the denominator is 0 (a
    # molecule without any counted interaction) the ratio is defined as 0.0
    # instead of inf/NaN, which would break scaling and model fitting below.
    ratio = np.zeros_like(numerator)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return ratio


weighted_ifp_intersection = _as_float('Weighted IFP Intersection')
X['Weighted IFP Similarity'] = _safe_ratio(weighted_ifp_intersection, _as_float('weighted_interactions'))

# Values are copied positionally: X and model_df are assumed to hold the same
# molecules in the same order.
X['Docking score'] = model_df['Docking score'].values
X['rmsd'] = model_df['rmsd'].values
# X['num_interactions'] = model_df['num_interactions'].values
# X['weighted_interactions'] = model_df['weighted_interactions'].values
X['Interaction Weight Ratio'] = _safe_ratio(_as_float('weighted_interactions'), _as_float('num_interactions'))

In [None]:
# X['Docking score'] = model_df['Docking score'].values
# X['num_interactions'] = model_df['num_interactions'].values
# X['weighted_interactions'] = model_df['weighted_interactions'].values
# X['interaction weight ratio'] = model_df['weighted_interactions'].values / model_df['num_interactions'].values
# X['num_VdW'] = model_df['num_VdW'].values
# X['num_hydophobic'] = model_df['num_hydrophobic'].values
# X['num_HBAcceptor'] = model_df['num_HBAcceptor'].values
# X['num_ionic'] = model_df['num_ionic'].values
# X['CLogP * num_interactions'] = df['desc_CLogP'] * X['num_interactions']
# X['CLogP * weighted_interactions'] = df['desc_CLogP'] * X['weighted_interactions']
# X['rmsd'] = model_df['rmsd'].values
# X['IFP Intersection'] = model_df['IFP Intersection'].values
# X['Weighted IFP Intersection'] = model_df['Weighted IFP Intersection'].values
# X['IFP Similarity'] = X['IFP Intersection'].values / model_df['num_interactions'].values
# X['Weighted IFP Similarity'] = X['Weighted IFP Intersection'].values / model_df['weighted_interactions'].values

In [None]:
X.head()

#### Exploratory Data Analysis:

* Normalizing columns
* Evaluating correlations between features

In [None]:
# Display names for the feature columns. Keys must match the existing column
# labels exactly: `rename` silently skips keys it cannot find, which is why the
# former lower-case key 'interaction weight ratio' never renamed the
# 'Interaction Weight Ratio' column.
column_names = {
    'desc_QED' : 'QED',
    'desc_Bertz' : 'Synthetic Complexity',
    'Interaction Weight Ratio' : 'Avg Interaction Strength',
    'Weighted IFP Similarity' : 'Weighted Interaction Similarity',
    'rmsd' : 'RMSD',
    'RAScore_pred_proba' : 'Synthetic Accessibility',
    'desc_NumHeteroatoms' : '# Heteroatoms'
}

X.rename(columns=column_names, inplace=True)

In [None]:
X[:3].plot(kind='bar', legend=False, logy=True);

In [None]:
X_normalized = X.copy()

# Min-max scale every feature column to the [0, 1] range
for column in X.columns:
    col_min = X[column].min()
    col_range = X[column].max() - col_min
    if col_range == 0:
        # Constant feature: (x - min) / 0 would turn the whole column into NaN
        X_normalized[column] = 0.0
    else:
        X_normalized[column] = (X[column] - col_min) / col_range

In [None]:
X_normalized[:3].plot(kind='bar', legend=False, logy=True);

In [None]:
enc = OrdinalEncoder()

ord = enc.fit_transform(y)

In [None]:
X['Model'] = ord.squeeze()

In [None]:
X.columns

In [None]:
X.corr().style.background_gradient(cmap='coolwarm', vmin=-1, vmax=1)

In [None]:
corrs = X.iloc[:15].copy()

In [None]:
fig, ax = plt.subplots()
sns.heatmap(X.corr(method='pearson'), annot=True, fmt='.3f', 
            cmap=plt.get_cmap('coolwarm'), cbar=True, ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation="horizontal")
plt.xticks(rotation=40);

## Random Forest Classification

In [None]:
enc = OneHotEncoder(handle_unknown='ignore')

In [None]:
enc.fit_transform(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=.2, random_state=1)

In [None]:
len(X_train)

In [None]:
len(X_test)

In [None]:
rf = RandomForestClassifier(n_estimators=250,
                            class_weight='balanced',
                            random_state=1)

rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
# The micro-averaged value happens to be the same with the arguments swapped,
# but any other averaging mode would silently report the wrong quantity.
precision = precision_score(y_test, y_pred, average='micro')

print(f'Average precision: {precision:.3f}')

print(classification_report(y_test, y_pred))

In [None]:
weights = rf.feature_importances_
vars = X_normalized.columns

d = {'Features': vars, 'Weights': weights}

rf_features = pd.DataFrame(data=d).sort_values(['Weights'], ascending=False, ignore_index=True)

rf_features

In [None]:
fig, ax = plt.subplots()

sns.barplot(x = 'Features',
			y = 'Weights',
			data = rf_features.loc[:15],
            palette='colorblind',
            ax=ax)

# plt.title('Ranking of Random Forest Features')
ax.set_xlabel('Features', fontsize=20)
ax.set_ylabel('Feature Importance', labelpad=25, fontsize=20)

plt.xticks(rotation=90)

# Show the plot
plt.show()


## Visualizations

### Visualizing Model Differences

In [None]:
X['SMILES'] = df['smiles']
X['Tanimoto'] = df['tanimoto_Sim']

In [None]:
X.to_csv('data/features')

In [None]:
# Copy positionally with `.values`, like the other transfers between X and
# model_df in this notebook. Assigning the Series itself aligns on index labels,
# and model_df is a concatenation of per-model frames whose labels can repeat,
# so rows of different models would receive the value of the same X row.
model_df['Weighted Interaction Similarity'] = X['Weighted Interaction Similarity'].values

In [None]:
# Attach the docking / interaction columns of model_df to the score table,
# matching rows on the SMILES string (left join: every row of `df` is kept).
# NOTE(review): a SMILES that occurs more than once in model_df (e.g. the same
# molecule produced by two models) yields one output row per match, so `df` can
# grow here — confirm the row count after this cell.
# NOTE(review): assumes df['smiles'] and model_df['SMILES'] are written
# identically (same canonicalisation); unmatched rows get NaN — TODO confirm.
df = df.merge(model_df[['SMILES', 'Model', 'Docking score', 'num_interactions', 'weighted_interactions', 'num_VdW', 'num_hydrophobic', 'num_HBAcceptor', 'num_ionic', 'rmsd', 'Weighted Interaction Similarity']], left_on='smiles', right_on='SMILES', how='left')

In [None]:
df

In [None]:
df.drop(['SMILES'], axis=1, inplace=True)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

params = {'vert' : 0}

df.boxplot(column=['Docking score'], by='Model', ax=ax, **params)

ax = plt.gca()

In [None]:
# Create a FacetGrid with KDE plots
g = sns.FacetGrid(df, hue="Model", height=5, aspect=1.5)
g.map(sns.kdeplot, "Docking score", shade=True).add_legend()

# Add title and labels
g.fig.suptitle('Distributions of Docking Score by Model')
g.set_axis_labels('Docking Score', 'Density')

plt.show()

In [None]:
# KDE of the number of protein interactions, one curve per model
g = sns.FacetGrid(df, hue="Model", height=5, aspect=1.5)
# `fill` is the current name of the deprecated `shade` argument of kdeplot
g.map(sns.kdeplot, "num_interactions", fill=True).add_legend()

# Title and axis labels describe the plotted variable (they previously still
# carried the "Docking Score" text of the cell this one was copied from)
g.fig.suptitle('Distributions of Number of Interactions by Model')
g.set_axis_labels('Number of Interactions', 'Density')

plt.show()

In [None]:
var = 'num_interactions'

plt.figure(figsize=(10, 6))
sns.histplot(data=df, x=var, hue='Model', multiple='dodge', palette='colorblind', bins=3, binwidth=.4,)

plt.xlabel('Number of interactions')
plt.ylabel('Frequency')
plt.title('Number of Protein Interactions by Model')

plt.show();

In [None]:
# KDE of the weighted interaction total, one curve per model
g = sns.FacetGrid(df, hue="Model", height=5, aspect=1.5)
# `fill` is the current name of the deprecated `shade` argument of kdeplot
g.map(sns.kdeplot, "weighted_interactions", fill=True).add_legend()

# Title and axis labels describe the plotted variable (they previously still
# carried the "Docking Score" text of the cell this one was copied from)
g.fig.suptitle('Distributions of Weighted Interactions by Model', y=1.02)
g.set_axis_labels('Weighted Interactions', 'Density')

plt.show()

In [None]:
var = 'weighted_interactions'

plt.figure(figsize=(10, 6))
sns.histplot(data=df, x=var, hue='Model', multiple='dodge', palette='colorblind', bins=3, binwidth=.4,)

# Labels match the plotted column (previously copied from the
# num_interactions histogram)
plt.xlabel('Weighted number of interactions')
plt.ylabel('Frequency')
plt.title('Weighted Protein Interactions by Model')

plt.show();

In [None]:
# KDE of the weighted interaction similarity, one curve per model
g = sns.FacetGrid(df, hue="Model", height=5, aspect=1.5)
# The merge into `df` brings this feature in under its renamed label
# 'Weighted Interaction Similarity'; 'Weighted IFP Similarity' is not a column
# of `df`. `fill` is the current name of the deprecated `shade` argument.
g.map(sns.kdeplot, "Weighted Interaction Similarity", fill=True).add_legend()

# Add title and labels
g.fig.suptitle('Kernel Density Plot of Weighted IFP Similarity by Model', y=1.02)
g.set_axis_labels('Weighted Interaction Similarity', 'Density')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

params = {'vert' : 0,
          'patch_artist' : True}

df.boxplot(column=['rmsd'], by='Model', ax=ax, **params)

ax = plt.gca()

plt.axvline(x=2, ls='dashed', c='green');

In [None]:
# Create a FacetGrid with KDE plots
g = sns.FacetGrid(df, hue="Model", height=5, aspect=1.5)
g.map(sns.kdeplot, "rmsd", shade=True).add_legend()
plt.axvline(x=2, ls='dashed', c='green')

# Add title and labels
g.fig.suptitle('Kernel Density Plot of RMSD', y=1.05)
g.set_axis_labels('RMSD', 'Density')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

params = {'vert' : 0}

df.boxplot(column=['tanimoto_Sim'], by='Model', ax=ax, **params)

ax = plt.gca()

In [None]:
# Create a FacetGrid with KDE plots
g = sns.FacetGrid(df, hue="Model", height=5, aspect=1.5)
g.map(sns.kdeplot, "tanimoto_Sim", shade=True).add_legend()

# Add title and labels
g.fig.suptitle('Kernel Density Plot of Tanimoto Similarity', y=1.05)
g.set_axis_labels('Tanimoto Similarity', 'Density')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

params = {'vert' : 0,
          'patch_artist' : True}

df.boxplot(column=['desc_Bertz'], by='Model', ax=ax, **params)

ax = plt.gca()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

params = {'vert' : 0,
          'patch_artist' : True}

df.boxplot(column=['desc_QED'], by='Model', ax=ax, **params)

ax = plt.gca()

In [None]:
var = 'num_ionic'

plt.figure(figsize=(10, 6))
sns.histplot(data=df, x=var, hue='Model', multiple='dodge', palette='colorblind', bins=3, binwidth=.4,)

plt.xlabel('Number of ionic bonds')
plt.ylabel('Frequency')
plt.title('Number of Ionic Bonds by Model')

plt.show();

### Visualizing clusters w/ PCA and t-SNE

In [None]:
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_normalized)

In [None]:
pca = PCA(n_components=3, random_state=0)
pca_fps = pca.fit_transform(X_normalized)

In [None]:
var1, var2, var3 = pca.explained_variance_ratio_

In [None]:
var1, var2, var3

In [None]:
model_df['PC1'], model_df['PC2'], model_df['PC3'] = pca_fps.T[0], pca_fps.T[1], pca_fps.T[2]

In [None]:
plot_df = model_df.sample(500)

In [None]:
f = sns.pairplot(model_df,
                 hue='Model',
                 vars=['PC1', 'PC2'],
                 palette='colorblind',
                 aspect=2,
                 plot_kws=dict(s=10))

f.fig.suptitle(f'Pairwise Principle Component Plots, variance explained: {var1 + var2:.2f}', fontsize=18, y=1.04);

In [None]:
p = 50

pca_model = PCA(n_components=3, random_state=0)
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne_model = TSNE(n_components=2, random_state=0, perplexity=p, n_iter=5000)
# Embed the standardized feature matrix used for the PCA plot above. `X` is not
# a valid input at this point: earlier cells added the encoded 'Model' label
# (which would leak the class into the embedding) and the string-valued
# 'SMILES' column (which PCA cannot convert to float).
tsne_fps = tsne_model.fit_transform(pca_model.fit_transform(X_normalized))

In [None]:
model_df['TSNE1'], model_df['TSNE2'] = tsne_fps.T[0], tsne_fps.T[1]

In [None]:
plot_df = df.sample(n=500)

In [None]:
f = sns.pairplot(model_df,
                 hue='Model',
                 vars=['TSNE1', 'TSNE2'],
                 palette='colorblind',
                 aspect=2,
                 plot_kws=dict(s=10))

title = f'Pairwise t-SNE plot w/ perplexity $p={p}$'

f.fig.suptitle(title, fontsize=18, y=1.04);