In [None]:
import cv2
import numpy as np
import pickle
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from rdkit import Chem
from rdkit.Chem import MACCSkeys

# Setting the file path
path_txt = "D:/ML/IR_test/fig/resize/txt/*.txt"
text_data = []
inchi_data = []
mol_data = []
mass_data = []
mol_num = []
formula = []
index_data = []  # SDBS No. of every accepted record, parallel to the lists above


def parse_sdbs_record(lines):
    """Split the leading six lines of one SDBS text file into its fields.

    Parameters
    ----------
    lines : list of str
        Raw lines of the file. The first six are read, in order, as
        SDBS No., IUPAC name, InChI, molecular formula, molecular weight
        and CAS No.; any further lines are ignored.

    Returns
    -------
    tuple or None
        ``(sdbs, text, inchi, mol, mass, cas)`` with surrounding whitespace
        stripped and ``mass`` converted to ``float``, or ``None`` when the
        file has fewer than six lines.

    Raises
    ------
    ValueError
        If the molecular-weight line cannot be converted to ``float``.
    """
    if len(lines) < 6:
        return None
    sdbs, text, inchi, mol, mass_text, cas = (line.strip() for line in lines[:6])
    return sdbs, text, inchi, mol, float(mass_text), cas


# Reading the text data
# sorted() makes the record order (and therefore the row order used for
# clustering) independent of the order in which the filesystem lists files.
for file in sorted(glob.glob(path_txt)):
    with open(file, 'r') as f:
        lines = f.readlines()

    try:
        record = parse_sdbs_record(lines)
    except ValueError as e:
        # One malformed file must not abort the whole import; report and move on.
        print(f"Skipping {file}: invalid molecular weight ({e})")
        continue

    if record is None:
        # Fewer than six lines: not a complete record, skipped as before.
        continue

    sdbs, text, inchi, mol, mass, cas = record

    # Adding to data lists (all appended together so they stay index-aligned)
    text_data.append(text)
    inchi_data.append(inchi)
    mol_data.append(mol)
    mass_data.append(mass)   # molecular weight was parsed but never stored before
    index_data.append(sdbs)  # Adding SDBS No. to index_data

# Generating MACCS keys
# Each fingerprint is collected together with the SDBS No. of the record it
# came from. Records whose InChI cannot be turned into a molecule are left out
# of BOTH lists, so that row i of X is always labelled by X_index[i].
maccs_keys_data = []
maccs_index_data = []  # SDBS No. for every row kept in maccs_keys_data

for sdbs_no, inchi in zip(index_data, inchi_data):
    try:
        # Add "InChI=" if it's missing
        if not inchi.startswith("InChI="):
            inchi = "InChI=" + inchi

        # Create a molecule object from InChI
        molecule = Chem.MolFromInchi(inchi)

        if not molecule:
            # Molecule creation failed: drop this record and say so
            print(f"Skipping SDBS No. {sdbs_no}: could not build a molecule from {inchi}")
            continue

        # Calculate MACCS keys
        maccs_keys = MACCSkeys.GenMACCSKeys(molecule)
        maccs_keys_data.append(list(maccs_keys))
        maccs_index_data.append(sdbs_no)
    except Exception as e:
        print(f"Error processing InChI: {inchi}, Error: {str(e)}")

# Converting the fingerprints to a NumPy array (one row per kept molecule)
X = np.array(maccs_keys_data)

# Converting the matching SDBS numbers to a NumPy array.
# Built from the filtered list, not from index_data, which still contains the
# records that were dropped above.
X_index = np.array(maccs_index_data).reshape(-1)

# Hierarchical clustering of the rows of X: Ward linkage on Euclidean distance
Z = linkage(X, 'ward', 'euclidean')

# Draw the tree on a very wide, high-resolution canvas; leaves are labelled
# with the entries of X_index
fig = plt.figure(dpi=300, figsize=(50, 10))
dn = dendrogram(Z, labels=X_index)

# Axis captions
plt.ylabel('Distance', fontsize=20)
plt.xlabel('SDBS No.', fontsize=20)

# Font size of the leaf labels along the x-axis
plt.xticks(fontsize=15)

plt.show()


In [None]:
import os
import numpy as np
from PIL import Image, ImageSequence
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
import glob

# Path to the folder containing the GIF files
folder_path = "D:/ML/IR_rev/cropped_images_new8"

# Set the threshold for black pixels (pure black is 0)
threshold = 0


def lowest_black_pixel_rows(frame_array, threshold):
    """Return, for every column, the row of the bottom-most black pixel.

    Parameters
    ----------
    frame_array : numpy.ndarray
        2-D grayscale image (rows x columns).
    threshold : int
        A pixel counts as black when its value is <= threshold.

    Returns
    -------
    numpy.ndarray
        One integer per column: the largest row index holding a black pixel,
        or -1 when the column contains no black pixel.
    """
    is_black = frame_array <= threshold
    n_rows = frame_array.shape[0]
    # On the vertically flipped mask, argmax gives the distance of the first
    # black pixel from the bottom edge (0 when the column has none, which is
    # masked out by the any() test below).
    offset_from_bottom = np.argmax(is_black[::-1], axis=0)
    return np.where(is_black.any(axis=0), n_rows - 1 - offset_from_bottom, -1)


# Get all GIF files in the folder (sorted so the row order is reproducible)
gif_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.gif'))

# List to store image data (one list of per-column Y values per frame)
image_data = []

# List to store index data (one entry per frame, parallel to image_data)
index_data = []

# Process each GIF file
for gif_file in gif_files:
    gif_path = os.path.join(folder_path, gif_file)
    print(f"Processing {gif_file}...")

    # Get the index from the file name (e.g., 1.gif → 1).
    # Done before decoding so a non-numeric name fails before any work is spent.
    index = int(gif_file.split('.')[0])

    # The context manager closes the underlying file once all frames are read
    with Image.open(gif_path) as gif:
        # Process each frame
        for frame in ImageSequence.Iterator(gif):
            # Convert the frame to grayscale
            frame_array = np.array(frame.convert('L'))

            # For each column, the Y coordinate of the lowest black pixel (-1 if none)
            y_values = lowest_black_pixel_rows(frame_array, threshold).tolist()

            # Store the image data and the matching index
            image_data.append(y_values)
            index_data.append(index)

# Convert to NumPy arrays
X_image = np.array(image_data)
X_index = np.array(index_data).reshape(-1)

# Report the dimensions of the image feature matrix
print("Shape of image data:", X_image.shape)

# Ward-linkage hierarchical clustering on Euclidean distance between rows
Z = linkage(X_image, 'ward', 'euclidean')

# Draw the dendrogram on a very wide, high-resolution canvas; leaves are
# labelled with the entries of X_index
fig = plt.figure(dpi=300, figsize=(50, 10))
dn = dendrogram(Z, labels=X_index)

# Axis captions
plt.ylabel('Distance', fontsize=20)
plt.xlabel('SDBS No.', fontsize=20)

# Font size of the tick labels along the x-axis
plt.xticks(fontsize=15)

plt.show()
