In [None]:
# Scaled variant: builds a min-max-scaled feature matrix X from formula, mass and image features.
import cv2
import numpy as np
import pickle
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as pl

# Glob pattern for the per-compound metadata files
path_txt = "D:/ML/IR_test/fig/resize/txt/*.txt"

# One entry per accepted metadata file, kept in glob order
text_data, inchi_data, mol_data, mass_data = [], [], [], []
mol_num = []
formula = []

for txt_path in glob.glob(path_txt):
    with open(txt_path, 'r') as handle:
        lines = handle.readlines()

    # Files with fewer than six lines are ignored entirely
    if len(lines) < 6:
        continue

    # Line layout: SDBS No., IUPAC name, InChI, molecular formula,
    # molecular weight, CAS No.
    sdbs, text, inchi, mol, mass, cas = (line.strip() for line in lines[:6])
    mass = float(mass)

    text_data.append(text)
    inchi_data.append(inchi)
    mol_data.append(mol)
    mass_data.append(mass)

# Bag-of-words encoding of the IUPAC names
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text_data)

# Persist the fitted vectorizer so the same vocabulary can be reloaded later
with open("vectorizer.pkl", "wb") as pkl_file:
    pickle.dump(vectorizer, pkl_file)

# Dense copy of the sparse count matrix
X_text = X.toarray()

import re

# Define atomic weights for various elements.
# The key order of this table also fixes the order of the entries returned by
# calculate_mole_fraction, so every compound gets the same column layout.
atomic_weights = {'C': 12, 'H': 1, 'O': 16, 'N': 14, 'S': 32, 'Cl': 35.5, 'Li': 7, 'Cu':63.6, 'Co': 58.9, 'Br': 79.9, 'F': 19, 'B': 10.8}

# One element symbol (capital letter plus optional lowercase letters) followed
# by an optional, possibly multi-digit, count.
_FORMULA_TOKEN = re.compile(r'([A-Z][a-z]*)(\d*)')


def calculate_mole_fraction(formula):
    """Return each known element's share of the formula's total weight.

    Despite the name, the values are weight fractions: for every element,
    ``count * atomic_weight`` divided by the summed weight of all atoms.

    The formula is read as a flat sequence of ``Symbol[count]`` tokens
    (e.g. ``"C10H22"``, ``"CH4O"``, ``"CCl4"``). A missing count means 1,
    counts may have several digits, and a symbol that occurs more than once
    is accumulated. Characters that are not part of such a token (spaces,
    parentheses, charges, ...) are skipped, so grouping multipliers like
    ``(CH3)2`` are not interpreted.

    Args:
        formula: Molecular formula string.

    Returns:
        dict mapping every key of ``atomic_weights`` (in that table's order)
        to its weight fraction; elements absent from the formula map to 0.

    Raises:
        KeyError: if the formula contains an element symbol that is not in
            ``atomic_weights``.
        ZeroDivisionError: if the formula contains no atoms at all.
    """
    # Start from the full table so the result always has the same keys, in the
    # same order, regardless of which elements the formula mentions.
    counts = dict.fromkeys(atomic_weights, 0)

    for symbol, digits in _FORMULA_TOKEN.findall(formula):
        if symbol not in atomic_weights:
            raise KeyError(
                "unknown element {!r} in formula {!r}".format(symbol, formula)
            )
        counts[symbol] += int(digits) if digits else 1

    weights = {
        symbol: counts[symbol] * atomic_weights[symbol]
        for symbol in atomic_weights
    }
    total_weight = sum(weights.values())

    return {symbol: weight / total_weight for symbol, weight in weights.items()}


# Store the mole fractions
formula.extend(mol_data)

# Build one feature row per formula. The values are read out in the key order
# of atomic_weights so that every row has the same column layout; taking
# dict.values() directly would follow whatever order the dict was filled in,
# which can differ from one formula to the next.
mf = []
for compound_formula in formula:
    mole_fractions = calculate_mole_fraction(compound_formula)
    mf.append([mole_fractions[element] for element in atomic_weights])

# Convert mole fractions to a NumPy array (one row per formula, one column per
# element of atomic_weights)
X_mol = np.array(mf)

# Add molecular weight data to the dataset as a single column
X_mass = np.array(mass_data).reshape(-1, 1)
    
# Read image files and add to the dataset
path_img = "D:/ML/IR_test/fig/resize/fig/*.jpg"
image_data = []
for file in glob.glob(path_img):
    image = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports an unreadable or undecodable file by returning None
    # rather than raising, so fail here with the offending path instead of an
    # opaque AttributeError on `.shape` below.
    if image is None:
        raise ValueError("cv2.imread could not read image file: {}".format(file))
    #image = cv2.resize(image, (0,0), fx=0.04, fy=0.04)
    image_array = np.array(image).reshape(image.shape[0], -1)
    image_data.append(image_array)


# Collect the integer index encoded in each text file's name (the part before
# the first '.'). No matching against the image files happens here.
# NOTE(review): the later np.hstack pairs text-derived rows with image rows
# purely by position, which assumes both globs return files in the same order
# and in equal numbers -- confirm.
# NOTE(review): this loop visits every .txt file, including any that the
# metadata loader skipped for having fewer than six lines, so X_index can have
# more rows than the text-derived features -- confirm.
index_data = [
    int(os.path.basename(file).split('.')[0])
    for file in glob.glob(path_txt)
]

# Convert index data to a column vector and flatten each image to one row
# (np.array over image_data presumably relies on all images sharing one shape)
X_index = np.array(index_data).reshape(-1, 1)
X_image = np.array(image_data)
X_image = X_image.reshape(X_image.shape[0], -1)

from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature group independently. The single scaler object is
# refitted for each group in turn, so after this statement it holds the fit
# for X_image only.
# NOTE(review): the fits for the text, formula and mass groups are therefore
# not retained -- confirm nothing downstream needs to reapply them.
scaler = MinMaxScaler()
X_text_scaled, X_mol_scaled, X_mass_scaled, X_image_scaled = (
    scaler.fit_transform(features)
    for features in (X_text, X_mol, X_mass, X_image)
)


# Number of zero-filled columns appended to the dense text-count matrix
num_of_new_columns = 1

X_zero = np.zeros((X.shape[0], num_of_new_columns))
X_new = np.hstack((X.toarray(), X_zero))

# Final feature matrix: scaled formula fractions, scaled mass, scaled pixels.
# Earlier variant kept for reference:
#X = np.hstack((X_new, X_image))
X = np.concatenate(
    (X_mol_scaled, X_mass_scaled, X_image_scaled), axis=1
).astype(np.float64)

# Discard every row that contains at least one NaN.
# NOTE(review): only X is filtered here; X_index and the other per-row arrays
# keep their original length -- confirm that is intended.
nan_rows = np.isnan(X).any(axis=1)
X = X[~nan_rows]

