## Libraries

In [None]:
import pandas as pd
import os
import tensorflow as tf
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sn
from sklearn import metrics
from tensorflow import keras
from keras import layers, models, callbacks
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np

# Creating and training the model

## Load and preprocess data for model training

### For 9 different gases or states

In [None]:
# Folder holding the training CSV files. The class of every file is inferred
# from its filename: the first entry of LABELS contained in the name wins.
folder_path = r"C:\Users\thaim\OneDrive\Desktop\Tal_Projects\Gas_detector\UV\Code\code_files\UV Spectrum\Data train\All_train_files"
LABELS = ['Ammonia', 'Benzene', 'H2S', 'Sulfur', 'Ozone', 'Toluene', 'Xylene', 'Regular', 'noise']  # Add more gas names as needed

# Number of values every training example must contain.
N_FEATURES = 311

labels = []    # class index (position in LABELS) of each accepted example
all_data = []  # accepted examples, each a list of N_FEATURES values
inconsistent_data_count = 0  # examples rejected because of a wrong length

# os.listdir() returns names in arbitrary order; sorting keeps the sample order
# stable so that the seeded train/test split below is actually reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, filename)
    # NOTE(review): read_csv uses the first line of each file as a header row;
    # confirm the training files really start with a header.
    df = pd.read_csv(file_path)
    print(f"Processing file: {filename}, shape: {df.shape}")

    # Drop completely blank rows first, then completely blank columns.
    df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)

    # Determine the gas type based on the filename.
    gas_type = next((i for i, gas in enumerate(LABELS) if gas in filename), None)
    if gas_type is None:
        # Previously such files vanished silently; report them instead.
        print(f"Skipping file {filename}: no known label found in its name")
        continue

    if df.shape[1] < 3:
        # Narrow file: the first column as a whole is one example.
        if len(df) == N_FEATURES:
            all_data.append(df.iloc[:, 0].values.tolist())
            labels.append(gas_type)
        else:
            print(f"Skipping file {filename} due to incorrect length: {len(df)}")
            inconsistent_data_count += 1
    else:
        # Wide file: every row is one example.
        for row_idx, row in df.iterrows():
            if len(row) == N_FEATURES:
                all_data.append(row.tolist())
                labels.append(gas_type)
                print(f"Processed row {row_idx + 1} in multi-column file: {filename}")
            else:
                print(f"Skipping row {row_idx + 1} in {filename} due to incorrect length: {len(row)}")
                inconsistent_data_count += 1

# Fail loudly here instead of with a NameError / shape error further down.
if not all_data:
    raise ValueError(f"No usable training examples were found in {folder_path}")
if any(len(d) != N_FEATURES for d in all_data):
    raise ValueError("Data consistency check failed. There are inconsistent entries.")

X = np.array(all_data)
y = np.array(labels)
print("Data consistency check passed.")

print(f"Total inconsistent data entries: {inconsistent_data_count}")

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

# Output shapes
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

## Model definition

In [None]:
# Fully connected classifier: five tanh hidden layers (250-180-100-50-20) with
# light dropout, followed by a softmax layer with one unit per entry in LABELS.
n_inputs = X_train.shape[1]
n_classes = len(LABELS)

layer_stack = [
    layers.Dense(250, activation='tanh', input_shape=(n_inputs,)),
    layers.Dropout(0.2),
    layers.Dense(180, activation='tanh'),
    layers.Dropout(0.2),
    layers.Dense(100, activation='tanh'),
    layers.Dense(50, activation='tanh'),
    layers.Dropout(0.1),
    layers.Dense(20, activation='tanh'),
    layers.Dense(n_classes, activation='softmax'),
]
model = models.Sequential(layer_stack)
model.summary()

## Model compilation and training

In [None]:
# Stop once val_loss has not improved for 10 epochs and roll back to the best
# weights seen; cut the learning rate by 5x after 5 stagnant epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# Integer class labels -> sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Validate on a held-out slice of the *training* data, not on (X_test, y_test).
# Both callbacks make decisions from val_loss, so validating on the test set
# would select the final weights using the very data that is later used to
# report test accuracy, making that number optimistic.
# validation_split takes the last 20% of X_train/y_train (before per-epoch shuffling).
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
)

## Model evaluation

In [None]:
# Score the model on the test split; evaluate() yields the loss followed by
# the compiled metrics (here: accuracy).
test_scores = model.evaluate(X_test, y_test)
loss, accuracy = test_scores
for line in (f"Test Accuracy: {accuracy:.4f}", f"Test Loss: {loss:.4f}"):
    print(line)

# Testing model

## Load saved weights and rebuild the model

In [17]:
# Path to a saved Keras weights file (despite the variable name this points to
# a single ".weights.h5" file, not a folder).
# NOTE(review): the file name suggests a tanh 250 -> 9 network with 0.2 dropout;
# confirm it was saved from the same architecture as `model`.
weights_folder = r"C:\Users\thaim\OneDrive\Desktop\Tal_Projects\Gas_detector\General_Codes\Gas_detector\model_weights\model_tanh_250-9_dropout-02_LR-0001.weights.h5"
# Load the saved weights into the existing `model`; this overwrites whatever
# weights `model` currently holds (including freshly trained ones).
model.load_weights(weights_folder)

In [None]:
# Inference copy of the classifier, with the same layer stack as `model`.
# The previous version of this cell had two defects:
#   * the last layer had no softmax, so predict() returned raw scores although
#     they are plotted further down as probabilities;
#   * no weights were ever loaded, so predictions came from randomly
#     initialised layers.
model2 = models.Sequential([
    layers.Dense(250, activation='tanh', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(180, activation='tanh'),
    layers.Dropout(0.2),
    layers.Dense(100, activation='tanh'),
    layers.Dense(50, activation='tanh'),
    layers.Dropout(0.1),
    layers.Dense(20, activation='tanh'),
    layers.Dense(len(LABELS), activation='softmax')
])

# `weights_folder` is the weights-file path defined in the weight-loading cell.
# The softmax activation adds no parameters, so the saved weights still fit.
model2.load_weights(weights_folder)

model2.summary()

## Reading Test Data for Inference

In [109]:
# Load the recorded test file (no header row) and keep its first row as a
# NumPy vector; that row is the sample fed to the model below.
test_csv_path = r"C:\Users\thaim\OneDrive\Desktop\Tal_Projects\Gas_detector\General_Codes\Gas_Detector_5_24\Records\TEST-mix_for_model.csv"
predict_test = pd.read_csv(test_csv_path, header=None)
first_row = predict_test.iloc[0]
vector_predict_test = first_row.to_numpy()


## Visualization of Input Data

In [None]:

# Show the raw input vector before it is handed to the model.
TEST = vector_predict_test
plt.figure(figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
plt.plot(TEST)


## Performing Predictions

In [None]:


# The model expects a 2-D batch. np.atleast_2d turns the 1-D vector into a
# (1, n_features) batch and leaves an already 2-D array untouched, so this cell
# can be re-run safely. np.expand_dims added a new axis on every run, producing
# (1, 1, n_features) the second time.
TEST = np.atleast_2d(TEST)
print("Test Input Shape:", TEST.shape)

# Run inference; one row of class scores per input row.
probabilities = model2.predict(TEST)

## Plotting the Probability Distribution

In [None]:

# Pick the row of model output to visualise (row 0 = first predicted sample).
sample_index = 0
sample_probabilities = probabilities[sample_index]

# One bar per class, x tick labels tilted so the gas names do not overlap.
fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.bar(LABELS, sample_probabilities)
ax.set_xlabel('Gas Type')
ax.set_ylabel('Probability')
ax.set_title('Probability Distribution for Sample')
plt.xticks(rotation=45)

# Write each value, rounded to two decimals, just above its bar.
for bar, prob in zip(bars, sample_probabilities):
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, bar.get_height(), f'{prob:.2f}', ha='center', va='bottom')

plt.show()


## Batch Prediction & Confusion Matrix

In [None]:
Aset = X_test  # inputs to classify
Bset = y_test  # true class indices for Aset

# Use a suitable batch size
batch_size = 32  # Adjust as needed based on your system's memory capacity

# predict() already walks over its input in chunks of `batch_size`, so no
# manual batching loop is needed.
predictions = model.predict(Aset, batch_size=batch_size, verbose=0)
output = np.argmax(predictions, axis=1)

# Compute the confusion matrix.
# Passing `labels` pins the matrix to one row/column per entry of LABELS.
# Without it, confusion_matrix only covers classes that occur in Bset/output,
# which can disagree with a frame sized by max(Bset) + 1 and raise a shape error.
class_ids = list(range(len(LABELS)))
cm = metrics.confusion_matrix(Bset, output, labels=class_ids)
df_cm = pd.DataFrame(cm, index=LABELS, columns=LABELS)

# Display the confusion matrix using seaborn
plt.figure(figsize=(10, 7))
sn.heatmap(df_cm, annot=True, fmt='g')  # Annotate cells with numbers
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()


## Testing with Excel Data

In [None]:
path = r"C:\Users\thaim\OneDrive\Desktop\Tal_Projects\Gas cells check\Amonia 6000\05_03.xlsx"

xsls_files = pd.read_excel(path)
t = np.array(xsls_files)
# Row 3, columns 647..957 -> 311 values, the input width used for training.
# NOTE(review): the division by 100 presumably rescales the recorded values to
# the range of the training data - confirm against how the CSVs were produced.
test_input = (t[3, 647:958] / 100).astype('float32')

figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
plt.plot(test_input)

index = 10
# To classify a sample of the test split instead, use: test_input = X_test[index, :]
input1 = np.expand_dims(test_input, axis=0)  # single-sample batch
output = model.predict(input1)

# A bare expression is only displayed when it is the last line of a cell, so
# the prediction was never shown before; print the results explicitly.
print("Model output:", output)
print("Predicted label:", LABELS[np.argmax(output)])
# This is the label of X_test[index], NOT of the Excel sample above; it is only
# a valid ground truth when test_input was taken from X_test[index].
print(f"Label of X_test[{index}]:", LABELS[y_test[index]])

# Full code

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers

# Folder path for CSV files. The class of every file is inferred from its
# filename: the first entry of LABELS contained in the name wins.
folder_path = r"C:\Users\thaim\OneDrive\Desktop\Tal_Projects\Gas_detector\UV\Code\code_files\UV Spectrum\Data train\All_train_files"
LABELS = ['Ammonia', 'Benzene', 'H2S', 'Sulfur', 'Ozone', 'Toluene', 'Xylene', 'Regular', 'noise']

# Number of values every training example must contain.
N_FEATURES = 311


def _label_index(filename, class_names):
    """Return the index of the first class name contained in *filename*, or None."""
    return next((i for i, name in enumerate(class_names) if name in filename), None)


def _load_dataset(folder, class_names, n_features):
    """Read every CSV in *folder* and collect fixed-length examples.

    Files with fewer than three columns contribute their first column as one
    example; wider files contribute one example per row. Anything whose length
    differs from *n_features* is reported and counted, not loaded.

    Returns:
        (samples, targets, skipped): list of value lists, list of class
        indices, and the number of rejected entries.
    """
    samples = []
    targets = []
    skipped = 0

    # Sorted so the sample order, and hence the seeded split, is reproducible;
    # os.listdir() alone gives no ordering guarantee.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.csv'):
            continue

        # NOTE(review): read_csv treats the first line as a header row;
        # confirm the training files really start with a header.
        df = pd.read_csv(os.path.join(folder, filename))
        print(f"Processing file: {filename}, shape: {df.shape}")

        # Drop completely blank rows first, then completely blank columns.
        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)

        gas_type = _label_index(filename, class_names)
        if gas_type is None:
            # Report unlabeled files instead of dropping them silently.
            print(f"Skipping file {filename}: no known label found in its name")
            continue

        if df.shape[1] < 3:
            # Narrow file: the first column as a whole is one example.
            if len(df) == n_features:
                samples.append(df.iloc[:, 0].values.tolist())
                targets.append(gas_type)
            else:
                print(f"Skipping file {filename} due to incorrect length: {len(df)}")
                skipped += 1
        else:
            # Wide file: every row is one example.
            for row_idx, row in df.iterrows():
                if len(row) == n_features:
                    samples.append(row.tolist())
                    targets.append(gas_type)
                    print(f"Processed row {row_idx + 1} in multi-column file: {filename}")
                else:
                    print(f"Skipping row {row_idx + 1} in {filename} due to incorrect length: {len(row)}")
                    skipped += 1

    return samples, targets, skipped


all_data, labels, inconsistent_data_count = _load_dataset(folder_path, LABELS, N_FEATURES)

# Fail loudly here instead of with a NameError / shape error further down.
if not all_data:
    raise ValueError(f"No usable training examples were found in {folder_path}")
if any(len(d) != N_FEATURES for d in all_data):
    raise ValueError("Data consistency check failed. There are inconsistent entries.")

X = np.array(all_data)
y = np.array(labels)
print("Data consistency check passed.")

print(f"Total inconsistent data entries: {inconsistent_data_count}")

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

# Output shapes
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Neural Network Model: tanh hidden layers with light dropout and a softmax
# output with one unit per entry in LABELS.
model = models.Sequential([
    layers.Dense(250, activation='tanh', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(180, activation='tanh'),
    layers.Dropout(0.2),
    layers.Dense(100, activation='tanh'),
    layers.Dense(50, activation='tanh'),
    layers.Dropout(0.1),
    layers.Dense(20, activation='tanh'),
    layers.Dense(len(LABELS), activation='softmax')
])

# Compile the model (integer class labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model. No callbacks act on the validation metrics here, so the
# test split is only monitored during training, not used for model selection.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Loss: {loss:.4f}")
