In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler

class GCMSDataEncoder(nn.Module):
    """Four-layer MLP that maps a GC-MS feature vector to an embedding.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the three hidden layers.
        output_dim: Size of the returned embedding.
    """

    def __init__(self, input_dim, hidden_dim=64, output_dim=16):
        super().__init__()
        # The attribute names fc1..fc4 define the state_dict keys; keep them stable.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc4 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Apply three ReLU-activated layers, then a linear output layer (no activation)."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.fc4(hidden)

class SensorDataEncoder(nn.Module):
    """Four-layer MLP that maps a gas-sensor feature vector to an embedding.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the three hidden layers.
        output_dim: Size of the returned embedding.
    """

    def __init__(self, input_dim, hidden_dim=64, output_dim=64):
        super().__init__()
        # The attribute names fc1..fc4 define the state_dict keys; keep them stable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return fc4(relu(fc3(relu(fc2(relu(fc1(x))))))); the output is not activated."""
        first = F.relu(self.fc1(x))
        second = F.relu(self.fc2(first))
        third = F.relu(self.fc3(second))
        embedding = self.fc4(third)
        return embedding

In [None]:
# Mount Google Drive at /content/drive; every data and model path below is rooted there.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the GC-MS table (one row per food, identified by the 'food_name' column).
df = pd.read_csv("/content/drive/My Drive/Smell/Contrastive Learning/gcms_dataframe.csv")

# Append a hand-written row labelled "ambient" so it gets a GC-MS vector like any other class.
ambient_row = pd.DataFrame([{
    'food_name': "ambient",
    'C': 0.04,
    "Ca": 0,
    "H": 0.00005,
    "K": 0,
    "Mg": 0,
    "N": 78.08,
    "Na": 0,
    "O": 20.95,
    "P": 0,
    "Se": 0,
}])
df = pd.concat([df, ambient_row], ignore_index=True)

# Keep only the classes used in this experiment.
top_5_food = ["oregano", "cumin", "basil", "ambient"]
# top_5_food = ["coffee_beans", "oregano", "mint_leaves", "apple_juice", "cloves"]
df = df[df['food_name'].isin(top_5_food)]

# Numeric feature matrix: everything except the name column.
df_dropped = df.drop(columns=["food_name"], errors="ignore")

# Standardize each GC-MS feature across the selected rows.
scaler = StandardScaler()
scaler.fit(df_dropped.values)
gcms_data = scaler.transform(df_dropped.values)

# Row i of gcms_data belongs to available_food_names[i]; the two dicts convert between
# that row position and the food name.
available_food_names = df["food_name"].to_list()
ix_to_name = dict(enumerate(available_food_names))
name_to_ix = {name: ix for ix, name in ix_to_name.items()}

In [None]:
ix_to_name

{0: 'basil', 1: 'cumin', 2: 'oregano', 3: 'ambient'}

In [None]:
available_food_names

['basil', 'cumin', 'oregano', 'ambient']

In [None]:
# loading smell sensor data
smell_data_path = "/content/drive/My Drive/Smell/demo_data"

paths = []

for file in os.listdir(smell_data_path):
    file_path = os.path.join(smell_data_path, file)
    food_name = file.split(".")[0]

    if food_name in available_food_names:
        paths.append(file_path)

In [None]:
def create_state_average_df(df):
    """Average every run of consecutive rows that share the same 'State'.

    A run is a maximal block of adjacent rows with an identical 'State' value.
    Each run collapses to one row holding the column-wise mean, with 'State'
    set to the run's state. Runs whose state is 2 or greater are discarded.

    Args:
        df: DataFrame with a 'State' column. It is not modified.

    Returns:
        DataFrame with one row per retained run, in original order, with a
        fresh 0..n-1 index and the same columns as ``df`` (minus any
        'Group' column).
    """
    # Earlier versions of this helper wrote a 'Group' column into the caller's frame.
    # Drop it if present so it is never averaged as a feature.
    features = df.drop(columns=["Group"], errors="ignore")

    # A new run starts wherever 'State' differs from the previous row.
    state = features["State"]
    run_id = (state != state.shift()).cumsum().rename("Group")

    runs = features.groupby(run_id)
    averaged_df = runs.mean()
    # Store the run's own state rather than its averaged (float) value.
    averaged_df["State"] = runs["State"].first()

    averaged_df = averaged_df[averaged_df["State"] < 2]
    # The original discarded the result of reset_index, leaving gaps in the index.
    return averaged_df.reset_index(drop=True)

In [None]:
def calculate_state_difference(df):
    """Subtract each even-positioned row from the odd-positioned row that follows it.

    The frame is first trimmed so that it starts on a row whose 'State' is 1
    (only the very first row is inspected and possibly dropped) and has an
    even number of rows. Rows are then paired as (0, 1), (2, 3), ... and the
    result is ``row[2k + 1] - row[2k]`` for every column, 'State' included.

    Args:
        df: DataFrame with a 'State' column. It is not modified.

    Returns:
        DataFrame with one row per pair and a 0..n-1 index. An empty input
        yields an empty frame instead of raising IndexError.
    """
    # Nothing to pair; the df.iloc[0] lookup below would raise on an empty frame.
    if len(df) == 0:
        return df.iloc[0:0].reset_index(drop=True)

    # Drop a leading row that is not in state 1 so that pairs start on state 1.
    if df.iloc[0]['State'] != 1:
        df = df.iloc[1:]

    # Drop a trailing unpaired row, if any.
    usable_rows = len(df) - (len(df) % 2)
    df = df.iloc[:usable_rows].reset_index(drop=True)

    second_of_pair = df.iloc[1::2].reset_index(drop=True)
    first_of_pair = df.iloc[0::2].reset_index(drop=True)
    return second_of_pair - first_of_pair

In [None]:
from collections import defaultdict
import re

# One differenced, labelled DataFrame per sensor recording.
ingredient_df = []

for path in paths:
    # Derive the ingredient from the file name exactly as the file-discovery cell does
    # (text before the first dot). The previous `re.split(r'[./]', path)[-3]` only worked
    # when the file name contained exactly two dots.
    ingredient_name = os.path.basename(path).split(".")[0]

    dataframe = pd.read_csv(path)
    # Keep at most the first 14 columns; any extra trailing columns are discarded.
    if dataframe.shape[1] > 14:
        dataframe = dataframe[dataframe.columns[:14]]
    dataframe = dataframe.drop(
        columns=["timestamp", "Temperature", "Pressure", "Humidity", "Gas_Resistance", "Altitude"]
    )
    # The last remaining column is treated as the 'State' column.
    dataframe = dataframe.rename(columns={dataframe.columns[-1]: "State"})

    # Difference against the reading 50 rows earlier. This applies to every remaining
    # column, so 'State' also holds a 50-row difference from here on.
    diff_data = dataframe.diff(periods=50)

    # Drop rows where no sensor channel changed at all.
    sensor_cols = [col for col in diff_data.columns if col not in ["State", "label"]]
    diff_data = diff_data[~(diff_data[sensor_cols] == 0).all(axis=1)]

    # Trim 200 rows from each end. This also removes the leading rows that `diff` left
    # as NaN. `.copy()` makes the label assignment below write to an independent frame
    # rather than a slice.
    diff_data = diff_data.iloc[200:-200].copy()

    diff_data["label"] = name_to_ix[ingredient_name]
    ingredient_df.append(diff_data)

In [None]:
# Stack the per-file frames row-wise into one table; ignore_index=True renumbers the rows from 0.
combined_df = pd.concat(ingredient_df, axis=0, ignore_index=True)

In [None]:
# NOTE(review): columns_to_normalize is not referenced again in the visible cells. The frames
# shown in later outputs have 9 columns, so the [:13] slice selects every column, including
# 'State' and 'label'. Confirm whether this cell can be removed.
columns_to_normalize = combined_df.columns[:13]

In [None]:
def filter_outliers(group, lower_q=0.2, upper_q=0.8, whisker=1.5):
    """Drop rows that fall outside a quantile-range fence on any numeric column.

    For each numeric column the fence is
    ``[q_low - whisker * (q_high - q_low), q_high + whisker * (q_high - q_low)]``,
    where ``q_low`` / ``q_high`` are the ``lower_q`` / ``upper_q`` quantiles.
    Columns are processed one after another and each fence is computed on the
    rows that survived the previous columns, so the result can depend on
    column order. Rows with NaN in a numeric column are dropped, because NaN
    fails both bound comparisons.

    Args:
        group: DataFrame to filter. It is not modified.
        lower_q: Lower quantile in [0, 1]. The default 0.2 is the 20th
            percentile, not the first quartile.
        upper_q: Upper quantile in [0, 1]. The default 0.8 is the 80th
            percentile, not the third quartile.
        whisker: Multiplier applied to the quantile range.

    Returns:
        The filtered DataFrame, with the original index labels preserved.
    """
    numerical_columns = group.select_dtypes(include=[np.number]).columns
    for col in numerical_columns:
        low_value = group[col].quantile(lower_q)
        high_value = group[col].quantile(upper_q)
        spread = high_value - low_value
        lower_bound = low_value - whisker * spread
        upper_bound = high_value + whisker * spread
        group = group[(group[col] >= lower_bound) & (group[col] <= upper_bound)]
    return group

# Per-label outlier filtering is currently disabled; re-enable it with:
# filtered_groups = combined_df.groupby('label').apply(filter_outliers).reset_index(drop=True)
# Take a copy rather than an alias: the normalization cell below overwrites columns of
# filtered_groups in place, which would otherwise silently rewrite combined_df as well.
filtered_groups = combined_df.copy()

In [None]:
print(combined_df.groupby('label').size().reset_index(name='count'))

   label  count
0      0    809
1      1    923
2      2    910
3      3    810


In [None]:
print(filtered_groups.groupby('label').size().reset_index(name='count'))

   label  count
0      0    809
1      1    923
2      2    910
3      3    810


In [None]:
# NOTE: this rebinds `scaler`; the scaler fitted on the GC-MS table in the GC-MS loading
# cell is no longer reachable under that name after this line.
scaler = StandardScaler()

# Standardize every numeric column except the non-feature 'label' and 'State' columns.
numerical_columns = filtered_groups.select_dtypes(include=[np.number]).columns
numerical_columns = numerical_columns.drop(['label', 'State'])  # Exclude non-feature columns

# Fit the scaler and overwrite the feature columns of filtered_groups in place.
# NOTE(review): the scaler is fitted on all rows, before the train/test split performed in
# later cells, so `train_mean` / `train_std` also reflect rows that end up in the test
# set. Confirm this is intended.
filtered_groups[numerical_columns] = scaler.fit_transform(filtered_groups[numerical_columns])

# Per-feature statistics of the fitted scaler, kept for reuse outside this cell.
train_mean = scaler.mean_
train_std = np.sqrt(scaler.var_)  # scaler.var_ is the per-feature variance, so this is the standard deviation

# Prints the whole frame (pandas truncates the middle rows in the display).
print("\nNormalized DataFrame:")
print(filtered_groups)


Normalized DataFrame:
           NO2    C2H50H       VOC        CO   Alcohol       LPG   Benzene  \
0    -0.632987 -0.206901 -0.478227 -1.143735 -0.022777 -0.059637 -0.114464   
1    -0.461998 -0.206901 -0.649975 -1.624165 -0.022777 -0.452514 -0.114464   
2    -0.632987 -0.437518 -0.821723 -1.624165 -0.022777 -0.452514 -0.114464   
3    -0.803976 -0.668135 -0.821723 -1.624165 -0.022777 -0.059637 -0.114464   
4    -0.632987 -0.437518 -0.649975 -1.143735 -0.022777 -0.059637 -0.114464   
...        ...       ...       ...       ...       ...       ...       ...   
3447 -2.342880 -0.437518 -2.367453 -1.143735 -0.022777  0.333240 -0.114464   
3448 -2.171890 -0.668135 -2.367453 -1.143735 -0.022777 -0.059637 -0.114464   
3449 -2.171890 -0.668135 -2.367453 -1.143735 -0.022777 -0.059637 -0.114464   
3450 -2.000901 -0.668135 -2.195705 -1.143735 -0.022777  0.333240 -0.114464   
3451 -2.000901 -0.668135 -2.195705 -1.143735 -0.022777 -0.059637 -0.114464   

      State  label  
0       0.0      3 

In [None]:
train_mean.shape

(7,)

In [None]:
train_std

array([5.84832099e+00, 4.33619372e+00, 5.82249125e+00, 2.08146843e+00,
       2.49279793e+00, 2.54532829e+00, 5.43490188e+08])

In [None]:
def select_median_representative(group, n=1):
    """Return the ``n`` rows of ``group`` closest to its column-wise median.

    Closeness is the Euclidean distance between a row and the vector of
    per-column medians, computed over all columns of ``group``.

    Args:
        group: DataFrame of numeric columns. It is not modified; the previous
            implementation added a temporary 'distance' column to it.
        n: Number of rows to return. If ``n`` exceeds the number of rows, all
            rows are returned.

    Returns:
        DataFrame with the selected rows, ordered from nearest to farthest.
        Ties keep their original relative order. Original index labels are
        preserved.
    """
    median_values = group.median()
    distances = np.linalg.norm(group - median_values, axis=1)
    # A stable sort keeps the first occurrence first among equal distances.
    closest_positions = np.argsort(distances, kind="stable")[:n]
    return group.iloc[closest_positions]

In [None]:
def select_median(group):
    """Return the per-column median of ``group`` (the result of ``group.median()``)."""
    return group.median()

In [None]:
label_counts = filtered_groups.groupby('label').size().reset_index(name='count')

label_counts

Unnamed: 0,label,count
0,0,809
1,1,923
2,2,910
3,3,810


In [None]:
# sampled_df = filtered_groups.groupby('label').apply(select_median)
# Draw 300 rows per label for training. Each label is sampled with its own
# random_state=42 draw, which selects the same rows as the former
# groupby(...).apply(lambda x: x.sample(...)). Building the result with pd.concat avoids
# the pandas warning about apply operating on the grouping column, and keeps the original
# row index instead of a (label, row) MultiIndex.
sampled_df = pd.concat(
    [
        label_rows.sample(n=300, random_state=42)
        for _, label_rows in filtered_groups.groupby('label')
    ]
)

  sampled_df = filtered_groups.groupby('label').apply(lambda x: x.sample(n=300, random_state=42))


In [None]:
print(sampled_df.shape)

(1200, 9)


In [None]:
# Turn every row into a tuple of its values so rows can be compared by content.
df_tuples = filtered_groups.apply(tuple, axis=1)
representatives_tuples = sampled_df.apply(tuple, axis=1)

# Get the remaining rows (testing data)
# Matching is by value, not by row identity: any row whose values equal those of a
# sampled row is excluded too. That is why the outputs show 1905 remaining rows rather
# than 3452 - 1200 = 2252.
remaining_data = filtered_groups[~df_tuples.isin(representatives_tuples)]

In [None]:
remaining_data.shape

(1905, 9)

In [None]:
# Training arrays: the feature matrix without the non-feature 'label' / 'State' columns,
# and the matching label index for each row.
smell_data = sampled_df.drop(['label', 'State'], axis=1).values  # Features
y = sampled_df['label'].values  # Labels

In [None]:
gcms_data.shape

(4, 10)

In [None]:
# Pair every sampled sensor row with the GC-MS vector of its label.
# Each element is (sensor_features, gcms_features), in that order.
pair_data = [
    (sensor_row, gcms_data[int(label)])
    for sensor_row, label in zip(smell_data, y)
]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [None]:
def cross_modal_contrastive_loss(z1, z2, temperature=0.07):
    """Symmetric cross-entropy contrastive loss between two aligned embedding batches.

    Row ``i`` of ``z1`` and row ``i`` of ``z2`` form the positive pair; every
    other row in the batch acts as a negative.

    Args:
        z1: Embeddings, indexed as [batch, dim].
        z2: Embeddings with the same batch size as ``z1``.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        Scalar tensor: the mean of the z1->z2 and z2->z1 cross-entropy losses.
    """
    # Unit-length rows turn the dot product below into a cosine similarity.
    unit_z1 = F.normalize(z1, dim=1)
    unit_z2 = F.normalize(z2, dim=1)

    # logits[i, j] = cos(z1[i], z2[j]) / temperature
    logits = torch.matmul(unit_z1, unit_z2.t()) / temperature

    # The matching column for row i is i itself (the diagonal).
    targets = torch.arange(unit_z1.size(0), device=unit_z1.device)

    # Classify along rows (z1 -> z2) and along columns (z2 -> z1), then average.
    z1_to_z2 = F.cross_entropy(logits, targets)
    z2_to_z1 = F.cross_entropy(logits.t(), targets)
    return 0.5 * (z1_to_z2 + z2_to_z1)

In [None]:
class PairedDataset(Dataset):
    """
    Dataset over a sequence of aligned vector pairs.

    'data' is a list (or array-like) of length N whose items are 2-tuples
    ``(first_vec, second_vec)``. The dataset does not interpret the two
    positions; it returns them in the order they were stored. In this
    notebook the pairs are built as ``(sensor_vector, gcms_vector)``.

    Each vector may be a NumPy array, a Python list, or a tensor. Both are
    returned as float32 tensors.
    """
    def __init__(self, data):
        self.data = data  # data = [(first_vec, second_vec), (first_vec, second_vec), ...]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        first_vec, second_vec = self.data[idx]

        # as_tensor accepts lists, arrays and existing tensors alike. torch.tensor()
        # emits a UserWarning when handed a tensor. A float32 input may be returned
        # without copying.
        first_vec = torch.as_tensor(first_vec, dtype=torch.float)
        second_vec = torch.as_tensor(second_vec, dtype=torch.float)

        return first_vec, second_vec


In [None]:
# Wrap the (sensor, gcms) pairs and iterate over them in shuffled mini-batches of 64.
# NOTE(review): no random seed is set in the visible cells, so the batch order (and the
# training result) differs between runs.
dataset = PairedDataset(pair_data)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [None]:
print(gcms_data.shape)
print(smell_data.shape)

(4, 10)
(1200, 7)


In [130]:
# Hyperparameters. Input sizes are read from the prepared arrays.
gcms_input_dim = gcms_data.shape[1]
sensor_input_dim = smell_data.shape[1]
embedding_dim = 16  # final output dimension shared by both encoders
hidden_dim = 128
temperature = 0.07
num_epochs = 100

# Instantiate one encoder per modality; both project into the same embedding_dim.
gcms_encoder = GCMSDataEncoder(gcms_input_dim, hidden_dim, embedding_dim)
sensor_encoder = SensorDataEncoder(sensor_input_dim, hidden_dim, embedding_dim)

# Put on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gcms_encoder.to(device)
sensor_encoder.to(device)

# Define optimizer
# A single Adam optimizer updates the parameters of both encoders jointly.
params = list(gcms_encoder.parameters()) + list(sensor_encoder.parameters())

optimizer = optim.Adam(params, lr=1e-3)

# Training loop
# NOTE(review): the outputs show only 4 distinct GC-MS vectors (gcms_data.shape == (4, 10)),
# so a batch of 64 necessarily repeats the same GC-MS input many times, while the loss
# treats every off-diagonal pair as a negative. This may be why the logged loss plateaus
# near 2.9. Confirm whether a label-aware loss is wanted.
for epoch in range(num_epochs):
    gcms_encoder.train()
    sensor_encoder.train()

    total_loss = 0.0
    # Each batch is (sensor, gcms): pair_data stores (smell_row, gcms_row) and
    # PairedDataset returns the two items in stored order.
    for (x_sensor, x_gcms) in dataloader:
        x_gcms = x_gcms.to(device)
        x_sensor = x_sensor.to(device)

        optimizer.zero_grad()

        # Forward pass
        z_gcms = gcms_encoder(x_gcms)    # shape [batch_size, embedding_dim]
        z_sensor = sensor_encoder(x_sensor)

        # Contrastive loss: row i of z_gcms and row i of z_sensor are the positive pair.
        loss = cross_modal_contrastive_loss(z_gcms, z_sensor, temperature)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over this epoch.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Epoch 1/100, Loss: 3.5810
Epoch 2/100, Loss: 3.2350
Epoch 3/100, Loss: 3.1070
Epoch 4/100, Loss: 3.0748
Epoch 5/100, Loss: 3.0573
Epoch 6/100, Loss: 3.0244
Epoch 7/100, Loss: 3.0151
Epoch 8/100, Loss: 3.0334
Epoch 9/100, Loss: 3.0307
Epoch 10/100, Loss: 3.0281
Epoch 11/100, Loss: 2.9868
Epoch 12/100, Loss: 2.9739
Epoch 13/100, Loss: 2.9791
Epoch 14/100, Loss: 2.9920
Epoch 15/100, Loss: 2.9450
Epoch 16/100, Loss: 2.9601
Epoch 17/100, Loss: 2.9499
Epoch 18/100, Loss: 2.9582
Epoch 19/100, Loss: 2.9460
Epoch 20/100, Loss: 2.9413
Epoch 21/100, Loss: 2.9408
Epoch 22/100, Loss: 2.9521
Epoch 23/100, Loss: 2.9417
Epoch 24/100, Loss: 2.9382
Epoch 25/100, Loss: 2.9183
Epoch 26/100, Loss: 2.9248
Epoch 27/100, Loss: 2.9099
Epoch 28/100, Loss: 2.9351
Epoch 29/100, Loss: 2.9526
Epoch 30/100, Loss: 2.9285
Epoch 31/100, Loss: 2.9293
Epoch 32/100, Loss: 2.9170
Epoch 33/100, Loss: 2.9026
Epoch 34/100, Loss: 2.8936
Epoch 35/100, Loss: 2.8970
Epoch 36/100, Loss: 2.8970
Epoch 37/100, Loss: 2.8971
Epoch 38/1

In [131]:
import torch
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def evaluate_retrieval(test_smell_data, test_smell_label, gcms_encoder, sensor_encoder, device='cpu', gcms_reference=None):
    """
    Classify sensor samples by retrieving the most similar GC-MS embedding.

    Every row of the GC-MS reference matrix is embedded once with
    ``gcms_encoder``. Each sensor sample is embedded with ``sensor_encoder``
    and assigned the index of the reference row with the highest cosine
    similarity. The predicted indices are compared with ``test_smell_label``.

    Prints the similarity matrix, the predictions and summary metrics.

    Parameters:
      test_smell_data: array-like of sensor feature rows, one per sample.
      test_smell_label: array-like (or tensor) of true reference-row indices,
                        one per sample.
      gcms_encoder, sensor_encoder: trained PyTorch encoders.
      device: 'cpu', 'cuda' or a torch.device on which inference runs.
      gcms_reference: optional array-like of GC-MS rows to retrieve from.
                      Defaults to the notebook-level ``gcms_data``.

    Returns:
      (accuracy, conf_matrix): accuracy as a Python float and the confusion
      matrix returned by sklearn's ``confusion_matrix(true, predicted)``.
    """
    if gcms_reference is None:
        gcms_reference = gcms_data  # notebook-level standardized GC-MS matrix

    gcms_encoder.eval()
    sensor_encoder.eval()

    # sklearn and numpy need host-side labels, regardless of where inference runs.
    if torch.is_tensor(test_smell_label):
        true_labels = test_smell_label.detach().cpu().numpy()
    else:
        true_labels = np.asarray(test_smell_label)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        testing_gcms_data = torch.tensor(gcms_reference, dtype=torch.float).to(device)
        z_gcms = F.normalize(gcms_encoder(testing_gcms_data), dim=1)

        test_smell_data = torch.tensor(test_smell_data, dtype=torch.float).to(device)
        z_smell = F.normalize(sensor_encoder(test_smell_data), dim=1)

        # sim[i, j] = cosine similarity between sensor sample i and GC-MS row j.
        sim = torch.matmul(z_smell, z_gcms.T)

        # For each sensor sample, pick the GC-MS row with the highest similarity.
        predicted = sim.argmax(dim=1)  # [N]

    print(sim)

    print(f"Similarity matrix shape: {sim.shape}")

    print("------------------Predictions---------------------")
    print(predicted)

    # Move predictions to host memory so the comparison and the sklearn metrics also
    # work when `device` is a GPU.
    predicted = predicted.cpu().numpy()

    accuracy = float((predicted == true_labels).mean())

    precision = precision_score(true_labels, predicted, average='macro')
    recall = recall_score(true_labels, predicted, average='macro')
    f1 = f1_score(true_labels, predicted, average='macro')
    conf_matrix = confusion_matrix(true_labels, predicted)

    print("------------------Test Statistics---------------------")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)

    return accuracy, conf_matrix



In [132]:
# Evaluation arrays built from the rows that were not matched to any sampled training row.
test_smell_data = remaining_data.drop(['label', 'State'], axis=1).values  # Features
test_y = remaining_data['label'].values  # Labels

In [133]:
accuracy, conf_matrix = evaluate_retrieval(test_smell_data, test_y, gcms_encoder, sensor_encoder, device=device)
print(f"Test retrieval accuracy: {accuracy*100:.2f}%")

tensor([[0.3472, 0.3286, 0.8805, 0.5573],
        [0.1159, 0.2759, 0.3987, 0.8745],
        [0.1983, 0.3690, 0.6046, 0.8627],
        ...,
        [0.3450, 0.3113, 0.8535, 0.1799],
        [0.3576, 0.3421, 0.8602, 0.1937],
        [0.3530, 0.3283, 0.8613, 0.2006]], grad_fn=<MmBackward0>)
Similarity matrix shape: torch.Size([1905, 4])
------------------Predictions---------------------
tensor([2, 3, 3,  ..., 2, 2, 2])
------------------Test Statistics---------------------
Accuracy: 0.8808
Precision: 0.8841
Recall: 0.8761
F1-Score: 0.8791
Confusion Matrix:
[[474   9   6   0]
 [ 22 516  47   6]
 [  0  51 403  20]
 [ 10   9  47 285]]
Test retrieval accuracy: 88.08%


In [134]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

def analyze_confusion_matrix(conf_matrix, class_names=None):
    """Compute the per-class accuracy (recall) from a confusion matrix.

    The matrix follows the scikit-learn convention: row ``i`` holds the
    samples whose true class is ``i`` and column ``j`` the samples predicted
    as ``j``. The reported value is therefore ``conf_matrix[i, i]`` divided
    by the row sum, i.e. the fraction of class-``i`` samples that were
    classified correctly.

    Args:
        conf_matrix: Square array-like of counts.
        class_names: Optional sequence giving the name of each row. Defaults
            to the notebook-level ``available_food_names``.

    Returns:
        ``{class_name: {"Accuracy": value}}``. A class with no samples gets
        0.0 instead of a division-by-zero NaN.
    """
    conf_matrix = np.asarray(conf_matrix)
    if class_names is None:
        class_names = available_food_names

    class_metrics = {}
    for i in range(conf_matrix.shape[0]):
        correctly_classified = conf_matrix[i, i]
        # Row sum = number of samples whose true class is i.
        class_total = conf_matrix[i].sum()

        if class_total:
            per_class_accuracy = correctly_classified / class_total
        else:
            per_class_accuracy = np.float64(0.0)

        class_metrics[class_names[i]] = {
            "Accuracy": per_class_accuracy,
        }

    return class_metrics

In [135]:
analyze_confusion_matrix(conf_matrix)

{'basil': {'Accuracy': np.float64(0.9693251533742331)},
 'cumin': {'Accuracy': np.float64(0.8730964467005076)},
 'oregano': {'Accuracy': np.float64(0.8502109704641351)},
 'ambient': {'Accuracy': np.float64(0.811965811965812)}}

In [136]:
from datetime import datetime

In [137]:
train_mean

array([2.70191194e+00, 1.89716107e+00, 3.78447277e+00, 3.80648899e-01,
       5.67786790e-02, 1.51796060e-01, 6.22098392e+07])

In [138]:
train_std

array([5.84832099e+00, 4.33619372e+00, 5.82249125e+00, 2.08146843e+00,
       2.49279793e+00, 2.54532829e+00, 5.43490188e+08])

In [139]:
# Use one timestamp for both files so the two encoders from a run can be matched up.
# Calling datetime.now() once per path produced two different suffixes. The format avoids
# the spaces and colons of the default datetime string.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
gcms_model_path = f"/content/drive/My Drive/Smell/Contrastive Learning/demo_gcms_encoder_{run_stamp}.pt"
sensor_model_path = f"/content/drive/My Drive/Smell/Contrastive Learning/demo_sensor_encoder_{run_stamp}.pt"

# Only the weights are saved. NOTE(review): train_mean / train_std, needed to standardize
# new sensor readings the same way, are not written anywhere in the visible cells.
torch.save(gcms_encoder.state_dict(), gcms_model_path)
torch.save(sensor_encoder.state_dict(), sensor_model_path)