In [3]:
import torch
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.decomposition import PCA #PCA'd results are pushed into t-SNE algorithm


In [4]:
# Step 1: read the pre-extracted feature file from disk.
# Point `file_path` at the .pt file you want to analyse (e.g. "your_data.pt").
file_path = "/gpfs/proj1/choe_lab/tanu/Genus_classification/features_extracted/3_genus/train/cricket_data_feature_extracted.pt"

# NOTE(review): torch.load unpickles the file, so only use it on files you trust.
# The loaded object is expected to carry both the spectrogram arrays and their labels.
data = torch.load(file_path)


In [5]:
# Sanity-check the loaded object: container type, number of samples,
# the first record, and the shape of that record's feature array.
print(type(data), len(data), sep="\n")
print(data[0])
print(data[0]["array"].shape)

<class 'list'>
1557
{'array': array([[-1.2775939 , -1.2775939 , -1.2775939 , ..., -0.23541215,
        -0.30382976, -0.3235951 ],
       [-1.2775939 , -1.2775939 , -1.2775939 , ..., -0.23931436,
        -0.2634499 , -0.21648456],
       [-1.2775939 , -1.2775939 , -1.2775939 , ..., -0.23391277,
        -0.33280575, -0.38075528],
       ...,
       [ 0.46703237,  0.46703237,  0.46703237, ...,  0.46703237,
         0.46703237,  0.46703237],
       [ 0.46703237,  0.46703237,  0.46703237, ...,  0.46703237,
         0.46703237,  0.46703237],
       [ 0.46703237,  0.46703237,  0.46703237, ...,  0.46703237,
         0.46703237,  0.46703237]], dtype=float32), 'label': 'Cycloptilum albocircum SINA'}
(1024, 128)


In [6]:
# One row per sample: every 2-D feature array is unrolled into a 1-D vector.
spectrograms = np.array([record["array"].ravel() for record in data])

# Keep only the first whitespace-separated word of each label
# (e.g. 'Cycloptilum albocircum SINA' -> 'Cycloptilum').
labels = [record["label"].split(maxsplit=1)[0] for record in data]

print(labels[0])

Cycloptilum


In [9]:
# Tally how many samples carry each label. `labels` holds only the first word
# of every original label, so the keys here are those first words.
species_sample_count = {}
for name in labels:
    species_sample_count[name] = species_sample_count.get(name, 0) + 1

# Re-build the dictionary ordered by ascending sample count.
species_sample_count = dict(
    sorted(species_sample_count.items(), key=lambda pair: pair[1])
)
print(len(species_sample_count), species_sample_count, sep="\n")

3
{'Cycloptilum': 211, 'Gryllus': 650, 'Oecanthus': 696}


In [17]:
#Apply PCA

In [10]:
# 3 genus
# Reduce the flattened spectrograms to 1000 principal components before t-SNE;
# the original dimension is very high (131072).
num_components = 1000

# With default settings scikit-learn may pick its randomized SVD solver for an
# input of this size, which makes the projection differ from run to run.
# Fixing the seed (same value as the t-SNE step) keeps the result reproducible.
pca = PCA(n_components=num_components, random_state=42)
spec_pca_1000 = pca.fit_transform(spectrograms)

total_explained_variance = sum(pca.explained_variance_ratio_)
print(f"TOTAL EXPLAINED VARIANCE WITH {num_components} components = {total_explained_variance}")

TOTAL EXPLAINED VARIANCE WITH 1000 components = 0.9850547965907026


In [18]:
# 5 genus
# NOTE(review): this cell fits PCA on whatever `spectrograms` currently holds and
# overwrites `spec_pca_1000`. Presumably the data-loading cell was re-pointed at
# the 5-genus file before this cell was executed -- confirm before a "Run All".
# Reduce to 1000 principal components; the original dimension is very high (131072).
num_components = 1000

# With default settings scikit-learn may pick its randomized SVD solver for an
# input of this size, which makes the projection differ from run to run.
# Fixing the seed (same value as the t-SNE step) keeps the result reproducible.
pca = PCA(n_components=num_components, random_state=42)
spec_pca_1000 = pca.fit_transform(spectrograms)

total_explained_variance = sum(pca.explained_variance_ratio_)
print(f"TOTAL EXPLAINED VARIANCE WITH {num_components} components = {total_explained_variance}")

TOTAL EXPLAINED VARIANCE WITH 1000 components = 0.9735084373496647


In [19]:
# 11 genus
# NOTE(review): this cell fits PCA on whatever `spectrograms` currently holds and
# overwrites `spec_pca_1000`. Presumably the data-loading cell was re-pointed at
# the 11-genus file before this cell was executed -- confirm before a "Run All".
# Reduce to 1000 principal components; the original dimension is very high (131072).
num_components = 1000

# With default settings scikit-learn may pick its randomized SVD solver for an
# input of this size, which makes the projection differ from run to run.
# Fixing the seed (same value as the t-SNE step) keeps the result reproducible.
pca = PCA(n_components=num_components, random_state=42)
spec_pca_1000 = pca.fit_transform(spectrograms)

total_explained_variance = sum(pca.explained_variance_ratio_)
print(f"TOTAL EXPLAINED VARIANCE WITH {num_components} components = {total_explained_variance}")

TOTAL EXPLAINED VARIANCE WITH 1000 components = 0.9735097053999198


In [22]:
# t-SNE plot of the data after PCA reduction to 1000 dimensions

In [11]:
# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# releases dropped the old name. Ask the estimator which keyword it accepts so
# the cell runs on both older and newer installations.
_tsne_iter_arg = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"

# Embed the PCA-reduced data in 2-D. The seed is fixed for reproducibility.
spec_tsne_1000 = TSNE(
    n_components=2, perplexity=30.0,  ## feel free to tune hyperparameters
    random_state=42,
    **{_tsne_iter_arg: 5000},
).fit_transform(spec_pca_1000)

In [12]:
def tsneData_with_labels(spec_tsne, point_labels=None):
    """Pair every embedded point with its class label.

    Args:
        spec_tsne: Indexable sequence of embedded points (for example the
            array returned by ``TSNE.fit_transform``); item ``i`` belongs to
            sample ``i``.
        point_labels: Optional sequence of labels aligned with ``spec_tsne``.
            When omitted, the notebook-level ``labels`` list is used, which is
            what this function always did before the parameter existed.

    Returns:
        A list of ``(point, label)`` tuples, one per item of ``spec_tsne``.
        Labels beyond ``len(spec_tsne)`` are ignored.

    Raises:
        IndexError: If there are fewer labels than points.
    """
    if point_labels is None:
        # Backward-compatible default: read the list built earlier in the notebook.
        point_labels = labels

    n_points = len(spec_tsne)
    if len(point_labels) < n_points:
        # Fail up front with a readable message instead of part-way through.
        raise IndexError(
            f"got {len(point_labels)} labels for {n_points} points; "
            "labels and embedding are out of sync"
        )

    return [(spec_tsne[i], point_labels[i]) for i in range(n_points)]


In [13]:
#for 5 genus
data_with_labels_5 = tsneData_with_labels(spec_tsne_1000)
print(len(data_with_labels_5))
print(data_with_labels_5[0])

1557
(array([ -2.1294165, -14.627619 ], dtype=float32), 'Cycloptilum')


In [14]:
import matplotlib.pyplot as plt
import numpy as np
import os

# Split the (point, label) pairs into two parallel NumPy arrays so that
# boolean masks can be used to select the points of one class.
data_points5, class_labels5 = zip(*data_with_labels_5)
data_points5 = np.array(data_points5)
class_labels5 = np.array(class_labels5)

# Sorted unique class names, and one rainbow colour per class that is reused
# in every figure so a class looks the same everywhere.
unique_labels5 = np.unique(class_labels5)
class_colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels5)))

# Padding added around the data when computing the axis limits.
buffer = 20

# Shared axis limits so the overall plot and the per-class plots are comparable.
min_x, max_x = np.min(data_points5[:, 0]) - buffer, np.max(data_points5[:, 0]) + buffer
min_y, max_y = np.min(data_points5[:, 1]) - buffer, np.max(data_points5[:, 1]) + buffer

# NOTE(review): the directory and the overall file name are hard-coded for the
# 3-genus run; adjust them when plotting another data set.
output_directory = "/gpfs/proj1/choe_lab/tanu/Genus_classification/T_SNE_plots/3_genus/"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(output_directory, exist_ok=True)


def _save_tsne_scatter(class_indices, title, save_path, figsize=None):
    """Scatter the classes at ``class_indices`` on one figure, save it, and close it.

    ``class_indices`` index into ``unique_labels5`` / ``class_colors``. The
    figure uses the shared axis limits computed above and is closed after
    saving so that figures do not accumulate in memory.
    """
    plt.figure(figsize=figsize)
    for idx in class_indices:
        name = unique_labels5[idx]
        class_data = data_points5[class_labels5 == name]
        plt.scatter(class_data[:, 0], class_data[:, 1], color=class_colors[idx], label=name, s=10)

    plt.xlim(min_x, max_x)
    plt.ylim(min_y, max_y)
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.title(title)
    plt.legend()

    plt.savefig(save_path)
    plt.close()


# One figure with every class; the title reports the actual number of classes.
overall_plot_path = os.path.join(output_directory, 'overall_plot_3genus.png')
_save_tsne_scatter(
    range(len(unique_labels5)),
    f't-SNE Plot for {len(unique_labels5)} Classes',
    overall_plot_path,
    figsize=(10, 8),
)

# One figure per class, drawn with the same colours and axis limits.
for i, label in enumerate(unique_labels5):
    individual_plot_path = os.path.join(output_directory, f'individual_plot_{label}.png')
    _save_tsne_scatter([i], f't-SNE Plot for Class: {label}', individual_plot_path)

# Every figure is closed right after it is saved, so there is nothing left to
# display here; the plots are available as PNG files in `output_directory`.