In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import umap

ModuleNotFoundError: No module named 'umap'

In [None]:
# Load the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='filtered_data.csv')
data.head(n=5)

## Training Word2Vec Model

In [None]:
# Pull the 'seq' column out as a plain Python list, then display its length.
seq_list = data['seq'].tolist()
len(seq_list)

In [None]:
def kmerize_sequence(sequence, k):
    """Split a sequence into its overlapping k-mers (stride of one).

    Parameters
    ----------
    sequence : str
        The sequence to split (any sliceable object with a length works).
    k : int
        Length of each k-mer; must be at least 1.

    Returns
    -------
    list
        The ``len(sequence) - k + 1`` consecutive slices of length ``k``, in
        order of their start position. Empty when ``sequence`` is shorter
        than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. Without this check ``k == 0`` would
        silently yield a list of empty strings and a negative ``k`` would
        yield meaningless slices.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]

In [None]:
k = 6  # k-mer length applied to every sequence
# Each sequence becomes a single string of space-separated k-mers.
kmerized_sequences = [
    ' '.join(kmers)
    for kmers in (kmerize_sequence(seq, k) for seq in seq_list)
]

In [None]:
# Split each space-separated k-mer string back into a list of k-mer tokens.
tokenized_sequences = list(map(str.split, kmerized_sequences))

In [None]:
# Train Word2Vec with each tokenized sequence as a sentence and each k-mer as a word.
model = Word2Vec(
    sentences=tokenized_sequences,
    vector_size=100,  # dimensionality of the learned k-mer vectors
    window=6,         # context window, in tokens
    min_count=1,      # keep every k-mer, however rare
    workers=4,        # number of training threads
)

In [None]:
# Persist the trained model under this file name so it can be reloaded later.
model.save("word2vec_promoter.model")

## Generating Embeddings

In [None]:
def sequence_embedding(sequence, model):
    """Average the embedding vectors of the k-mers in a k-merized sequence.

    Parameters
    ----------
    sequence : str
        Whitespace-separated k-mers, e.g. ``"ACGTAC CGTACG"``.
    model : object
        Trained model exposing keyed vectors as ``model.wv`` (membership
        test, item lookup and ``vector_size``), such as a gensim ``Word2Vec``.

    Returns
    -------
    numpy.ndarray
        1-D array: the element-wise mean of the vectors of all k-mers found
        in the vocabulary. When no k-mer is found (empty sequence or only
        unknown k-mers) an all-zero vector of length ``model.wv.vector_size``
        is returned, so every sequence yields a vector of the same length.
    """
    keyed_vectors = model.wv
    vectors = [
        keyed_vectors[k_mer]
        for k_mer in sequence.split()
        if k_mer in keyed_vectors
    ]
    if not vectors:
        # np.mean of an empty list is a scalar NaN (plus a RuntimeWarning),
        # which would make the stacked embedding matrix ragged downstream.
        # float32 is intended to match gensim's vector dtype.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# One mean embedding per k-merized sequence, in the order of `kmerized_sequences`.
sequence_embeddings = [
    sequence_embedding(kmer_text, model) for kmer_text in kmerized_sequences
]

In [None]:
# Display how many embeddings were produced (one per entry of `kmerized_sequences`).
len(sequence_embeddings)

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Stack the list of per-sequence embeddings into one array for t-SNE.
X = np.asarray(sequence_embeddings)

# Project the embeddings down to two components (random_state fixed at 42).
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# Scatter the two t-SNE components on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.5)
ax.set_title('t-SNE visualization of Sequence Embeddings')
ax.set_xlabel('t-SNE feature 1')
ax.set_ylabel('t-SNE feature 2')
plt.show()

In [None]:
import umap
import matplotlib.pyplot as plt

# Stack the list of per-sequence embeddings into one array for UMAP.
X = np.asarray(sequence_embeddings)

# Reduce the embeddings with UMAP (random_state fixed at 42).
reducer = umap.UMAP(random_state=42)
X_umap = reducer.fit_transform(X)

# Scatter the first two UMAP components on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X_umap[:, 0], X_umap[:, 1], alpha=0.5)
ax.set_title('UMAP visualization of Sequence Embeddings')
ax.set_xlabel('UMAP feature 1')
ax.set_ylabel('UMAP feature 2')
plt.show()