In [None]:
import os
import pandas as pd
import numpy as np

import collections
from scipy.spatial.distance import minkowski

What is in this notebook:
- Create a mix for a piece of guided content by finding the N sounds closest to its vector representation (keeping a sound only if it passes a given threshold)
- Use the sounds' tags (? - still to be decided)

### Import Guided Content Verbatim Data

In [None]:
import re

def filter_EN_content(files: [str]):
    """Return the file names that contain none of the exclusion markers.

    A name is rejected as soon as one of the markers below (language tags,
    an accented 'é', any digit, ...) is found anywhere in it. Matching is
    case-sensitive. The input order is preserved.
    """
    exclusion_markers = ['FR', 'SP', 'RU', 'JP', 'ES', 'PT', 'é', r'(\d)', 'jpn', 'Portugese', 'MX', 'Spanish', 'JA', 'GHI', 'ST']
    # One alternation regex so each name is scanned a single time.
    marker_regex = re.compile('|'.join(exclusion_markers))

    kept_files = []
    for name in files:
        if marker_regex.search(name) is None:
            kept_files.append(name)
    return kept_files

class Document:
    """A piece of guided content loaded from a text file.

    Attributes set by the constructor:
        index: the position passed in by the caller.
        word_file_path: path of the file the content was read from.
        guided_content_type: free-form label given by the caller.
        title: file name without its directory and extension.
        content: list of the file's non-blank lines, whitespace-stripped.
    """

    def __init__(self, i: int, word_file_path: str, guided_content_type: str):
        self.index = i
        self.word_file_path = word_file_path
        self.guided_content_type = guided_content_type
        # Title is the bare file name with the last extension removed.
        file_name = os.path.basename(word_file_path)
        self.title, _ = os.path.splitext(file_name)
        self.content = self._read_word_document()

    def _read_word_document(self):
        """Read the file and return its non-empty lines, stripped."""
        with open(self.word_file_path, 'r') as handle:
            stripped_lines = (raw_line.strip() for raw_line in handle)
            # Materialise the list while the file is still open.
            return [text for text in stripped_lines if text != '']
    

In [None]:
def _load_documents(directory: str, guided_content_type: str):
    """Load every English-looking file of `directory` as a Document.

    `os.listdir` returns entries in arbitrary order, so the listing is sorted
    first: document indices then stay identical from one run (or machine) to
    the next, which matters because later cells address documents by index.
    """
    file_names = filter_EN_content(sorted(os.listdir(directory)))
    return [
        Document(i, os.path.join(directory, file_name), guided_content_type)
        for i, file_name in enumerate(file_names)
    ]


# NOTE(review): both directories are machine-specific absolute paths.
# --- read meditations files
meditations_dir = "/Users/emulie/Data/Meditations_CLEAN"
meditations_docs = _load_documents(meditations_dir, 'meditations')
# --- read sleeptales files
sleeptales_dir = "/Users/emulie/Data/SleepTales_CLEAN"
sleeptales_docs = _load_documents(sleeptales_dir, 'sleeptales')

In [None]:
from nltk.corpus import stopwords
import gensim

# define corpus
def read_corpus(i, document: "Document"):
    """Turn one Document into a gensim TaggedDocument tagged with `i`.

    Args:
        i: tag stored in the TaggedDocument (callers pass the document's
            position in the training list).
        document: object whose `content` attribute is a list of text lines.

    Returns:
        gensim.models.doc2vec.TaggedDocument holding the tokens of all lines.
    """
    tokens = []
    for line in document.content:
        # remove stopwords
        # The stop-word list is lowercase while the raw words are not yet
        # lowercased here, so compare on word.lower(); otherwise "The", "And",
        # ... survive and are lowercased into the corpus by simple_preprocess.
        # NOTE(review): isalnum() also discards any word with punctuation
        # attached (e.g. "sleep,"); kept as-is, confirm this is intended.
        kept_words = [
            word for word in line.split()
            if word.isalnum() and word.lower() not in stop_words
        ]
        tokens += gensim.utils.simple_preprocess(' '.join(kept_words))

    return gensim.models.doc2vec.TaggedDocument(tokens, [i])

# English stop words; read_corpus looks this name up at call time.
stop_words = set(stopwords.words('english'))
# Training set: sleep tales first, then meditations. Each document's tag is
# its position in this list.
train_documents = [*sleeptales_docs, *meditations_docs]
train_corpus = [read_corpus(doc_tag, document) for doc_tag, document in enumerate(train_documents)]

In [None]:
print(f"Number of contents: {len(train_documents)}")

# compute average number of words per document
# str.split() without an argument splits on any run of whitespace, so repeated
# spaces or tabs inside a line no longer produce empty or merged "words", and
# a document with no lines counts 0 words instead of 1.
docs_content_length = [sum(len(line.split()) for line in doc.content) for doc in train_documents]
print(f"Average Words per content: {np.mean(docs_content_length):.2f}")

In [None]:
# Titles in the same order as train_documents.
titles = [document.title for document in train_documents]

In [None]:
# Preview the first five titles.
titles[:5]

### Import the sounds data

In [None]:
# CSV export of the ambient-sounds table.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
SOUNDS_PATH = "/Users/emulie/Downloads/sounds_db/Ambient Sounds 3c130430a27048578933d92bfca60113_all.csv"
df_sounds = pd.read_csv(SOUNDS_PATH)

In [None]:
# Preview the first rows of the sounds table.
df_sounds.head()

In [None]:
# Number of distinct values in the 'Unique ID' column.
print(f"Num sounds: {len(df_sounds['Unique ID'].unique())}")

### Train the model

In [None]:
# --- train the model

def get_doc2vec_model(train_corpus, hyperparams: dict):
    """Build and train a gensim Doc2Vec model on `train_corpus`.

    Args:
        train_corpus: the tagged documents used both to build the vocabulary
            and to train.
        hyperparams: must contain the keys 'vector_size', 'min_count',
            'train_epochs', 'window_size' and 'dm'.

    Returns:
        The trained Doc2Vec model.
    """
    # Translate the notebook's hyper-parameter names to Doc2Vec keyword names.
    doc2vec_kwargs = {
        'vector_size': hyperparams['vector_size'],
        'min_count': hyperparams['min_count'],
        'epochs': hyperparams['train_epochs'],
        'window': hyperparams['window_size'],
        'dm': hyperparams['dm'],
    }
    doc2vec = gensim.models.doc2vec.Doc2Vec(**doc2vec_kwargs)
    doc2vec.build_vocab(train_corpus)
    doc2vec.train(
        train_corpus,
        total_examples=doc2vec.corpus_count,
        epochs=doc2vec.epochs,
    )
    return doc2vec

In [None]:
# --- model assessment
def get_self_similarity_score(model, train_corpus):
    """Percentage of training documents that are their own nearest neighbour.

    Each training document is re-inferred with `model.infer_vector`, all
    document vectors of the model are ranked by similarity to that inferred
    vector, and the document scores a hit when its own tag comes first.

    Side effect: the inferred vector is stored on
    `train_documents[doc_id].vector` (module-level list); later cells read
    that attribute.

    Args:
        model: trained Doc2Vec model exposing `infer_vector` and `dv`.
        train_corpus: tagged documents; the document at position k is
            expected to carry tag k.

    Returns:
        int: rounded percentage of self-matching documents, or 0 when
        `train_corpus` is empty (previously a ZeroDivisionError).
    """
    ranks = []
    for doc_id in range(len(train_corpus)):
        inferred_vector = model.infer_vector(train_corpus[doc_id].words)
        train_documents[doc_id].vector = inferred_vector  # add vectorization to document
        sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
        # Position of the document's own tag in the similarity ranking.
        ranks.append([docid for docid, _ in sims].index(doc_id))
        # The former `second_ranks.append(sims[1])` was never used and raised
        # IndexError for a single-document model, so it has been removed.

    if not ranks:
        return 0

    return round(ranks.count(0) / len(ranks) * 100)

In [None]:
# Doc2Vec hyper-parameters; the active keys are the ones get_doc2vec_model reads.
hyperparams = dict(
    vector_size=300,
    window_size=15,
    min_count=5,
    # sampling_threshold=1e-5,
    # negative_size=5,
    train_epochs=100,
    dm=0,  # 0 = dbow; 1 = dmpv
    # worker_count=1  # number of parallel processes
)
model = get_doc2vec_model(train_corpus, hyperparams)

In [None]:
# Besides returning the score, this call stores an inferred vector on every
# entry of train_documents (attribute `.vector`), which the cells below use.
self_similarity_score = get_self_similarity_score(model, train_corpus)

### Vectorize the sounds using the model

In [None]:
# --- vectorized sounds (sound_id -> inferred vector)
# infer_vector expects a list of word tokens. Passing the whole name as one
# un-normalised token (e.g. "Ocean Waves") almost never matches the lowercase
# single-word vocabulary built from the training corpus, so the name is
# tokenised with the same simple_preprocess used in read_corpus.
dct_vectorized_sound = {
    sound_id: model.infer_vector(gensim.utils.simple_preprocess(str(sound_name)))
    for sound_id, sound_name in zip(df_sounds['Unique ID'], df_sounds['Name'])
}


### Find the closest sounds to a given guided content


In [None]:
def get_closest_sounds_from_content(content_idx: int, n_closest: int = 5):
    """Return the sounds whose vectors are closest to a guided content.

    The original version computed the distances and then returned nothing;
    this one ranks them and returns the nearest entries.

    Args:
        content_idx: position of the guided content in `train_documents`; its
            `.vector` attribute must already be set.
        n_closest: maximum number of sounds to return (default 5).

    Returns:
        list of `(sound_id, distance)` tuples, closest first, where
        `sound_id` is a key of `dct_vectorized_sound` and `distance` is the
        Minkowski distance with SciPy's default order (p=2, i.e. Euclidean).
        Empty when there are no vectorized sounds.
    """
    # --- compute distance for each sounds
    guided_vec = train_documents[content_idx].vector
    sound_ids = list(dct_vectorized_sound.keys())
    distances = np.array([minkowski(dct_vectorized_sound[sound_id], guided_vec) for sound_id in sound_ids])

    # --- sort and get the closest
    closest_positions = np.argsort(distances)[:max(n_closest, 0)]
    return [(sound_ids[pos], float(distances[pos])) for pos in closest_positions]
    

In [None]:
# Inline distance computation for one example document.
content_idx = 19
guided_vec = train_documents[content_idx].vector
distances = np.array([
    minkowski(vec, guided_vec)
    for vec in dct_vectorized_sound.values()
])

In [None]:
num_sounds = len(df_sounds)
# Positions into `distances`, ordered from closest to farthest.
# The previous np.argpartition(distances, num_sounds-1)[-num_sounds:] returned
# every position in an essentially arbitrary order (only the farthest one was
# guaranteed to be last) and raised when len(df_sounds) != len(distances).
indexes = np.argsort(distances)

In [None]:
# Display the raw distance values (one per vectorized sound).
distances

Since all the distances are about the same, this text-embedding approach does not discriminate between sounds and is therefore not a good method.