In [None]:
import pandas as pd
import numpy as np
import gensim 
import spacy
import nltk
import mlflow

import os
from datetime import datetime
import collections

In [None]:
!pwd

### Import data

In [None]:
import re

def filter_EN_content(files: [str]):
    """Keep only the file names that carry none of the non-English markers.

    A name is rejected as soon as one marker is found anywhere in it. The
    match is case-sensitive. Note that ``(\\d)`` is a regex group, so it
    matches any digit in the name, not a literal parenthesised digit.

    Args:
        files: File names to screen.

    Returns:
        A new list with the accepted names, in their original order.
    """
    markers = ['FR', 'SP', 'RU', 'JP', 'ES', 'PT', 'é', r'(\d)', 'jpn']
    non_english = re.compile('|'.join(markers))
    return list(filter(lambda name: non_english.search(name) is None, files))

In [None]:
class Document:
    """A guided-content transcript loaded from a text file.

    Attributes:
        index: Position of the document within its content-type collection.
        word_file_path: Path of the transcript file on disk.
        guided_content_type: Content category label (e.g. 'meditations').
        encoding: Text encoding used to read the file (None = platform default).
        title: File name without directory or extension.
        content: Non-empty, whitespace-stripped lines of the file.
        vector: Embedding of the document; None until it is assigned by the
            model-assessment step.
    """

    def __init__(self, i: int, word_file_path: str, guided_content_type: str,
                 encoding: str = None):
        """Store the metadata and read the file content immediately.

        Args:
            i: Index of the document.
            word_file_path: Path of the text file to read.
            guided_content_type: Content category label.
            encoding: Optional text encoding passed to ``open``; the default
                ``None`` keeps the platform default encoding.
        """
        self.index = i
        self.word_file_path = word_file_path
        self.guided_content_type = guided_content_type
        self.encoding = encoding
        self.title = os.path.splitext(os.path.basename(word_file_path))[0]
        # Declared here so the attribute always exists, even before an
        # embedding has been attached to the document.
        self.vector = None
        self.content = self._read_word_document()

    def _read_word_document(self):
        """Return the file's lines, stripped, with blank lines removed."""
        with open(self.word_file_path, 'r', encoding=self.encoding) as f:
            # Iterate the handle directly so the file is streamed line by line.
            stripped_lines = (line.strip() for line in f)
            return [line for line in stripped_lines if line]
    

In [None]:
# --- read meditations files
meditations_dir = "/Users/emulie/Data/Meditations_CLEAN"
# os.listdir returns entries in arbitrary order; sorting the listing keeps the
# document indices identical from one run (or machine) to the next.
meditations_docs = [
    Document(i, os.path.join(meditations_dir, word_file), 'meditations')
    for i, word_file in enumerate(filter_EN_content(sorted(os.listdir(meditations_dir))))
]

In [None]:
# --- read sleeptales files
sleeptales_dir = "/Users/emulie/Data/SleepTales_CLEAN"
# os.listdir returns entries in arbitrary order; sorting the listing keeps the
# document indices identical from one run (or machine) to the next.
sleeptales_docs = [
    Document(i, os.path.join(sleeptales_dir, word_file), 'sleeptales')
    for i, word_file in enumerate(filter_EN_content(sorted(os.listdir(sleeptales_dir))))
]

In [None]:
from nltk.corpus import stopwords

# define corpus
def read_corpus(i, document: "Document"):
    """Turn one document into a gensim ``TaggedDocument`` for Doc2Vec training.

    Each line of ``document.content`` is split on whitespace. Words that are
    not purely alphanumeric, or that are stop words, are dropped; what is left
    is tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        i: Tag attached to the resulting tagged document.
        document: Object exposing ``content``, a list of text lines.

    Returns:
        A ``TaggedDocument`` holding the tokens of all lines, tagged ``[i]``.
    """
    tokens = []
    for line in document.content:
        # Compare in lower case: otherwise capitalised stop words ("The",
        # "And") slip through here and end up in the corpus once the
        # tokenizer lowercases them.
        kept_words = [
            word for word in line.split()
            if word.isalnum() and word.lower() not in stop_words
        ]
        tokens += gensim.utils.simple_preprocess(' '.join(kept_words))

    return gensim.models.doc2vec.TaggedDocument(tokens, [i])

# English stop words consulted by read_corpus when it filters each line.
stop_words = set(stopwords.words('english'))

# Training set: sleep tales first, then meditations; the position in this
# list is the tag given to each document in the corpus.
train_documents = [*sleeptales_docs, *meditations_docs]
train_corpus = [
    read_corpus(doc_tag, document)
    for doc_tag, document in enumerate(train_documents)
]

In [None]:
print(f"Number of contents: {len(train_documents)}")

# compute average number of words per document
# Words are counted per line with a whitespace split, so repeated spaces and
# tabs do not create phantom words and an empty document counts as 0 words.
docs_content_length = [sum(len(line.split()) for line in doc.content) for doc in train_documents]
print(f"Average Words per content: {np.mean(docs_content_length):.2f}")

### Importing MLFlow URI

In [None]:
import os
import configparser
import ssl

# NOTE(review): this turns off TLS certificate verification for every default
# HTTPS context created in this process. Presumably it is needed to reach the
# MLflow server behind an untrusted certificate -- confirm, and prefer
# installing the proper CA certificate instead.
ssl._create_default_https_context = ssl._create_unverified_context

# A notebook has no __file__, so config.ini is looked up in the parent of the
# current working directory.
# script_dir = os.path.dirname(os.path.abspath(__file__))
script_dir = os.path.dirname(os.path.abspath(""))
config_path = os.path.join(script_dir, "config.ini")
print(f"config_path: {config_path}")

# parse the configuration file; ConfigParser.read silently skips files it
# cannot open, so fail here with a clear message instead of a later KeyError
config = configparser.ConfigParser()
if not config.read(config_path):
    raise FileNotFoundError(f"MLflow config file not found or unreadable: {config_path}")

# define mlflow uri
mlflow_uri = config["mlflow-server"]["Url"]
TRACKING_URI = mlflow_uri

In [None]:
# Point the MLflow client at the tracking server URL read from config.ini.
mlflow.set_tracking_uri(TRACKING_URI)

In [None]:
# Fetch the experiment, creating it first when it does not exist yet. The
# lookup is repeated only after a creation, which saves one round trip to the
# tracking server when the experiment is already there.
experiment_name = 'GuidedContentEmbedding'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(name=experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)


### Training the model in MLFlow

Documentation:
- https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html
- https://tedboy.github.io/nlps/generated/generated/gensim.models.Doc2Vec.html

In [None]:
# --- train the model

def get_doc2vec_model(train_corpus, hyperparams: dict):
    """Build and train a Doc2Vec model on the given corpus.

    Args:
        train_corpus: Tagged documents used both to build the vocabulary and
            to train the model.
        hyperparams: Mapping that must contain the keys 'vector_size',
            'min_count', 'train_epochs', 'window_size' and 'dm'.

    Returns:
        The trained Doc2Vec model.
    """
    # Translate the notebook's hyper-parameter names into Doc2Vec keywords.
    doc2vec_kwargs = {
        'vector_size': hyperparams['vector_size'],
        'min_count': hyperparams['min_count'],
        'epochs': hyperparams['train_epochs'],
        'window': hyperparams['window_size'],
        'dm': hyperparams['dm'],
    }
    doc2vec = gensim.models.doc2vec.Doc2Vec(**doc2vec_kwargs)
    doc2vec.build_vocab(train_corpus)
    doc2vec.train(
        train_corpus,
        total_examples=doc2vec.corpus_count,
        epochs=doc2vec.epochs,
    )
    return doc2vec

In [None]:
# --- model assessment
def get_self_similarity_score(model, train_corpus):
    """Percentage of documents whose inferred vector is closest to themselves.

    For every training document a vector is re-inferred from its tokens and
    compared with all document vectors of the model; the document counts as a
    hit when its own tag is ranked first.

    Side effect: the inferred vector is stored on the matching entry of the
    module-level ``train_documents`` list as ``.vector``.

    Args:
        model: Trained Doc2Vec model (provides ``infer_vector`` and ``dv``).
        train_corpus: Tagged documents, where position ``k`` carries tag ``k``.

    Returns:
        The hit rate as a rounded integer percentage; 0 for an empty corpus.
    """
    if len(train_corpus) == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0

    self_hits = 0
    for doc_id, tagged_doc in enumerate(train_corpus):
        inferred_vector = model.infer_vector(tagged_doc.words)
        train_documents[doc_id].vector = inferred_vector  # add vectorization to document
        sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
        ranked_ids = [docid for docid, _ in sims]
        if ranked_ids.index(doc_id) == 0:
            self_hits += 1

    return round(self_hits / len(train_corpus) * 100)

In [None]:
# --- visualize document embedding
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.io as pio
# Use plotly's 'iframe' renderer as the default for figures shown in this notebook.
pio.renderers.default = 'iframe'


def plot_embedding():
    """Build a 2-D t-SNE scatter plot of the trained document vectors.

    Reads the module-level ``model``, ``train_corpus`` and ``train_documents``.
    Points are coloured by content type and show the title on hover.

    Returns:
        The plotly figure (it is not displayed here).
    """
    # One trained vector per corpus position, plus the labels for each point.
    doc_count = len(train_corpus)
    embeddings = np.array([model.dv[doc_idx] for doc_idx in range(doc_count)])
    point_titles = [document.title for document in train_documents]
    point_types = [document.guided_content_type for document in train_documents]

    # Project the vectors to two dimensions with a fixed random state.
    reducer = TSNE(n_components=2, random_state=42)
    coords = reducer.fit_transform(embeddings)

    # Tabular form expected by plotly express.
    frame = pd.DataFrame({
        'X': coords[:, 0],
        'Y': coords[:, 1],
        'Title': point_titles,
        'Content Type': point_types,
    })

    return px.scatter(
        frame,
        x='X',
        y='Y',
        hover_data=['Title', 'Content Type'],
        color='Content Type',
        title='t-sne visualisation of document embeddings',
    )

In [None]:
# --- Embedding for Meditations & SleepTales

hyperparams = {
    "vector_size": 300,
    "window_size": 15,
    "min_count": 5,
    # "sampling_threshold": 1e-5,
    # "negative_size": 5,
    "train_epochs": 100,
    "dm": 0,  # 0 = dbow; 1 = dmpv
    # "worker_count": 1 # number of parallel processes
}
# Take the timestamp once so the run name and the data_date tag always agree,
# even when the cell runs across a minute boundary.
run_timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M')
run_name = f"Meditation&SleepTales_Doc2VecEmbedding_{run_timestamp}"
tags = {
    "env": "test",
    "data_date": run_timestamp,
    "model_type": "Doc2Vec",
    # "experiment_description": "Doc2Vec Embedding for Meditations content",
    "experiment_description": "Doc2Vec Embedding for Meditations & SleepTales content",
}

# Train, score and plot; the results are logged to MLflow in a later cell.
model = get_doc2vec_model(train_corpus, hyperparams)
model_metrics = {
    "self_similarity_score": get_self_similarity_score(model, train_corpus),
}
fig = plot_embedding()

In [None]:
with mlflow.start_run(experiment_id=experiment.experiment_id, run_name=run_name, tags=tags):
    # logging parameters and metrics, each dict in one batched call
    mlflow.log_params(hyperparams)
    mlflow.log_metrics(model_metrics)

    # logging plots: export the figure to HTML in the working directory, then
    # attach that file to the run under the "plot" artifact folder
    fig.write_html("embedding.html")
    mlflow.log_artifact("embedding.html", artifact_path="plot")

In [None]:
# Compare the number of document vectors held by the model with the corpus size.
print(len(model.dv))
print(len(train_corpus))

In [None]:
# --- get vector for each guided content
# Infer an embedding for the first training document from its tokens.
v = model.infer_vector(train_corpus[0].words)

In [None]:
# Ten document vectors most similar to the inferred vector `v`.
model.dv.most_similar(v, topn=10)

### Use the model for user predictions

We average the vectors of all the guided content a user liked or listened to into a single user vector, then look up the items closest to that vector.

In [None]:
def get_user_vector(liked_content_indexes: [int]):
    """Average the stored embeddings of the liked documents into one vector.

    Reads the ``.vector`` attribute of the selected entries of the
    module-level ``train_documents`` list.

    Args:
        liked_content_indexes: Positions in ``train_documents`` of the content
            the user liked.

    Returns:
        The element-wise mean of the selected vectors.
    """
    liked_vectors = []
    for doc_index in liked_content_indexes:
        liked_vectors.append(train_documents[doc_index].vector)
    return np.mean(np.array(liked_vectors), axis=0)

def get_recommendations(user_vector: np.array):
    """List the ten documents whose vectors are most similar to the user vector.

    Uses the module-level ``model`` for the similarity search and
    ``train_documents`` to turn each returned document tag into a title.

    Args:
        user_vector: Vector to compare against the model's document vectors.

    Returns:
        A list of ``[title, similarity]`` pairs in the order returned by the
        similarity search.
    """
    nearest = model.dv.most_similar([user_vector], topn=10)
    return [[train_documents[doc_tag].title, score] for doc_tag, score in nearest]

    
# Simulate a user who liked 5 randomly chosen documents (drawn with
# replacement). A seeded generator keeps the simulated user, and therefore the
# recommendations, identical from run to run.
rng = np.random.default_rng(42)
liked_content_indexes = [int(idx) for idx in rng.integers(0, len(train_documents), size=5)]
user_vector = get_user_vector(liked_content_indexes)
recommendations = get_recommendations(user_vector)

### Use the model for single guided-content recommendations

### Read tags dataframe

In [None]:
# Load the content metadata table; later cells read its 'type', 'tags',
# 'author', 'narratorName' and 'narratorGender' columns.
# NOTE(review): absolute local path -- will not resolve on another machine.
df_tags = pd.read_csv("/Users/emulie/Data/content-en.csv")

In [None]:
# Preview the first rows of the metadata table.
df_tags.head()

In [None]:
# Compare the corpus size with the number of metadata rows before filtering.
print(len(train_corpus))
print(len(df_tags))

In [None]:
# --- filter for guided content only
is_guided_content_mask = df_tags['type'] != 'music'
df_tags = df_tags[is_guided_content_mask]

In [None]:
# Compare the corpus size with the number of metadata rows left after filtering.
print(len(train_corpus))
print(len(df_tags))

In [None]:
# Number of rows per author.
df_tags['author'].value_counts()

In [None]:
# Number of rows per narrator name.
df_tags['narratorName'].value_counts()

In [None]:
# Number of rows per narrator gender.
df_tags['narratorGender'].value_counts()

In [None]:
# -- get all unique tags
unique_tags = set([tag.strip() for tag in ','.join([tag for tag in list(df_tags['tags']) if str(tag) != 'nan']).split(',')])

In [None]:
# Column index <-> tag lookups. Tags are sorted first because set iteration
# order can change between interpreter runs, which would shuffle the columns
# of the tag matrix; the reverse map is derived from the forward one so the
# two can never disagree.
dct_i_to_tags = {i: tag for i, tag in enumerate(sorted(unique_tags))}
dct_tags_to_i = {tag: i for i, tag in dct_i_to_tags.items()}

In [None]:
# --- create tag matrix from df_tags
# One row per df_tags row (in positional order), one column per known tag.
matrix_tags = np.zeros((df_tags.shape[0], len(dct_i_to_tags)))
# enumerate gives the row *position*; the index labels from iterrows() are not
# usable as positions once df_tags has been filtered without a reset_index.
for row_pos, raw_tags in enumerate(df_tags['tags']):
    if str(raw_tags) == 'nan':
        continue

    # Split on ',' and strip, exactly as the unique tag set is built, so every
    # tag is found in dct_tags_to_i whatever the spacing after the commas.
    # The local is not called `tags`, which would overwrite the MLflow tags dict.
    for tag in raw_tags.split(','):
        matrix_tags[row_pos, dct_tags_to_i[tag.strip()]] += 1

In [None]:
# --- count number of content for each tag
df_count = pd.DataFrame(matrix_tags.sum(axis=0))
df_count.index = list(dct_tags_to_i.keys())

In [None]:
# Display the per-tag content counts.
df_count

### Deeper Model Evaluation using tags

In [None]:
# Titles of all training documents, in corpus order.
titles = [doc.title for doc in train_documents]

In [None]:
# Display the full list of document titles.
titles