In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [None]:
def average_word_embeddings(df, column, word_embeddings):
    """Average the embeddings of every in-vocabulary token in ``df[column]``.

    Parameters
    ----------
    df : mapping-like (DataFrame, Series/row, dict)
        Anything indexable by ``column``. ``df[column]`` may be an iterable of
        document strings or a single document string.
    column : hashable
        Key of the text field.
    word_embeddings : keyed-vector-like
        Must support ``word in word_embeddings``, ``word_embeddings[word]`` and
        expose ``vector_size``.

    Returns
    -------
    numpy.ndarray
        One vector: the mean over all matched tokens of all documents, or a
        zero vector of length ``vector_size`` when no token matched.
    """
    documents = df[column]
    # A single row yields one string; iterating a string would walk its
    # characters instead of its documents, so wrap it.
    if isinstance(documents, str):
        documents = [documents]

    embeddings = [
        word_embeddings[word]
        for document in documents
        for word in document.split()
        if word in word_embeddings
    ]
    if embeddings:
        return np.mean(embeddings, axis=0)
    # np.zeros_like(<int>) would produce a 0-d scalar, not a vector.
    return np.zeros(word_embeddings.vector_size)

In [None]:
def tfidf_weighted_embeddings(df, column, word_embeddings):
    """Build one TF-IDF-weighted mean embedding per document in ``df[column]``.

    A ``TfidfVectorizer`` with default settings is fitted on all documents.
    Each whitespace token that is present in ``word_embeddings`` and has a
    TF-IDF entry contributes ``embedding * tfidf_weight``; the contributions
    of a document are averaged.

    Parameters
    ----------
    df : mapping-like (DataFrame, dict, ...)
        Anything indexable by ``column``. ``df[column]`` may be an iterable of
        document strings or a single document string (treated as a
        one-document corpus).
    column : hashable
        Key of the text field.
    word_embeddings : keyed-vector-like
        Must support ``in``, ``[]`` and expose ``vector_size``.

    Returns
    -------
    list of numpy.ndarray
        One vector per document, in input order. Documents without any usable
        token get a zero vector of length ``vector_size``.
    """
    documents = df[column]
    if isinstance(documents, str):
        # fit_transform rejects a bare string; treat it as a single document.
        documents = [documents]
    else:
        documents = list(documents)

    vectorizer = TfidfVectorizer()
    tfidf_vectors = vectorizer.fit_transform(documents)
    # term -> column index in the TF-IDF matrix. The matrix columns follow the
    # vocabulary, NOT the token positions inside a document.
    vocabulary = vectorizer.vocabulary_

    weighted_embeddings = []
    for i, document in enumerate(documents):
        tfidf_values = tfidf_vectors[i].toarray()[0]
        embeddings = []
        for word in document.split():
            if word not in word_embeddings:
                continue
            # The default vectorizer lowercases its input, so look the term up
            # in lowercase. Tokens its tokenizer dropped have no weight and
            # are skipped.
            term_index = vocabulary.get(word.lower())
            if term_index is None:
                continue
            embeddings.append(word_embeddings[word] * tfidf_values[term_index])
        if embeddings:
            document_embedding = np.mean(embeddings, axis=0)
        else:
            # np.zeros_like(<int>) would produce a 0-d scalar, not a vector.
            document_embedding = np.zeros(word_embeddings.vector_size)
        weighted_embeddings.append(document_embedding)

    return weighted_embeddings

In [None]:
def weighted_word_averaging(df, column, word_weights, word_embeddings):
    """Average explicitly weighted word embeddings over ``df[column]``.

    Every token that is present in ``word_embeddings`` contributes
    ``embedding * word_weights[i][j]``, where ``i`` is the positional index of
    the document and ``j`` the position of the token in ``document.split()``.
    The contributions are averaged with a plain mean, i.e. divided by the
    number of matched tokens, not by the sum of the weights.

    Parameters
    ----------
    df : mapping-like (DataFrame, Series/row, dict)
        Anything indexable by ``column``. ``df[column]`` may be an iterable of
        document strings or a single document string; a single string is
        treated as a one-document collection, so ``word_weights[0]`` applies.
    column : hashable
        Key of the text field.
    word_weights : sequence of sequences of numbers
        ``word_weights[i][j]`` is the weight of token ``j`` in document ``i``.
    word_embeddings : keyed-vector-like
        Must support ``in``, ``[]`` and expose ``vector_size``.

    Returns
    -------
    numpy.ndarray
        One vector for the whole input, or a zero vector of length
        ``vector_size`` when no token matched.
    """
    documents = df[column]
    # Iterating a bare string would walk characters instead of documents.
    if isinstance(documents, str):
        documents = [documents]

    embeddings = []
    for i, document in enumerate(documents):
        for j, word in enumerate(document.split()):
            if word in word_embeddings:
                embeddings.append(word_embeddings[word] * word_weights[i][j])

    if embeddings:
        return np.mean(embeddings, axis=0)
    # np.zeros_like(<int>) would produce a 0-d scalar, not a vector.
    return np.zeros(word_embeddings.vector_size)

In [None]:
def train_doc2vec(df, column, vector_size=100, window=5, min_count=1, epochs=10, **doc2vec_kwargs):
    """Train a gensim ``Doc2Vec`` model on the documents in ``df[column]``.

    Each document is split on whitespace and tagged with its positional index
    in ``df[column]``.

    Parameters
    ----------
    df : mapping-like (DataFrame, dict, ...)
        ``df[column]`` must be an iterable of document strings.
    column : hashable
        Key of the text field.
    vector_size, window, min_count, epochs : int, optional
        Forwarded to ``Doc2Vec``; the defaults are the values that used to be
        hard-coded here.
    **doc2vec_kwargs
        Any further ``Doc2Vec`` keyword arguments, passed through unchanged.

    Returns
    -------
    gensim.models.Doc2Vec
        The trained model.
    """
    tagged_documents = [
        TaggedDocument(words=document.split(), tags=[i])
        for i, document in enumerate(df[column])
    ]
    return Doc2Vec(
        tagged_documents,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
        **doc2vec_kwargs,
    )

In [None]:
word2vec_model = Word2Vec.load('path_to_word2vec_model')
word_vectors = word2vec_model.wv

df = pd.read_csv('your_data.csv')
row_positions = range(len(df))

# The averaging helpers reduce every document they are given to ONE vector, so
# hand them a one-row frame per document. Passing a row Series would make
# them iterate over the characters of the text.
df['average_embeddings'] = [
    average_word_embeddings(df.iloc[[pos]], 'text', word_vectors)
    for pos in row_positions
]

# TF-IDF statistics need the whole corpus: fit once, get one vector per row.
df['tfidf_weighted_embeddings'] = tfidf_weighted_embeddings(df, 'text', word_vectors)

# NOTE: `your_word_weights` must be defined before this cell runs, as
# your_word_weights[row][token_position] -> weight. Inside a one-row frame the
# document index is always 0, hence the one-element list wrapper.
df['weighted_average'] = [
    weighted_word_averaging(df.iloc[[pos]], 'text', [your_word_weights[pos]], word_vectors)
    for pos in row_positions
]

# Train Doc2Vec once on all documents instead of once per row, and give
# infer_vector a list of tokens rather than a raw string.
doc2vec_model = train_doc2vec(df, 'text')
df['doc2vec_embeddings'] = [
    doc2vec_model.infer_vector(text.split()) for text in df['text']
]

# NOTE: CSV stores each embedding array as its string representation.
df.to_csv('phamacovigil_output_data.csv', index=False)