In [None]:
import torch

# Report whether PyTorch can see a CUDA device on this runtime.
print(f"GPU is available: {torch.cuda.is_available()}")


Embeddings for policy articles

In [None]:
pip install bertopic

In [None]:
from bertopic import BERTopic
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# English stop-word list from NLTK, passed to the vectorizer below.
stop_words = stopwords.words("english")

# Count-based vectorizer over unigrams and bigrams with stop words removed.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=stop_words,
)

# Load the policy-article JSON file with pandas.
data = pd.read_json('/content/ecolex_filtered_articles_with_dates.json')

In [None]:
# Flatten the nested fields of the 'articles' column into top-level columns
articles_flattened = pd.json_normalize(data["articles"])

# Show the flattened column names so the schema can be checked by eye
print(articles_flattened.columns)

# Fail fast when the field used for topic modelling is absent
if 'abstract' not in articles_flattened.columns:
    raise KeyError("The 'abstract' column is missing in the articles data.")

# Keep only the rows that have an abstract. The explicit .copy() makes the
# result an independent DataFrame, so the columns added in later cells are
# written to it directly instead of to a slice of `articles_flattened`
# (which makes pandas emit SettingWithCopyWarning).
articles_with_abstracts = articles_flattened[articles_flattened["abstract"].notna()].copy()

In [None]:
# Sentence-embedding model; used by BERTopic and again by the explicit
# encode() call further down.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Topic model wired to that embedding model and the n-gram vectorizer.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
)

In [None]:
# Fit the topic model on the policy abstracts; the call returns the topic
# assignments together with their probabilities.
policy_abstracts = articles_with_abstracts['abstract'].tolist()
topics, probabilities = topic_model.fit_transform(policy_abstracts)

In [None]:
# Attach the topic assignment and probability to every article. assign()
# builds a new DataFrame, so this is safe even when `articles_with_abstracts`
# is a filtered slice of another frame (no SettingWithCopyWarning).
articles_with_abstracts = articles_with_abstracts.assign(
    topic=topics,
    probability=probabilities,
)

# Topic id -> topic name lookup table. rename() returns a new frame rather
# than mutating the one returned by get_topic_info() in place.
topic_info = topic_model.get_topic_info().rename(columns={'Topic': 'topic'})

# Drop a 'Name' column left behind by an earlier run of this cell; otherwise a
# second merge would produce 'Name_x'/'Name_y' instead of a single 'Name'.
# NOTE(review): assumes the article data has no 'Name' column of its own.
articles_with_abstracts = (
    articles_with_abstracts
    .drop(columns='Name', errors='ignore')
    .merge(topic_info[['topic', 'Name']], on='topic', how='left')
)

In [None]:
# Generate embeddings for the abstracts and add them to the DataFrame.
# .tolist() turns the encoded array into nested Python lists, one per row.
embeddings = embedding_model.encode(articles_with_abstracts['abstract'].tolist(), show_progress_bar=True)
articles_with_abstracts['embedding'] = embeddings.tolist()

# Save the DataFrame to a JSON file, one object per row
policy_output_path = 'ecolex_filtered_bertopic_with_embeddings.json'
articles_with_abstracts.to_json(policy_output_path, orient='records', indent=4)

# Trigger a browser download when running in Google Colab. On any other
# runtime the import is unavailable, so report where the file was written
# instead of ending the cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; results saved to {policy_output_path}")
else:
    files.download(policy_output_path)


Embeddings for scientific articles

In [None]:
from bertopic import BERTopic
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

In [None]:
# Set up stop words
stop_words = stopwords.words("english")

# Configure the vectorizer: unigrams and bigrams, stop words removed
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2))

# Load the scientific articles and keep only rows that have an abstract.
# This mirrors the null filter applied to the policy articles: a missing
# abstract is not a string and cannot be passed to the embedding/topic models.
# dropna() is a no-op when every row already has an abstract, and raises
# KeyError if the 'abstract' column does not exist.
cdf_subs = (
    pd.read_json('/content/conservation_filtered.json')
    .dropna(subset=['abstract'])
)


In [None]:
# Sentence-embedding model; used by BERTopic and again by the explicit
# encode() call further down.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Topic model wired to that embedding model and the n-gram vectorizer.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
)

In [None]:
# Fit the topic model on the scientific abstracts; the call returns the topic
# assignments together with their probabilities.
science_abstracts = cdf_subs['abstract'].tolist()
topics, probabilities = topic_model.fit_transform(science_abstracts)

In [None]:
# Store the topic assignment and its probability alongside each article
cdf_subs = cdf_subs.assign(topic=topics, probability=probabilities)

In [None]:
# Topic id -> topic name lookup table. rename() returns a new frame rather
# than mutating the one returned by get_topic_info() in place.
topic_info = topic_model.get_topic_info().rename(columns={'Topic': 'topic'})

# Drop a 'Name' column left behind by an earlier run of this cell; otherwise a
# second merge would produce 'Name_x'/'Name_y' instead of a single 'Name'.
# NOTE(review): assumes the article data has no 'Name' column of its own.
cdf_subs = (
    cdf_subs
    .drop(columns='Name', errors='ignore')
    .merge(topic_info[['topic', 'Name']], on='topic', how='left')
)

In [None]:
# Generate embeddings for the abstracts and add them to the DataFrame.
# .tolist() turns the encoded array into nested Python lists, one per row.
embeddings = embedding_model.encode(cdf_subs['abstract'].tolist(), show_progress_bar=True)
cdf_subs['embedding'] = embeddings.tolist()

# Save the DataFrame to a JSON file, one object per row
science_output_path = 'conservation_filtered_bertopic_with_embeddings.json'
cdf_subs.to_json(science_output_path, orient='records', indent=4)

# Trigger a browser download when running in Google Colab. On any other
# runtime the import is unavailable, so report where the file was written
# instead of ending the cell with an ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; results saved to {science_output_path}")
else:
    files.download(science_output_path)