In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_distances
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


In [None]:


# Step 1: Read each model's embedding table from its CSV export.
# `files` and `labels` are parallel lists: labels[k] is the display name
# used for the table loaded from files[k].
files = ['BGE-path.csv', 'glove-path.csv', 'cbow-our_model.csv']
labels = ['BGE', 'glove', 'cbow']

# One DataFrame per model, kept in the same order as `files`.
embeddings = [pd.read_csv(csv_path) for csv_path in files]

# Stack the per-model tables row-wise into a single dataframe.
combined_embeddings = pd.concat(embeddings, axis=0)

# Step 2: Candidate climate change-related vocabulary (loosely grouped by theme).
selected_words = [
    # core climate terms
    "climate", "change", "global", "warming", "environment", "carbon",
    "emissions", "temperature", "weather",
    # policy and response
    "policy", "regulation", "mitigation", "adaptation", "sustainability",
    "renewable",
    # societal impact
    "health", "disaster", "migration", "economy", "inequality", "justice",
    # science and technology
    "science", "technology", "innovation", "research", "data", "modeling",
    # public discourse
    "awareness", "denial", "activism", "education", "media", "protest",
    # energy sources
    "energy", "fossil", "fuels", "oil", "coal", "natural", "gas", "solar",
    "wind",
    # international agreements
    "agreement", "paris", "kyoto", "protocol", "un", "summit",
    "negotiations",
    # biodiversity
    "biodiversity", "species", "ecosystem", "habitat", "extinction",
    "deforestation",
]

# Keep only the candidates that occur in the `word` column of the combined
# table; the original ordering of the list is preserved.
words = set(combined_embeddings['word'])
selected_words = [candidate for candidate in selected_words if candidate in words]

# Pull the rows for the surviving words and arrange them in `selected_words`
# order. A word that occurs in more than one row of the combined table
# contributes one row per occurrence.
is_selected = combined_embeddings['word'].isin(selected_words)
df_filtered = combined_embeddings[is_selected]
word_embeddings = df_filtered.set_index('word').loc[selected_words].values

# Step 3: Fit a two-component PCA on the selected vectors.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(word_embeddings)

# Step 4: Visualize the results
#
# Each model's vectors are projected with the PCA fitted above and drawn in
# that model's colour. Words are resolved per model: a word missing from one
# model is skipped (and reported) for that model only. Previously a single
# missing word made `.loc[selected_words]` raise KeyError, which was swallowed
# and silently removed the whole model from the figure.
fig, ax = plt.subplots(figsize=(12, 10))

# Plot each word with a different color for each file
colors = ['green', 'blue', 'black']
for file, label, color, file_df in zip(files, labels, colors, embeddings):
    # Keep the first row per word so that projected rows and text labels stay
    # aligned one-to-one even if the file lists a word more than once.
    file_filtered = (
        file_df[file_df['word'].isin(selected_words)]
        .drop_duplicates(subset='word')
        .set_index('word')
    )

    # Follow the order of selected_words, restricted to this model's vocabulary.
    present_words = [word for word in selected_words if word in file_filtered.index]
    missing_words = [word for word in selected_words if word not in file_filtered.index]
    if missing_words:
        print(f"{label} ({file}): skipped {len(missing_words)} word(s) not in this model: {missing_words}")
    if not present_words:
        # Nothing to project for this model; transform() rejects an empty array.
        continue

    file_embeddings = file_filtered.loc[present_words].values

    # Project this model's vectors into the shared PCA space
    pca_result_file = pca.transform(file_embeddings)
    xs = pca_result_file[:, 0]
    ys = pca_result_file[:, 1]

    # One scatter call per model gives exactly one legend entry per model.
    ax.scatter(xs, ys, color=color, label=label, alpha=0.7)
    for word, x, y in zip(present_words, xs, ys):
        ax.text(x, y, word, fontsize=12, color=color)

ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('Climate Change-Related Words in Embeddings from Different Models')
ax.legend(loc='upper right')
ax.grid(True)
plt.show()
