In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from gensim.models import fasttext
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs
from gensim.models import TfidfModel
import gensim.corpora as corpora

In [3]:
# Locations of the pretrained fastText binary and of the fine-tuned model written below.
pretrained_model_path = '../../../models/wva/cc.de.300.bin'
fine_tuned_model_path = '../../../models/wva/04/fine-tuned-model-04-04.bin'

# Each entry of the 'Words' column is handed to gensim as one training sentence.
df = pd.read_pickle("df_04.pkl")
tokenized_corpus = df['Words'].tolist()

# Load the pretrained model, extend its vocabulary with the corpus tokens,
# then continue training on the corpus for the model's own epoch count.
model = fasttext.load_facebook_model(pretrained_model_path)
model.build_vocab(tokenized_corpus, update=True)
model.train(
    tokenized_corpus,
    total_examples=len(tokenized_corpus),
    epochs=model.epochs,
)

# Persist the result; a later cell reloads it from this path with FastText.load.
model.save(fine_tuned_model_path)

In [4]:
# Load the pickled DataFrame whose 'ProcessedText', 'Words' and 'Title' columns are used below.
# NOTE: unpickling can execute arbitrary code - only read pickle files from a trusted source.
df = pd.read_pickle("df_04.pkl")

In [1]:
from gensim.models import FastText

# Reload the model from the same path the fine-tuning cell saved it to.
model = FastText.load('../../../models/wva/04/fine-tuned-model-04-04.bin')

<h1>Model Info: Vocabulary size: 2001998</h1>

In [11]:
# Presumably one text string per row: gen_words() calls .split() on every entry.
data = df['ProcessedText']

def gen_words(texts):
    """Split every text on whitespace.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one list of words per input text, in input order.
    """
    return [text.split() for text in texts]

# One list of whitespace-separated words per entry of `data`.
data_words = gen_words(data)

In [12]:
# Lower-case every token of every document and keep them in one flat list ...
flat_list_of_words = [
    token.lower()
    for tokens in data_words
    for token in tokens
]
# ... then deduplicate to get the vocabulary used to filter model neighbours.
unique_words_in_your_data = set(flat_list_of_words)

<h2> 1. Finding the Most Similar Word and Associated Titles in a Dataset</h2>

In [84]:
def find_most_similar_filtered(word, model, your_vocab, topn=50):
    """Return the model's nearest neighbour of ``word`` that also occurs in ``your_vocab``.

    The lookup is done on the lower-cased word. If that word is itself in
    ``your_vocab``, a short notice and its self-similarity are printed first.

    Args:
        word: Query word.
        model: Object exposing ``wv.most_similar`` and ``wv.similarity``.
        your_vocab: Container of lower-cased words that a neighbour must belong to.
        topn: Number of nearest neighbours requested from the model.

    Returns:
        Always a 2-tuple, so callers can unpack the result unconditionally:
        ``(similar_word, similarity)`` on success, otherwise ``(message, None)``
        where ``message`` explains why nothing was found.
    """
    original_word = word
    word = word.lower()

    try:
        if word in your_vocab:
            print("Exact word found in vocabulary.")
            similarity_score = model.wv.similarity(word, word)
            print(f"Similarity score of the word with itself: {similarity_score}")
        all_similar_words = model.wv.most_similar(word, topn=topn)
    except KeyError:
        # Same wording as the sentinel the lookup cells compare against.
        return f"'{original_word}' is not in the vocabulary.", None

    # Neighbours arrive ordered by similarity; the first one in the vocabulary wins.
    for similar_word, similarity in all_similar_words:
        if similar_word.lower() in your_vocab:
            return similar_word, similarity

    return "No similar word found in your vocabulary.", None

In [4]:
def get_titles_from_similar_words(similar_word, df):
    """Collect the titles of all rows whose processed text contains ``similar_word``.

    The match is a case-insensitive substring test on the 'ProcessedText' column.
    Rows are paired positionally, so the DataFrame may carry any index (for
    example after filtering); rows whose text is not a string are skipped.

    Args:
        similar_word: Word to search for.
        df: DataFrame with 'ProcessedText' and 'Title' columns.

    Returns:
        List of 'Title' values of the matching rows, in row order.
    """
    print(f"Searching for: '{similar_word}'")
    similar_word_lower = similar_word.lower()

    titles = [
        title
        for text, title in zip(df['ProcessedText'], df['Title'])
        if isinstance(text, str) and similar_word_lower in text.lower()
    ]

    if not titles:
        print(f"No titles found for word: {similar_word}")

    return titles

In [67]:
word_to_check = input("Enter a word to find its most similar word: ")
result = find_most_similar_filtered(word_to_check, model, unique_words_in_your_data)

# The helper may return a bare message string instead of a (word, score) pair;
# normalise both shapes so the unpacking below can never fail.
if isinstance(result, str):
    similar_word, similarity_score = result, None
else:
    similar_word, similarity_score = result

# Every message the helper can produce when no usable neighbour exists.
failure_messages = [
    "No similar word found in your vocabulary.",
    "The word is not in the model's vocabulary.",
    f"'{word_to_check}' is not in the vocabulary.",
]

if similarity_score is None or similar_word in failure_messages:
    print(similar_word)
else:
    print(f"Most similar word: {similar_word}, similarity score: {similarity_score}\n")
    titles = get_titles_from_similar_words(similar_word, df)
    print(f"\nTitles associated with the '{similar_word}':")
    # str() guards against non-string titles (e.g. missing values).
    print('\n'.join(str(title) for title in titles))

Enter a word to find its most similar word:  finanzmodelle


Most similar word: anwendungsmöglichkeiten, similarity score: 0.9947077631950378

Searching for: 'anwendungsmöglichkeiten'

Titles associated with the 'anwendungsmöglichkeiten':
Modulkatalog Applied Computer Science Bachelor 
Modulkatalog Ingenieurwissenschaften Bachelor
Modulkatalog Informatik Master
Modulkatalog Energieeffizienz Techischer System Master
Modulkatalog Digitale Medien Bachelor
Modulkatalog Informatik Bachelor
Modulkatalog Medieninformatik Master


In [76]:
word_to_check = input("Enter a word to find its most similar word: ")
result = find_most_similar_filtered(word_to_check, model, unique_words_in_your_data)

# The helper may return a bare message string instead of a (word, score) pair;
# normalise both shapes so the unpacking below can never fail.
if isinstance(result, str):
    similar_word, similarity_score = result, None
else:
    similar_word, similarity_score = result

# Every message the helper can produce when no usable neighbour exists.
failure_messages = [
    "No similar word found in your vocabulary.",
    "The word is not in the model's vocabulary.",
    f"'{word_to_check}' is not in the vocabulary.",
]

if similarity_score is None or similar_word in failure_messages:
    print(similar_word)
else:
    print(f"Most similar word: {similar_word}, similarity score: {similarity_score}\n")
    titles = get_titles_from_similar_words(similar_word, df)
    print(f"\nTitles associated with the '{similar_word}':")
    # str() guards against non-string titles (e.g. missing values).
    print('\n'.join(str(title) for title in titles))

Enter a word to find its most similar word:  bayesian


Most similar word: bayessche, similarity score: 0.6543185114860535

Searching for: 'bayessche'

Titles associated with the 'bayessche':
Modulkatalog Medizininformatik Bachelor
Modulkatalog Informatik Master


In [107]:
word_to_check = input("Enter a word to find its most similar word: ")
result = find_most_similar_filtered(word_to_check, model, unique_words_in_your_data)

# The helper may return a bare message string instead of a (word, score) pair;
# normalise both shapes so the unpacking below can never fail.
if isinstance(result, str):
    similar_word, similarity_score = result, None
else:
    similar_word, similarity_score = result

# Every message the helper can produce when no usable neighbour exists.
failure_messages = [
    "No similar word found in your vocabulary.",
    "The word is not in the model's vocabulary.",
    f"'{word_to_check}' is not in the vocabulary.",
]

if similarity_score is None or similar_word in failure_messages:
    print(similar_word)
else:
    print(f"Most similar word: {similar_word}, similarity score: {similarity_score}\n")
    titles = get_titles_from_similar_words(similar_word, df)
    print(f"\nTitles associated with the '{similar_word}':")
    # str() guards against non-string titles (e.g. missing values).
    print('\n'.join(str(title) for title in titles))

Enter a word to find its most similar word:  luftraum


Most similar word: energiebedarf, similarity score: 0.7440962791442871

Searching for: 'energiebedarf'

Titles associated with the 'energiebedarf':
Modulkatalog Maschinenbau Bachelor
Modulkatalog Wirtschaftsingenieurwesen Bachelor
Modulkatalog Energieeffizienz Techischer System Master
Modulkatalog Medieninformatik Master


<h2>4. Semantic Clustering of Words Using K-Means</h2>

In [5]:
# Each entry of the 'Words' column is iterated as a sequence of tokens.
words = df['Words']

# Lower-case every token into one flat list, then deduplicate into a vocabulary set.
flat_list_of_words = [
    token.lower()
    for tokens in words
    for token in tokens
]
unique_words_in_your_data = set(flat_list_of_words)

In [6]:
# Map tokens to integer ids and turn every document into a bag of words.
id2word = corpora.Dictionary(words)
corpus = [id2word.doc2bow(text) for text in words]

# TF-IDF weights for every (document, word) pair of the corpus.
tfidf = TfidfModel(corpus, id2word=id2word)

# Words whose TF-IDF weight in a document is below this threshold are removed from it.
low_value = 0.03

# Bookkeeping of what was removed (as readable words). A dedicated name is used so
# the tokenised documents in `words` stay intact and this cell can be re-run.
removed_low_value_words = []
words_missing_in_tfidf = []

for i, bow in enumerate(corpus):
    # Weigh the document once and index the result by word id.
    scores = dict(tfidf[bow])

    # Ids whose weight is below the threshold.
    low_value_ids = [word_id for word_id, value in scores.items() if value < low_value]

    # Ids present in the document but absent from the TF-IDF output
    # (TfidfModel omits entries whose weight is effectively zero).
    missing_ids = [word_id for word_id, _ in bow if word_id not in scores]

    removed_low_value_words.extend(id2word[word_id] for word_id in low_value_ids)
    words_missing_in_tfidf.extend(id2word[word_id] for word_id in missing_ids)

    # Both groups must be ids: the bag of words below is filtered by id, so
    # putting word strings into this set would silently keep those words.
    drops = set(low_value_ids) | set(missing_ids)

    # Replace the document with its filtered bag of words.
    corpus[i] = [entry for entry in bow if entry[0] not in drops]

In [7]:
# Ids of every word that is still present in some filtered document (with repeats).
word_ids = [entry[0] for bow in corpus for entry in bow]

# Translate the ids back into words and deduplicate.
unique_words_after_tfidf = {id2word[token_id] for token_id in word_ids}

In [None]:
# Words the model can embed. Sorting fixes the row order (set iteration order can
# differ between kernel sessions), which the seeded clustering below depends on.
words = sorted(word for word in unique_words_after_tfidf if word in model.wv)

# One embedding per word, built from `words` itself so rows and words stay aligned.
word_vectors = np.array([model.wv[word] for word in words])

# Apply K-means clustering
num_clusters = 10  # Adjust the number of clusters as needed
# A fixed seed and an explicit n_init make the clustering repeatable across runs.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(word_vectors)

# Cluster label of each row of `word_vectors`
labels = kmeans.labels_

# Group words by cluster. Every cluster id gets an entry up front so the
# report below cannot fail on a cluster that received no words.
word_clusters = {cluster: [] for cluster in range(num_clusters)}
for word, cluster in zip(words, labels):
    word_clusters[int(cluster)].append(word)

# Display the words in each cluster
for cluster in range(num_clusters):
    print(f"Cluster {cluster}: {word_clusters[cluster]}")

In [None]:
# Write one line per cluster: "Cluster <id>: word, word, ..."
file_path = 'word_clusters.txt'
# UTF-8 is set explicitly so non-ASCII words do not depend on the platform default encoding.
with open(file_path, 'w', encoding='utf-8') as f:
    # The loop variable must not be called `words`: that global is the word list
    # aligned with `labels` / `word_vectors` and is read again by the plotting cells.
    for cluster, cluster_words in word_clusters.items():
        f.write(f'Cluster {cluster}: {", ".join(cluster_words)}\n')

<h3>PCA Plots</h3>

In [None]:
# Reduce word vectors to 2D using PCA for visualization
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(word_vectors)

plt.figure(figsize=(15, 10))
# pyplot.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no longer provide.
colors = plt.get_cmap('viridis', num_clusters)

# One scatter call for all points; each point is coloured by its cluster label.
plt.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1], c=colors(labels))

# Write every word next to its point.
for i, word in enumerate(words):
    plt.annotate(word, xy=(reduced_vectors[i, 0], reduced_vectors[i, 1]), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')

# Titles and axis labels
plt.title('Word Clusters Visualized in 2D using PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
# Save before show(): once the figure has been shown, savefig would write an empty canvas.
plt.savefig('word_clusters.png', dpi=300)  # Save as PNG with high dpi
plt.show()

In [None]:
plt.figure(figsize=(15, 10))
# pyplot.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no longer provide.
colors = plt.get_cmap('tab10', num_clusters)  # Using tab10 colormap for better contrast

# Plot every word in the 2D space with semi-transparent, larger dots (no word
# annotations here, to reduce clutter).
plt.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1],
            c=colors(labels), alpha=0.7, s=50)

# The cluster centres live in the original embedding space, so they are projected
# with the fitted PCA before plotting; their first two raw coordinates would not
# share the axes of the points.
centroids = pca.transform(kmeans.cluster_centers_)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=100, color='black', label='Centroids')

plt.title('Word Clusters Visualized in 2D using PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
# plt.legend()  # Uncomment if the legend is useful, otherwise leave it out to reduce clutter

# Save before show(): once the figure has been shown, savefig would write an empty canvas.
plt.savefig('word_clusters_without_word_and_legend.png', dpi=300)  # Save as PNG with high dpi
plt.show()

In [None]:
plt.figure(figsize=(15, 10))

# One colour per cluster. pyplot.get_cmap replaces plt.cm.get_cmap, which newer
# matplotlib releases no longer provide.
colors = plt.get_cmap('tab10', num_clusters)

# Plot each cluster separately so that it gets its own legend entry
for cluster_num in range(num_clusters):
    # Points whose label equals the current cluster
    cluster_points = reduced_vectors[labels == cluster_num]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                color=colors(cluster_num),
                alpha=0.7, s=50,
                label=f'Cluster {cluster_num}')

plt.title('Word Clusters Visualized in 2D using PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.legend()  # Display the legend
# Saved before show() so the written file contains the drawn figure.
plt.savefig('word_clusters_without_word_with_legend.png')
plt.show()

In [None]:
import plotly.express as px

# 2D projection of the word vectors
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(word_vectors)

# Data for the plot. A dedicated name is used so the dataset held in `df`
# (read by the title-lookup cells) is not overwritten.
plot_df = pd.DataFrame(reduced_vectors, columns=['PCA1', 'PCA2'])
plot_df['word'] = words
# Labels are stored as strings so plotly colours them as discrete categories
# with one legend entry each, instead of a continuous colour bar.
plot_df['cluster'] = [str(label) for label in labels]

# Plot using Plotly; category_orders lists the legend entries in numeric order.
fig = px.scatter(
    plot_df,
    x='PCA1',
    y='PCA2',
    color='cluster',
    text='word',
    title='Word Clusters Visualized in 2D using PCA',
    category_orders={'cluster': [str(cluster) for cluster in range(num_clusters)]},
)
fig.update_traces(textposition='top center')
fig.update_layout(showlegend=True)
fig.update_layout(legend_title_text='Cluster')

# Save the interactive plot as an HTML file
fig.write_html('word_clusters.html')