In [None]:
import ast

import matplotlib.pyplot as plt
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Load the course table; the '<course name>_keywords' columns are read from it below
df = pd.read_csv('data/courses_with_30keywords.csv')

# Function to extract keywords from a DataFrame column
def extract_keywords(column):
    """Collect keywords from a column of stringified keyword collections.

    Every non-null cell is parsed with ``ast.literal_eval``, so a cell is
    expected to look like ``"['unit testing', 'coverage']"``. Cells that are
    not valid Python literals, or that evaluate to something other than a
    list, tuple or set, are skipped.

    Args:
        column: A pandas Series (any object offering ``dropna().values``).

    Returns:
        A flat list holding the keywords of every usable cell, in row order.
    """
    keywords = []
    for item in column.dropna().values:
        try:
            parsed = ast.literal_eval(item)
        except (ValueError, SyntaxError):
            # Not a valid Python literal -- ignore this cell.
            continue
        # A cell such as "123" or "'word'" is a valid literal but not a
        # collection: extending with it would raise TypeError or add the
        # individual characters of the string as separate "keywords".
        if isinstance(parsed, (list, tuple, set)):
            keywords.extend(parsed)
    return keywords

# Keyword lists of the two courses being compared
course_a = extract_keywords(df['Software Testing_keywords'])
course_b = extract_keywords(df['Software Specification, Verification and Testing_keywords'])

# Course A keywords first, then course B; the label vector below relies on this order
all_keywords = [*course_a, *course_b]

# Print the merged list as a quick sanity check
print(all_keywords)

# SBERT encoder used to embed each keyword
model = SentenceTransformer('all-mpnet-base-v2')

# One embedding per keyword, in the same order as all_keywords
keyword_embeddings = model.encode(all_keywords)

# Project the embeddings onto two principal components so they can be drawn
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(keyword_embeddings)

# 0 marks a course_a keyword, 1 marks a course_b keyword
labels = [0 for _ in course_a] + [1 for _ in course_b]

plt.figure(figsize=(20, 10))

# Colour each point by the course its keyword came from
xs = reduced_embeddings[:, 0]
ys = reduced_embeddings[:, 1]
scatter = plt.scatter(xs, ys, c=labels, cmap='coolwarm', edgecolors='k', s=100)

# Write every keyword slightly above its own point
for keyword, x, y in zip(all_keywords, xs, ys):
    plt.annotate(
        keyword,
        (x, y),
        textcoords="offset points",
        xytext=(0, 5),
        ha='center',
    )

# Legend entries follow the label order: 0 -> course_a, 1 -> course_b
handles, _ = scatter.legend_elements()
course_names = [
    "Software Testing (course_a)",
    "Software Specification, Verification and Testing (course_b)",
]
plt.legend(handles, course_names, title="Course")

# Titles, axis labels and grid
plt.title("PCA Scatter Plot of Keyword Embeddings by Course")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import os
import ast  
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the course table; the '<course name>_keywords' columns are read from it below
df = pd.read_csv('data/courses_with_30keywords.csv')

# Function to extract keywords from a DataFrame column
def extract_keywords(column):
    """Collect keywords from a column of stringified keyword collections.

    Every non-null cell is parsed with ``ast.literal_eval``, so a cell is
    expected to look like ``"['unit testing', 'coverage']"``. Cells that are
    not valid Python literals, or that evaluate to something other than a
    list, tuple or set, are skipped.

    Args:
        column: A pandas Series (any object offering ``dropna().values``).

    Returns:
        A flat list holding the keywords of every usable cell, in row order.
    """
    keywords = []
    for item in column.dropna().values:
        try:
            # Safely evaluate the string representation of a list
            parsed = ast.literal_eval(item)
        except (ValueError, SyntaxError):
            # Not a valid Python literal -- ignore this cell.
            continue
        # A cell such as "123" or "'word'" is a valid literal but not a
        # collection: extending with it would raise TypeError or add the
        # individual characters of the string as separate "keywords".
        if isinstance(parsed, (list, tuple, set)):
            keywords.extend(parsed)
    return keywords

# Extract keywords for each course
course_a = extract_keywords(df['Software Testing_keywords'])
course_b = extract_keywords(df['Software Specification, Verification and Testing_keywords'])

# WordCloud.generate() takes a single text string, so join each keyword list with spaces
combined_keywords_a = ' '.join(course_a)
combined_keywords_b = ' '.join(course_b)

# Text for the combined word cloud: both courses' keywords together
combined_keywords = ' '.join(course_a + course_b)

# All three clouds share the same rendering options
wordcloud_options = dict(width=400, height=200, background_color='white', collocations=False)

# Generate the word cloud for Course A
wordcloud_a = WordCloud(**wordcloud_options).generate(combined_keywords_a)

# Generate the word cloud for Course B
wordcloud_b = WordCloud(**wordcloud_options).generate(combined_keywords_b)

# Generate the combined word cloud
combined_wordcloud = WordCloud(**wordcloud_options).generate(combined_keywords)

# Create a single figure for all word clouds
plt.figure(figsize=(12, 8))

# Top-left cell of a 2x2 grid: Course A
plt.subplot(2, 2, 1)
plt.imshow(wordcloud_a, interpolation='bilinear')
plt.axis('off')  # Hide axes
plt.title("Word Cloud of Keywords from Software Testing")

# Top-right cell of the 2x2 grid: Course B
plt.subplot(2, 2, 2)
plt.imshow(wordcloud_b, interpolation='bilinear')
plt.axis('off')  # Hide axes
plt.title("Word Cloud of Keywords from Software Specification, Verification and Testing")

# Second row of a 2x1 grid, i.e. the full-width bottom half: combined cloud
plt.subplot(2, 1, 2)
plt.imshow(combined_wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide axes
plt.title("Combined Word Cloud of Keywords from Both Courses")

# Adjust layout for better spacing
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
img_folder = 'img'
os.makedirs(img_folder, exist_ok=True)
wordcloud_filepath = os.path.join(img_folder, 'ssvt-testing-wordcloud.png')
plt.savefig(wordcloud_filepath, bbox_inches='tight', dpi=300)
plt.close()  # Release the figure once it has been written to disk

print(f"Word cloud saved to {wordcloud_filepath}")
