In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the course table; the 'combined' column supplies the text to embed and
# the 'Year' column supplies the labels for the similarity matrix.
c_df = pd.read_csv('data/courses_combined.csv')

# Sentence-embedding model used to turn each combined text into a vector.
model = SentenceTransformer('all-mpnet-base-v2')

# One embedding per row of c_df, in row order.
combined_texts = c_df['combined'].tolist()
year_embeddings = model.encode(combined_texts)

# Pairwise cosine similarity between all row embeddings (square matrix).
similarity_matrix = cosine_similarity(year_embeddings)

# Label both axes with the 'Year' column so the heatmap ticks are readable.
year_labels = c_df['Year']
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=year_labels,
    columns=year_labels,
)

# Draw the annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(
    similarity_df,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8},
    ax=ax,
)
ax.set_title('Yearly Semantic Similarity Heatmap')
ax.set_xlabel('Year')
ax.set_ylabel('Year')
# Rotate the x tick labels so long year labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)
fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns


# Regress pairwise similarity on year and extrapolate the trend to 2030.
# Depends on `c_df` and `similarity_df` created by the previous cell.

# Normalise 'Year' to integers. Range labels such as '2019-2020' are reduced to
# their first component (2019). Testing for "not numeric" rather than
# "dtype == object" also covers pandas string dtypes, and the vectorised .str
# path tolerates object columns that mix str and int values.
if not pd.api.types.is_numeric_dtype(c_df['Year']):
    c_df['Year'] = (
        c_df['Year']
        .astype(str)
        .str.split('-')
        .str[0]
        .str.strip()
        .astype('int64')
    )

# Prepare data for regression
years = np.array(c_df['Year'])
n_years = len(years)
assert similarity_df.shape == (n_years, n_years), (
    "similarity_df must be square and aligned with the rows of c_df"
)

# Row-major flattening: entry (i, j) lands at position i * n_years + j, so
# np.repeat(years, n_years) yields the row year for every flattened entry.
similarity_scores = similarity_df.values.flatten()

# Exclude the diagonal: every text has similarity 1.0 with itself, so those
# entries carry no information about change over time and would bias the fit.
off_diagonal = ~np.eye(n_years, dtype=bool).flatten()

# Create a DataFrame for regression (NaN rows removed, if any)
regression_data = pd.DataFrame({
    'Year': np.repeat(years, n_years)[off_diagonal],
    'Similarity': similarity_scores[off_diagonal],
}).dropna()

# Features and targets
X = regression_data[['Year']]
y = regression_data['Similarity']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the regression model.
# NOTE: this rebinds `model`, which the previous cell used for the SentenceTransformer.
model = LinearRegression()
model.fit(X_train, y_train)

# Report goodness of fit on the held-out split so the forecast can be judged.
test_r2 = model.score(X_test, y_test)
print(f'Held-out R^2: {test_r2:.3f}')

# Create a range of years for prediction, up to 2030 (or the last observed
# year, should the data already extend beyond 2030).
min_year = regression_data['Year'].min()
last_data_year = int(regression_data['Year'].max())
max_year = max(2030, last_data_year)
future_years = pd.DataFrame({'Year': np.arange(min_year, max_year + 1)})

# Predict for all years in the range
predicted_similarities = model.predict(future_years)

# Create a DataFrame to hold the predictions
predictions_df = pd.DataFrame({
    'Year': future_years['Year'],
    'Predicted Similarity': predicted_similarities
})

print(predictions_df)

plt.figure(figsize=(12, 6))
sns.scatterplot(x=X['Year'], y=y, color='blue', alpha=0.5, label='Historical Data')
plt.plot(predictions_df['Year'], predictions_df['Predicted Similarity'], color='red', label='Regression Line')
plt.title('Regression Analysis of Similarity Over Years')
plt.xlabel('Year')
plt.ylabel('Similarity Score')
# Mark where observations end and extrapolation begins, derived from the data
# rather than a hard-coded year.
plt.axvline(x=last_data_year, color='gray', linestyle='--', label='Last Year of Data')
plt.legend()
plt.show()
