In [None]:
import os

import pandas as pd

# Load the course table; the relative path is resolved against the current working directory.
courses_csv = 'data/courses.csv'
df = pd.read_csv(courses_csv)

def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Values that ``pd.isna`` reports as missing (e.g. ``None`` or ``NaN``)
    count as zero words. Any other value is converted with ``str()`` before
    splitting, so a number such as ``2021`` counts as one word.
    """
    return 0 if pd.isna(text) else len(str(text).split())

# Count words cell by cell, then total the counts per year.
#
# * The 'Year' key is a grouping label, not text, so it is left out of the
#   counted columns; otherwise its "word count" is just the number of rows in
#   each year and would leak into any statistic computed from this table.
# * Series.map is applied column by column instead of DataFrame.applymap
#   (deprecated since pandas 2.1), and the per-group lambda is replaced by a
#   plain groupby-sum, which avoids groupby.apply operating on the grouping
#   column (deprecated since pandas 2.2).
text_columns = df.drop(columns='Year')
per_cell_counts = text_columns.apply(lambda column: column.map(word_count))

# Result: one row per year (index 'Year'), one column per counted CSV column.
word_counts = per_cell_counts.groupby(df['Year']).sum()

print(word_counts)

In [None]:
# Drop every column whose count equals exactly 1 in all rows; a column is kept
# as soon as at least one row holds a value other than 1.
is_constant_one = (word_counts == 1).all(axis=0)
filtered_word_counts = word_counts.loc[:, ~is_constant_one]

# Row-wise mean over the remaining columns: one average per row of word_counts
average_word_counts = filtered_word_counts.mean(axis=1)

print(average_word_counts)

In [None]:
# Unweighted mean of the entries in average_word_counts: each entry contributes equally.
overall_average_word_count = average_word_counts.mean()
summary_line = f"Overall Average Word Count: {overall_average_word_count}"
print(summary_line)

In [None]:
# seaborn visualization library
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# Line plot of the values in average_word_counts against its index, with the
# overall average drawn as a dashed horizontal reference line.
sns.set_style("whitegrid")
matplotlib.rcParams.update({'font.size': 14})

# Explicit figure/axes objects so that later calls (and the save below) act on
# this figure rather than on whatever pyplot considers "current".
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(average_word_counts.index, average_word_counts.values, marker='o')
ax.axhline(y=overall_average_word_count, color='r', linestyle='--', label='Overall Average Word Count')
ax.set_xlabel('Year')
ax.set_ylabel('Average Word Count')
ax.set_title('Average Word Count per Year')
ax.legend()

# Save the plot. The output directory is created first: savefig raises
# FileNotFoundError when the target directory is missing (e.g. on a fresh
# checkout).
output_path = 'img/average_word_count_per_year.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Flip the table so that each column of filtered_word_counts becomes a row;
# cosine_similarity compares rows pairwise.
transposed_word_counts = filtered_word_counts.transpose()

# Pairwise cosine similarity between the rows of the transposed table
cosine_sim_matrix = cosine_similarity(transposed_word_counts)

# Wrap the raw matrix in a DataFrame labelled on both axes with the same row labels
row_labels = transposed_word_counts.index
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=row_labels,
    columns=row_labels,
)

print(cosine_sim_df)