# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from wordcloud import WordCloud
from sklearn.decomposition import LatentDirichletAllocation as LDA
import warnings

# Silence DeprecationWarning messages raised at runtime by the code below
warnings.simplefilter("ignore", DeprecationWarning)

# Read and process dataset: drop columns that the steps below do not use
papers = pd.read_csv("datasets/papers.csv")
papers.drop(['id', 'event_type', 'pdf_name'], axis=1, inplace=True)

# Bar plot for publications per year
counts = papers.groupby('year').size()
counts.plot(kind='bar', title="ML Publications (1987–2017)")
plt.xlabel("Year")
plt.show()

# Process titles: strip basic punctuation (, . ! ?) and lowercase.
# The pattern is a raw string so that `\.` reaches the regex engine intact
# instead of being parsed as an invalid string escape sequence.
papers['title_processed'] = (
    papers['title']
    .str.replace(r'[,\.!?]', '', regex=True)
    .str.lower()
)

# Word Cloud built from all processed titles joined into a single string
wc = WordCloud()
wc.generate(' '.join(papers['title_processed']))
wc.to_image().show()

# Common Words
def plot_top_words(data, vectorizer, n_words=10):
    """Show a bar chart of the most frequent vocabulary words.

    Args:
        data: Document-term count matrix; it is summed over axis 0 to get
            one total count per feature name.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        n_words: Maximum number of words to plot (default 10).
    """
    words = vectorizer.get_feature_names_out()
    # Summing over axis 0 collapses the rows, leaving one total per word.
    total_counts = np.array(data.sum(axis=0)).flatten()
    # sorted() is stable, so equally frequent words keep vocabulary order.
    ranked = sorted(zip(words, total_counts), key=lambda pair: -pair[1])
    top_words = ranked[:max(n_words, 0)]

    # zip(*[]) yields nothing, so unpacking it straight into plt.bar() would
    # raise a TypeError; fall back to an empty chart instead.
    labels, heights = zip(*top_words) if top_words else ((), ())

    plt.bar(labels, heights)
    plt.xticks(rotation=90)
    plt.title(f'{n_words} Most Common Words')
    plt.show()

# Bag-of-words counts over the processed titles, with English stop words
# removed, followed by a chart of the most frequent remaining words.
count_vec = CountVectorizer(stop_words='english')
count_data = count_vec.fit_transform(raw_documents=papers['title_processed'])
plot_top_words(data=count_data, vectorizer=count_vec)

# LDA Topics
def print_topics(lda_model, vectorizer, n_words=10):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        lda_model: Fitted model whose ``components_`` holds one row of
            per-word weights for each topic.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``;
            its feature order must match the columns of ``components_``.
        n_words: Number of words to print per topic (non-negative).
    """
    words = vectorizer.get_feature_names_out()
    for i, topic in enumerate(lda_model.components_):
        # Reverse the ascending argsort first, then take a prefix. A
        # ``[-n_words:]`` suffix slice would return *every* index when
        # n_words == 0, because ``[-0:]`` is the whole array.
        top_indices = topic.argsort()[::-1][:n_words]
        print(f"\nTopic {i}: " + " ".join(words[j] for j in top_indices))

# Fit a 10-topic LDA model on the bag-of-words counts. A fixed random_state
# makes the discovered topics reproducible from run to run.
lda = LDA(n_components=10, random_state=0)
lda.fit(count_data)

print("LDA Topics:")
print_topics(lda, count_vec)
