In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from docx import Document
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from keybert import KeyBERT
import gensim
from gensim import corpora

# --- Download NLTK data (uncomment on first run) ---
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

# --- Global Variables & Initializations ---
# Word document holding the syllabus policies to analyse.
doc_path = r"Syllabi Policies for AI Generative Tools.docx"
# English stop words as a set; the cleaning helpers test tokens against it.
stop_words = set(stopwords.words('english'))
# One lemmatizer instance shared by both cleaning helpers.
lemmatizer = WordNetLemmatizer()

# --- Helper Functions ---
def extract_policies_from_docx(doc_path):
    """Collect every non-empty text block from a Word document.

    Body paragraphs are gathered first, followed by table cells in
    table -> row -> cell order. Each block is whitespace-stripped.

    Args:
        doc_path: Path handed to ``Document``.

    Returns:
        A list of stripped, non-empty paragraph and cell texts.
    """
    document = Document(doc_path)

    # Body paragraphs, skipping any that are blank once stripped.
    blocks = [p.text.strip() for p in document.paragraphs if p.text.strip()]

    # Table cells are appended after all paragraphs.
    # NOTE(review): python-docx may yield the same cell more than once for
    # merged cells, which would duplicate text -- confirm against the input.
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            blocks.extend(
                c.text.strip() for c in tbl_row.cells if c.text.strip()
            )
    return blocks

# Matches http(s):// links and bare www. links up to the next whitespace.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def _normalize_text(text):
    """Lowercase *text* and strip URLs, punctuation and digits."""
    text = text.lower()
    # Drop links first: once punctuation is removed a URL collapses into a
    # single junk token (e.g. "httpswwwexampleedu...") that pollutes the
    # keyword lists.
    text = _URL_PATTERN.sub(' ', text)
    # Replace punctuation with a space rather than nothing, so that
    # "ai-generated" or "and/or" split into words instead of fusing.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    return text


def clean_text_for_tfidf_keybert(text):
    """Return *text* as a cleaned, lemmatized, space-joined string.

    The text is normalized (lowercased; URLs, punctuation and digits
    removed), tokenized with NLTK, filtered against ``stop_words`` and
    lemmatized.

    Args:
        text: Raw text of one document block.

    Returns:
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    tokens = nltk.word_tokenize(_normalize_text(text))
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )


def clean_and_tokenize_for_lda(text):
    """Return *text* as a list of cleaned, lemmatized tokens.

    Same normalization as ``clean_text_for_tfidf_keybert``, but the result
    stays a token list and tokens of two characters or fewer are dropped.

    Args:
        text: Raw text of one document block.

    Returns:
        A list of lemmas (possibly empty).
    """
    tokens = nltk.word_tokenize(_normalize_text(text))
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]

# TF-IDF Analysis
print("--- Running Method 1: TF-IDF Analysis ---")
# Every extracted paragraph / table cell is treated as one document.
all_text_raw = extract_policies_from_docx(doc_path)
cleaned_blocks_tfidf = list(map(clean_text_for_tfidf_keybert, all_text_raw))

# Unigrams and bigrams; min_df / max_df drop terms that are too rare or
# too common across the documents.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.9)
tfidf_matrix = vectorizer.fit_transform(cleaned_blocks_tfidf)

# One score per term: its TF-IDF weight summed over all documents.
terms = vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1
tfidf_df = (
    pd.DataFrame({'term': terms, 'score': tfidf_scores})
    .sort_values(by='score', ascending=False)
    .reset_index(drop=True)
)

print("Top 20 most important keywords from TF-IDF:")
print(tfidf_df.head(20))

# KeyBERT Analysis
print("\n--- Running Method 2: KeyBERT Analysis ---")
# All cleaned blocks are joined and scored as one single document.
# NOTE(review): embedding models usually truncate long inputs, so the
# document embedding may only reflect the start of this text -- confirm.
policy_text_combined = " ".join(cleaned_blocks_tfidf)
kw_model = KeyBERT()

keybert_results = kw_model.extract_keywords(
    policy_text_combined,
    keyphrase_ngram_range=(1, 2),
    stop_words='english',
    use_mmr=True,
    diversity=0.7,
    top_n=50,  # ask for more candidates than the 20 printed below
)
keybert_df = pd.DataFrame(keybert_results, columns=['term', 'score'])

print("Top 20 most important keywords from KeyBERT:")
print(keybert_df.head(20))

# LDA Topic Modeling
print("\n--- Running Method 3: LDA Topic Modeling ---")
# LDA requires a list of token lists, so we use a different cleaner
tokenized_docs = [clean_and_tokenize_for_lda(block) for block in all_text_raw]

# Create Dictionary and Corpus (bag-of-words per document)
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Build LDA model - num_topics is a parameter you can tune. Let's start with 6.
num_topics = 6
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, random_state=42)

# Number of top terms reported per topic.
words_per_topic = 10

# Pass num_topics explicitly: print_topics() otherwise applies its own
# default limit, which would silently drop topics if num_topics is raised
# above it.
topics = lda_model.print_topics(num_topics=num_topics, num_words=words_per_topic)

# Take the topic id gensim reports (not the loop position) and read the terms
# from the structured show_topic() API instead of regex-parsing the display
# string. The comprehension also avoids overwriting the module-level `terms`
# array produced by the TF-IDF section.
lda_results = [
    {'topic_id': topic_id, 'term': term}
    for topic_id, _formatted in topics
    for term, _weight in lda_model.show_topic(topic_id, topn=words_per_topic)
]

# Explicit columns keep the frame's schema stable even with no rows.
lda_df = pd.DataFrame(lda_results, columns=['topic_id', 'term'])

print(f"Discovered {num_topics} topics. Keywords for each topic:")
print(lda_df)

# Export each method's results to its own CSV file
print("\n--- Saving Results to Separate CSV Files ---")

tfidf_csv_path = "tfidf_keywords.csv"
keybert_csv_path = "keybert_keywords.csv"
lda_csv_path = "lda_topic_keywords.csv"

# TF-IDF and KeyBERT are capped at their top 100 rows; LDA is written in full.
tfidf_df.head(100).to_csv(tfidf_csv_path, index=False)
keybert_df.head(100).to_csv(keybert_csv_path, index=False)
lda_df.to_csv(lda_csv_path, index=False)

print("Successfully saved results to CSV:")
for saved_path in (tfidf_csv_path, keybert_csv_path, lda_csv_path):
    print(f"- {saved_path}")

# The former closing message referenced `output_excel_path`, which is never
# defined (NameError), and announced an Excel export that does not happen;
# the per-file list above is the complete report.


--- Running Method 1: TF-IDF Analysis ---
Top 20 most important keywords from TF-IDF:
             term      score
0              ai  22.733130
1             use  13.454727
2            tool  12.402703
3            work   8.554997
4         ai tool   8.382823
5         writing   7.994337
6      generative   7.634102
7         student   7.321167
8      assignment   7.315885
9   generative ai   6.840661
10         course   6.819969
11         use ai   6.639366
12       academic   6.478842
13        chatgpt   6.205947
14         policy   6.063316
15           used   5.903841
16            may   5.855267
17           must   5.073493
18          using   4.870352
19          class   4.677365

--- Running Method 2: KeyBERT Analysis ---
Top 20 most important keywords from KeyBERT:
                                                 term   score
0                                          student ai  0.5461
1                                   essay constitutes  0.3613
2   httpswwwnewcastleeduaucurr