In [1]:

import pandas as pd

# Try reading the CSV file using different encodings
encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
file_path = "/content/textmining cleaned1.csv"

for encoding in encodings_to_try:
    try:
        data = pd.read_csv(file_path, encoding=encoding)
        print(f"File read successfully with encoding: {encoding}")
        break  # Stop trying other encodings if successful
    except UnicodeDecodeError:
        print(f"Could not read with encoding: {encoding}")

# Continue with your data processing once the file is successfully read
docs = data['Generative AI statement text'].astype(str)
# Further processing...


File read successfully with encoding: utf-8


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Preprocessing steps (replace text with your actual text column)
text_data = data['Generative AI statement text'].astype(str)
text_data = text_data.apply(lambda x: ' '.join(x.lower() for x in x.split()))  # Lowercasing
text_data = text_data.str.replace('[^\w\s]', '')  # Remove punctuation
text_data = text_data.str.replace('\d+', '')  # Remove numbers

# Remove specific stopwords like "ai"
stop_words = ['ai']  # Customize stopwords list as needed
text_data = text_data.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Vectorization
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = vectorizer.fit_transform(text_data)

# LDA with modified parameters
no_topics = 15  # Change the number of topics here
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=20, learning_method='batch', random_state=0)
lda_z = lda.fit_transform(tf)

# Associate universities with topics and extract keywords
data['Dominant_Topic'] = lda_z.argmax(axis=1)
universities_by_topic = {i: [] for i in range(no_topics)}
keywords_by_topic = {i: [] for i in range(no_topics)}

for idx, row in data.iterrows():
    topic = row['Dominant_Topic']
    university = row['University name']
    universities_by_topic[topic].append(university)

# Get the top keywords for each topic
feature_names = vectorizer.get_feature_names_out()
topic_keywords = lda.components_
n_top_words = 10  # Number of top keywords to display

for topic_idx, topic in enumerate(topic_keywords):
    top_keywords_idx = topic.argsort()[-n_top_words:][::-1]
    keywords = [feature_names[i] for i in top_keywords_idx]
    keywords_by_topic[topic_idx] = keywords

# Display universities and keywords in each topic
for topic, universities in universities_by_topic.items():
    print(f"Topic {topic}:")
    print("Universities: ", ', '.join(universities))
    print("Keywords: ", ', '.join(keywords_by_topic[topic]))
    print("\n")

  text_data = text_data.str.replace('[^\w\s]', '')  # Remove punctuation
  text_data = text_data.str.replace('\d+', '')  # Remove numbers


Topic 0:
Universities:  Rice University, Stony Brook University (SUNY)
Keywords:  students, chatgpt, writing, faculty, student, models, questions, learning, said, community


Topic 1:
Universities:  Emory University, University of Wisconsin, Madison
Keywords:  data, tools, research, policies, information, risk, chatgpt, researchers, institutional, used


Topic 2:
Universities:  Princeton University, Duke University, University of California, Los Angeles, University of Michigan, Ann Arbor, University of California, Davis, University of Illinois Urbana-Champaign, The Ohio State University, Michigan State University, University of California, Merced, George Washington University, University of Massachusetts, Amherst, University of Pittsburgh, Villanova University, Binghamton University (SUNY), Colorado School of Mines, Pepperdine University, University of Delaware, University of Iowa, Illinois Institute of Technology
Keywords:  use, tools, students, generative, course, work, academic, wri

In [None]:
!pip install bertopic
from bertopic import BERTopic

topic_model = BERTopic(embedding_model="all-MiniLM-L12-v2", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transfor

2023-12-11 01:53:35,839 - BERTopic - Embedding - Transforming documents to embeddings.


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2023-12-11 01:53:51,105 - BERTopic - Embedding - Completed ✓
2023-12-11 01:53:51,107 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-11 01:53:58,964 - BERTopic - Dimensionality - Completed ✓
2023-12-11 01:53:58,966 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-11 01:53:58,979 - BERTopic - Cluster - Completed ✓
2023-12-11 01:53:58,988 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-11 01:53:59,107 - BERTopic - Representation - Completed ✓


In [None]:
#freq = topic_model.get_topic_info();
num_topics = 15
print(num_topics)
#freq.head(num_topics)

15


In [None]:
for i in range(num_topics):
  print("\n== Representative documents in cluster #", i)
  print(topic_model.get_representative_docs(i))


== Representative documents in cluster # 0
['Respond to Generative AI\n"As machines get better at being machines, the primary purpose of higher education must be helping humans get better at being human." \n- Randy Bass, Vice Provost for Education, Georgetown University\n\nTeaching and learning do not happen in a vacuum; educators and learners must contend with emergent technologies (e.g., Artificial Intelligence), social movements, and the evolving demands of industry. The sudden and widespread interest in Generative Artificial Intelligence like ChatGPT in late 2022 belies the fact that we have been teaching with AI for quite some time. \n\nOn this resource page, we provide some timely strategies and basic definitions, and we highlight key modes of thought and areas of skill development relevant to teaching with any technology. We also underscore some of the foundational principles of teaching and learning that bear repeating.\n\nFinally, we invite you to join the conversation. If yo

In [None]:
topic_model.visualize_distribution(probs[200], min_probability=0.015)

IndexError: ignored

In [None]:
topic_model.visualize_barchart(top_n_topics=10)

In [3]:
!pip install bertopic
from bertopic import BERTopic
from bertopic import BERTopic
import pandas as pd

# Read the dataset
data = pd.read_csv("/content/textmining cleaned1.csv")  # Replace 'your_dataset.csv' with the path to your dataset

# Extract the 'University name' and 'Generative AI statement text' columns
texts = data['Generative AI statement text'].tolist()

# Initialize BERTopic for 15 topics
model = BERTopic(nr_topics=15)

# Fit BERTopic to your text data
topics, _ = model.fit_transform(texts)

# Map university names to their corresponding topics
universities = data['University name'].tolist()
university_topics = {}
for university, topic in zip(universities, topics):
    if topic not in university_topics:
        university_topics[topic] = [university]
    else:
        university_topics[topic].append(university)

# Display universities in each topic
for topic, universities in university_topics.items():
    print(f"Topic {topic}:")
    print("Universities:", ', '.join(universities))
    print("\n")


Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Topic 1:
Universities: Princeton University, Stanford University, Duke University, Johns Hopkins University, Columbia University, University of California, Los Angeles, Rice University, Dartmouth College, University of Notre Dame, Georgetown University, Carnegie Mellon University, University of California, Davis, University of Southern California, Georgia Institute of Technology, University of California, Irvine, University of California, Santa Barbara, Boston College, University of Washington, Boston University, The Ohio State University, Purdue University, Texas A&M University, University of Georgia, Virginia Tech, Wake Forest University, Northeastern University, William & Mary, Stony Brook University (SUNY), University of Connecticut, Brandeis University, Michigan State University, The Pennsylvania State University, University Park, Santa Clara University, University of California, Merced, George Washington University, University of Massachusetts, Amherst, Villanova University, Bing

In [7]:
from bertopic import BERTopic
import pandas as pd

# Read the dataset
data = pd.read_csv("/content/textmining cleaned1.csv")  # Replace 'your_dataset.csv' with the path to your dataset

# Extract the 'University name' and 'Generative AI statement text' columns
texts = data['Generative AI statement text'].astype(str).tolist()
universities = data['University name'].tolist()

# Initialize BERTopic for generating 15 topics
model = BERTopic(nr_topics=15)

# Fit BERTopic to your text data
topics, _ = model.fit_transform(texts)

# Create a dictionary to store universities for each topic
universities_by_topic = {i: [] for i in range(15)}

# Associate universities with topics, ignoring outlier topic (-1)
for idx, topic in enumerate(topics):
    if topic != -1:
        universities_by_topic[topic].append(universities[idx])

# Display universities in each topic
for topic, universities in universities_by_topic.items():
    print(f"Topic {topic}:")
    print("Universities: ", ', '.join(universities))
    print("\n")


Topic 0:
Universities:  Harvard University, Yale University, University of Pennsylvania, California Institute of Technology, Brown University, Northwestern University, Cornell University, University of Chicago, University of North Carolina at Chapel Hill, Emory University, University of Virginia, Washington University in St. Louis, University of Florida, University of Texas at Austin, University of Illinois Urbana-Champaign, University of Wisconsin, Madison, Tufts University, University of Maryland, College Park, Indiana University, Bloomington, University of Illinois, Chicago


Topic 1:
Universities:  Princeton University, Stanford University, Duke University, Johns Hopkins University, Columbia University, Dartmouth College, University of Notre Dame, Carnegie Mellon University, University of Southern California, University of California, Santa Barbara, Boston University, Michigan State University, Binghamton University (SUNY), Fordham University, Temple University, Illinois Institute 

In [13]:
from bertopic import BERTopic
import pandas as pd

# Read the dataset
data = pd.read_csv("/content/textmining cleaned1.csv")  # Replace 'your_dataset.csv' with the path to your dataset

# Extract the 'University name' and 'Generative AI statement text' columns
texts = data['Generative AI statement text'].astype(str).tolist()
universities = data['University name'].tolist()

# Initialize BERTopic and chunk large texts
model = BERTopic()

chunk_size = 512  # Define the chunk size
chunked_texts = []
chunked_universities = []

for text, university in zip(texts, universities):
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    chunked_texts.extend(chunks)
    chunked_universities.extend([university]*len(chunks))

# Fit BERTopic to the chunked texts
topics, _ = model.fit_transform(chunked_texts)

# Find the optimal number of topics and assign them to a variable
optimal_topics, _ = model.find_topics(chunked_texts)
num_optimal_topics = optimal_topics[0] if optimal_topics else 0  # Extract the optimal number of topics

# Create a dictionary to store universities for each optimal topic
universities_by_topic = {i: [] for i in range(num_optimal_topics)}

# Associate universities with topics while excluding -1 labeled topics
for idx, topic in enumerate(topics):
    if topic < num_optimal_topics and topic >= 0:  # Exclude -1 labeled topics
        universities_by_topic[topic].append(chunked_universities[idx])

# Display universities in each optimal topic
for topic, universities in universities_by_topic.items():
    print(f"Optimal Topic {topic}:")
    print("Universities: ", ', '.join(universities))
    print("\n")


In [19]:
from bertopic import BERTopic
import pandas as pd

# Read the dataset
data = pd.read_csv("/content/textmining cleaned1.csv")  # Replace 'your_dataset.csv' with the path to your dataset

# Extract the 'University name' and 'Generative AI statement text' columns
texts = data['Generative AI statement text'].astype(str).tolist()
universities = data['University name'].tolist()

# Initialize BERTopic and chunk large texts
model = BERTopic()

chunk_size = 512  # Define the chunk size
chunked_texts = []
chunked_universities = []

for text, university in zip(texts, universities):
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    chunked_texts.extend(chunks)
    chunked_universities.extend([university]*len(chunks))

# Fit BERTopic to the chunked texts
topics, _ = model.fit_transform(chunked_texts)

# Find the optimal number of topics and assign them to a variable
optimal_topics, _ = model.find_topics(chunked_texts)
num_optimal_topics = optimal_topics[0] if optimal_topics else 0  # Extract the optimal number of topics

# Create a dictionary to store universities for each optimal topic
universities_by_topic = {i: [] for i in range(num_optimal_topics)}

# Associate universities with topics while excluding -1 labeled topics
for idx, topic in enumerate(topics):
    if topic < num_optimal_topics and topic >= 0:  # Exclude -1 labeled topics
        universities_by_topic[topic].append(chunked_universities[idx])

# Print universities in each optimal topic
for topic, universities in universities_by_topic.items():
    print(f"Optimal Topic {topic}:")
    print("Universities: ", ', '.join(universities))
    print("\n")
