In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans

# Assuming you have a CSV file named 'data.csv' with a column named 'text' containing the text data

# Step 1: Load the CSV data
from google.colab import files
uploaded = files.upload()
import io
data = pd.read_csv(io.BytesIO(uploaded['lemmatized_reddit_og.csv']))
data.info()


Saving lemmatized_reddit_og.csv to lemmatized_reddit_og.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61226 entries, 0 to 61225
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  61226 non-null  int64 
 1   text        61226 non-null  object
 2   subreddit   61226 non-null  object
 3   bucket      61226 non-null  object
 4   annotator   61226 non-null  object
 5   annotation  61226 non-null  object
 6   confidence  61184 non-null  object
 7   lemmatized  61223 non-null  object
dtypes: int64(1), object(7)
memory usage: 3.7+ MB


In [5]:
# Step 2: Preprocessing
documents = data['text'].tolist()
labels = data['annotation'].tolist()

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

In [6]:

# Step 3: LDA Model Training
num_topics = 10  # Define the number of topics
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(X)

In [8]:
# Step 4: Get the topics and their top words
feature_names = vectorizer.get_feature_names_out()
num_top_words = 10  # Define the number of top words to retrieve for each topic
topics = []

for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
    topics.append((topic_idx, top_words))


In [9]:
# Step 5: Perform text classification and cluster molarity
kmeans = KMeans(n_clusters=2)  # Assuming you want to cluster into two molarity categories
kmeans.fit(X)
molarity_labels = kmeans.labels_




In [10]:

# Step 6: Map molarity labels to "Moral" and "Non-Moral"
label_mapping = {0: "Non-Moral", 1: "Moral"}
molarity_labels_mapped = [label_mapping[label] for label in molarity_labels]


In [11]:

# Step 7: Output the results
for topic in topics:
    print(f"Topic {topic[0]}: {' '.join(topic[1])}")

print("Clustered Molarity:")
for i, molarity in enumerate(molarity_labels_mapped):
    print(f"Document {i + 1}: {molarity}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Document 56227: Moral
Document 56228: Non-Moral
Document 56229: Moral
Document 56230: Moral
Document 56231: Moral
Document 56232: Moral
Document 56233: Moral
Document 56234: Moral
Document 56235: Moral
Document 56236: Moral
Document 56237: Non-Moral
Document 56238: Non-Moral
Document 56239: Moral
Document 56240: Moral
Document 56241: Moral
Document 56242: Moral
Document 56243: Moral
Document 56244: Non-Moral
Document 56245: Non-Moral
Document 56246: Moral
Document 56247: Moral
Document 56248: Non-Moral
Document 56249: Moral
Document 56250: Moral
Document 56251: Non-Moral
Document 56252: Moral
Document 56253: Moral
Document 56254: Moral
Document 56255: Moral
Document 56256: Moral
Document 56257: Moral
Document 56258: Moral
Document 56259: Moral
Document 56260: Moral
Document 56261: Moral
Document 56262: Non-Moral
Document 56263: Moral
Document 56264: Non-Moral
Document 56265: Moral
Document 56266: Moral
Document 56267: Mor