In [None]:
import pandas as pd
import requests
from io import StringIO

# URL of the sample file hosted on GitHub
url = 'https://raw.githubusercontent.com/dhh2024/disc/main/data/work/samples/cmw_comments_sample_1.tsv'

# Download the file. requests never times out on its own, so an explicit
# timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly on any HTTP error status

# Parse the downloaded tab-separated text into a pandas DataFrame
data = pd.read_csv(StringIO(response.text), sep='\t')

# Show the first rows to confirm that the data loaded
print(data.head())


  subreddit_id     subreddit            id  \
0      4855832  changemyview  2.663555e+10   
1      4855832  changemyview  2.663557e+10   
2      4855832  changemyview  2.663558e+10   
3      4855832  changemyview  2.663562e+10   
4      4855832  changemyview  2.663564e+10   

                                           permalink     link_id  \
0  https://www.reddit.com/r/changemyview/comments...  75326877.0   
1  https://www.reddit.com/r/changemyview/comments...  75326877.0   
2  https://www.reddit.com/r/changemyview/comments...  75326877.0   
3  https://www.reddit.com/r/changemyview/comments...  75326877.0   
4  https://www.reddit.com/r/changemyview/comments...  75326877.0   

   parent_comment_id          created_utc  author_id      author  \
0                NaN  2013-02-19 23:49:18        NaN    rhydeble   
1       2.663555e+10  2013-02-20 00:14:30        NaN  dichotomie   
2                NaN  2013-02-20 00:21:35        NaN   [deleted]   
3       2.663558e+10  2013-02-20 01:01:47 

In [None]:
import spacy
import string

# 安装spaCy
!pip install spacy
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Text preprocessing helper
def preprocess_text(text):
    """Lemmatise `text`, dropping stop words, punctuation and whitespace tokens.

    Parameters
    ----------
    text : object
        Raw comment body. Anything that is not a ``str`` (for example the NaN
        pandas uses for missing values) is treated as empty text.

    Returns
    -------
    str
        The lemmas of the remaining tokens joined by single spaces, or ``""``
        when there is nothing to process.
    """
    # Missing or empty bodies carry no text, so skip the spaCy pipeline entirely.
    if not isinstance(text, str) or not text:
        return ""
    # Run the spaCy pipeline over the text
    doc = nlp(text)
    # Whitespace tokens (such as the blank lines between paragraphs) are neither
    # stop words nor punctuation, so they have to be filtered out explicitly.
    return ' '.join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )

# Clean every comment body and keep the result next to the original text
data['cleaned_body'] = data['body'].map(preprocess_text)

# Print the first rows of both columns to check the result
print(data.loc[:, ['body', 'cleaned_body']].head())

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
                                                body  \
0  You might want to ask when someone is "relativ...   
1  This is good point but I would say overall peo...   
2  I like where you are going, but, may I suggest...   
3  I agree with you on decriminalization/legaliza...   
4                                          [deleted]   

    

In [None]:
!pip install gensim nltk
import nltk
from gensim import corpora, models
from gensim.utils import simple_preprocess
nltk.download('stopwords')

nlp = spacy.load("en_core_web_sm")

# Extra, corpus-specific stop words
custom_stopwords = {'people', 'like', 'think', 'thing', 'wiki', 'changemyview', 'r', 'comment'}
for word in custom_stopwords:
    nlp.Defaults.stop_words.add(word)
    # Also flag the vocabulary entry itself: `token.is_stop` is read from the
    # lexeme, and extending the defaults set alone is not guaranteed to reach
    # lexemes that already exist in the loaded vocabulary.
    nlp.vocab[word].is_stop = True

# Tokenise text and drop stop words, including the custom ones
def tokenize(text):
    """Return the alphabetic, non-stop-word lemmas of `text`.

    Parameters
    ----------
    text : object
        Text to tokenise. Empty strings and non-string values (such as NaN)
        yield an empty list instead of being passed to spaCy.

    Returns
    -------
    list of str
        Lemmas of the tokens that are not stop words or punctuation, are purely
        alphabetic, and whose lemma is not one of ``custom_stopwords``.
    """
    if not isinstance(text, str) or not text:
        return []
    doc = nlp(text)
    return [
        token.lemma_
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and token.lemma_.isalpha()
        # `is_stop` looks at the surface form, but the lemma is what gets
        # emitted; check the lemma too so that a token whose lemma is a custom
        # stop word cannot slip through.
        and token.lemma_.lower() not in custom_stopwords
    ]

# One list of lemmas per cleaned comment
data['tokens'] = data['cleaned_body'].map(tokenize)

# Build the token dictionary and the bag-of-words corpus for LDA
dictionary = corpora.Dictionary(data['tokens'])
corpus = list(map(dictionary.doc2bow, data['tokens']))

# Fit the LDA model
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    random_state=100,
)

# Print the five highest-weighted words of every topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(0, '0.012*"woman" + 0.012*"man" + 0.009*"want" + 0.008*"know" + 0.007*"time"')
(1, '0.050*"r" + 0.032*"post" + 0.027*"appeal" + 0.027*"remove" + 0.025*"message"')
(2, '0.010*"right" + 0.007*"point" + 0.007*"believe" + 0.007*"mean" + 0.006*"good"')
(3, '0.016*"country" + 0.015*"american" + 0.013*"America" + 0.012*"vote" + 0.011*"Trump"')
(4, '0.012*"work" + 0.012*"money" + 0.011*"pay" + 0.011*"system" + 0.009*"school"')


In [None]:
# Topic distribution of every document in the corpus (despite its name, this
# holds one list of (topic_id, probability) pairs per document)
top_documents_per_topic = lda_model.get_document_topics(corpus, minimum_probability=0.0)

# One bucket per topic. The topic count is read from the model instead of being
# hard-coded, so it cannot drift from the value used for training.
topic_documents = {topic_id: [] for topic_id in range(lda_model.num_topics)}

# Assign every document to its single most probable topic
for i, doc_topics in enumerate(top_documents_per_topic):
    # Only the best pair is needed, so take the maximum instead of sorting
    main_topic, prob = max(doc_topics, key=lambda pair: pair[1])

    # Remember the document position together with its probability
    topic_documents[main_topic].append((i, prob))

# For every topic, print the 10 documents with the highest probability
for topic, doc_list in topic_documents.items():
    # Order by probability, highest first, and keep the first 10
    doc_list.sort(key=lambda x: x[1], reverse=True)
    top_docs = doc_list[:10]

    print(f"Topic {topic}:")
    for doc_index, prob in top_docs:
        print(f"Prob: {prob:.4f} | Document: {data.iloc[doc_index]['body']}")
    print("\n")


Topic 0:
Prob: 0.9933 | Document: &gt; maladaptive trait

There are quite a lot of [biological reasons why homosexuality may be positive or at least neutral on a population level](http://www.bbc.com/news/magazine-26089486).

The "gay uncle hypothesis"suggests that while gay people may not pass on their *own* genes, they actually increase the likelihood that the children of their closely related relatives will survive.  So having a certain percentage of an animal community be gay is actually beneficial for the community.  You've got extra adults who are able to bring in food and care for children.

It's also possible that the genes that cause homosexuality are actually beneficial to *heterosexuals*.  Women tend to prefer men with somewhat feminine behavior and appearance over extremely masculine men, so these "femininizing" genes are preserved in the population's gene pool.  Some men end up with a little extra feminization and end up gay, while others get a *lot* of feminization and end

In [None]:
# Column layout of the exported table
columns = ['Topic', 'Probability', 'Document']

# Collect one record per selected document. The DataFrame is built once at the
# end: concatenating onto an empty frame inside the loop copies the whole frame
# on every iteration and is a deprecated pattern in recent pandas versions.
records = []
for topic, doc_list in topic_documents.items():
    # Order by probability, highest first, and keep the first 10
    doc_list.sort(key=lambda x: x[1], reverse=True)
    for doc_index, prob in doc_list[:10]:
        records.append({
            'Topic': topic,
            'Probability': prob,
            'Document': data.iloc[doc_index]['body'],
        })

# Passing `columns` keeps the header even when no record was collected
top_themed_comments = pd.DataFrame(records, columns=columns)

# Save to a CSV file
top_themed_comments.to_csv('top_themed_comments.csv', index=False)