In [1]:
import re
import jieba.posseg as pseg
from functions import *

In [2]:
import emoji

In [3]:
import pandas as pd

In [4]:
import stanza
CORENLP_SERVER_URL = 'http://localhost:9000'
client = stanza.Pipeline('zh', tokenize_pretokenized=True, processors='tokenize, pos,lemma, depparse')

2025-04-09 20:34:42 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2025-04-09 20:34:44 INFO: Downloaded file to /Users/xiangningxu/stanza_resources/resources.json
2025-04-09 20:34:44 INFO: "zh" is an alias for "zh-hans"
2025-04-09 20:34:47 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package          |
--------------------------------
| tokenize  | gsdsimp          |
| pos       | gsdsimp_charlm   |
| lemma     | gsdsimp_nocharlm |
| depparse  | gsdsimp_charlm   |

2025-04-09 20:34:47 INFO: Using device: cpu
2025-04-09 20:34:47 INFO: Loading: tokenize
2025-04-09 20:34:47 INFO: Loading: pos
2025-04-09 20:34:49 INFO: Loading: lemma
2025-04-09 20:34:49 INFO: Loading: depparse
2025-04-09 20:34:49 INFO: Done loading processors!


In [5]:
def parse_dependencies(text):
    """
    Parse dependencies using Stanford CoreNLP for Chinese.
    """
    if not text.strip():
        return None
    doc = client(text)
    dependencies = []
    for sentence in doc.sentences:
        for word in sentence.words:
            dependencies.append({
                "word": word.text,
                "head": word.head,  # Index of the head word (0 if root)
                "relation": word.deprel,  # Dependency relation
                "governor": sentence.words[word.head - 1].text if word.head > 0 else "ROOT"
            })
    return dependencies

In [6]:
custom_words_file = 'custom_words.txt'
add_words_from_file(custom_words_file)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/13/k9tvrh6d47bcwm8lgp_tklmw0000gn/T/jieba.cache
Loading model cost 0.728 seconds.
Prefix dict has been built successfully.


In [7]:
def tokenize(text):
    tokens = [word.word for word in pseg.cut(text)]#if word.flag!='m'
    return ' '.join(tokens)

In [8]:
def convert_emoji(sentence):
    sentence = emoji.demojize(sentence)
    sentence = re.sub(r":cow_face::horse_face:", "牛马", sentence)
    sentence = re.sub(r'\[[^\[\]]*\]','', sentence)
    return sentence

In [9]:
def preprocess(sentence):
    #remove @ or 回复@ handle
    sentence = re.sub(r'回复|[\u4e00-\u9fff]@[^:]*:','', sentence)
    sentence = re.sub('@[^:]* ','', sentence)
    
    sentence = re.sub(r'[\s]', '，',sentence)
    sentence = re.sub(r'[\n\t\s]*', '', sentence)
    sentence = re.sub(r'我觉得', '',sentence)
    sentence = re.sub(r'[“+”（）]','', sentence)
    sentence = re.sub(r'%','百分之', sentence)
    return sentence

In [10]:
def split2chunks(sentence):
    chunks = re.split(r'[。！？?，, （）\[\];:、……]', sentence) #split by punctuations
    chunks = [chunk.strip() for chunk in chunks if chunk.strip()] #remove any empties
    chunks = [chunk for chunk in chunks if len(chunk)>2]
    return chunks

sentence = "对于孩子，我有义务给他一个健康快乐的童年，而不是和我一样充满压抑和对未来都是恐惧的童年。但是，我清楚的知道我目前没用这个能力，也没这个底气，我对未来一直都是抱着一种消极的态度，并且虽然没用检查过，但是时不时情绪大波动的我应该是有一点心理疾病。表现在耐心不足上，可对孩子最需要的就是耐心。所以我对家里催的态度说不。尽管他们说我没有孩子以后会多不好，但是我宁愿只是自己不好，毕竟孩子没法自己选择家庭不是吗？"

sentence = convert_emoji(sentence)

sentence = preprocess(sentence)

sentence

chunks = split2chunks(sentence)

print(chunks)

tokens = [tokenize(text) for text in chunks]

tokens

refine_n_revise(tokens)

In [11]:
def refine_n_revise(tokens):
    dep_token = []
    for token in tokens:
        #prepare the list for results
        new_token = []
        
        #initialize the substitution and removal lists
        sub_dict = {}
        remove_list = []
        
        #label dependencies with parser
        label_deps = parse_dependencies(token)
        
        #identify root
        try:
            root = next(dep['word'] for dep in label_deps if dep['relation']=='root')
        except TypeError:
            print('type error at comment', token)
            continue
            

        #identify subjects
        subject = [dep['word'] for dep in label_deps if dep['relation']=='nsubj']
        
        
        for dep in label_deps:
            #孩子相关
            child = {'孩子','小孩','宝宝','小孩儿','崽','崽崽','男孩','女孩','崽子','儿子','一个人','宠物','猫','狗'}
            verb = {'给','有','生','没','抱','带','养','爱','不要','要'}
            if dep['relation']=='obj' and dep['word'] in child and dep['governor'] in verb :
                dep['word'] = dep['governor']+dep['word']
                remove_list.append(dep['governor'])

            #connect root to its negation
            if dep['relation'] == 'advmod' and dep['word'] in {'不','没'} and len(dep['governor'])<=3:
                sub_dict.update([(dep['governor'], dep['word']+dep['governor'])])
                remove_list.append(dep['word'])
            
            #connect object to its negating adjective
            if dep['relation'] == 'obj' and dep['governor'] in {'没'} and len(dep['word'])<=3:
                sub_dict.update([(dep['word'], dep['governor']+dep['word'])])
                remove_list.append(dep['governor'])
                
            #connect verb to a different negation
            if dep['word']=='不了' and len(dep['governor'])<=2:
                sub_dict.update([(dep['governor'], dep['governor']+'不了')])
                remove_list.append('不了')
                
            #connect very to a different negation
            if dep['word']=='不好' and dep['governor'] in {'教','带','养','照顾','教育','做','当'}:
                sub_dict.update([(dep['governor'],dep['governor']+'不好')])
                remove_list.append('不好')

            #connect relatives to determiner
            relatives = {'爸爸','妈妈','父母','爸妈','父亲','母亲','亲妈','爹'}
            if dep['word'] in {'我'} and dep['governor'] in relatives:
                sub_dict.update([(dep['governor'], dep['word'] + '的' + dep['governor'])])

            #if root has aux, keep it (lets deal with it like this for now)
            if dep['governor']==root and dep['relation'] in {'aux','nmod','nmod:tmod','advcl'}:
                dep['relation']='xcomp'

            #if subject has compount, keep it
            if dep['governor'] in subject and dep['relation']=='compound':
                dep['relation']='xcomp'

        #take the designated types of dependencies and put them in results
        filterbylist = {'root','nsubj','nsubj:pass', 'obj','xcomp','csubj','amod'}
        filtered_deps = [dep for dep in label_deps if dep['relation'] in filterbylist]
        new_token.extend(list(set(dep['governor'] for dep in filtered_deps if dep['governor']!="ROOT")))
        new_token.extend([dep['word'] for dep in filtered_deps if dep['word']!=root and dep['word'] not in new_token])
        
        #if the root happens to get left out, put it in
        if root not in new_token:
            new_token.append(root)
        
        # substitute key terms and remove redundants
        if sub_dict!={}:
            new_token = [sub_dict.get(token, token) for token in new_token]

        if remove_list!=[]:
            new_token = [token for token in new_token if token not in remove_list]
        dep_token.append(new_token)
    
    return [item for sublist in dep_token for item in sublist]

In [12]:
the_doc = pd.read_csv("oid323836485_simplified.csv")
the_doc = the_doc[the_doc.vote==2]

In [13]:
comments = [comment for comment in the_doc.content]

In [14]:
comments_tokens = []
for comment in comments:
    sentence = preprocess(comment)
    sentence = convert_emoji(sentence)
    chunks = split2chunks(sentence)
    tokens = [tokenize(text) for text in chunks]
    new_tokens = refine_n_revise(tokens)
    comments_tokens.append(new_tokens)

In [15]:
len(comments_tokens)

1943

In [16]:
replacement_rules = {}
add_replacements('replacement_rules2.txt', replacement_rules)

{'我的妈妈': '我妈',
 '我的妈': '我妈',
 '我的母亲': '我妈',
 '我的亲妈': '我妈',
 '老妈': '我妈',
 '妈': '妈妈',
 '爸爸妈妈': '爸妈',
 '父母': '爸妈',
 '爹妈': '爸妈',
 '我的爸妈': '我的父母',
 '我的父亲': '我爸',
 '我的爸': '我爸',
 '父亲': '爸爸',
 '爸': '爸爸',
 '爹': '爸爸',
 'ta': '我的孩子',
 '小孩': '小孩子',
 '小孩儿': '小孩子',
 '小朋友': '小孩子',
 '崽': '小孩子',
 '崽崽': '小孩子',
 '娃': '孩子',
 '宝宝': '小孩子',
 '娃娃': '小孩子',
 '生小孩': '生孩子',
 '生娃': '生孩子',
 '带娃': '带孩子',
 '要孩子': '生孩子',
 '有小孩': '有孩子',
 '有娃': '有孩子',
 '给小孩': '给孩子',
 '男孩儿': '男孩',
 '女孩儿': '女孩',
 '小女孩': '女孩',
 '小男孩': '男孩',
 '我的小孩': '我的孩子',
 '养蛙': '养孩子',
 '男娃': '男孩',
 '男孩子': '男孩',
 '女娃': '女孩',
 '女孩子': '女孩',
 '娃儿': '小孩子',
 '家里': '家庭',
 '生': '生孩子',
 '生子': '生孩子',
 '结了婚': '结婚',
 '小时候': '童年',
 '女朋友': '女友',
 '女生': '女人',
 '男生': '男人',
 '闺女': '女儿',
 '不生孩子': '不生',
 '男': '男人',
 '女': '女人',
 '害怕': '怕'}

In [17]:
megatoken = {}
add_replacements('megatoken.txt', megatoken)

{'车贷': '车与房',
 '房贷': '车与房',
 '没房': '车与房',
 '没车': '车与房',
 '房子': '车与房',
 '租房': '车与房',
 '车子': '车与房',
 '买车': '车与房',
 '有车有房': '车与房',
 '车': '车与房',
 '房': '车与房',
 '买房子': '车与房',
 '房车': '车与房',
 '揭不开锅': '穷',
 '再穷': '穷',
 '穷苦': '穷',
 '贫穷': '穷',
 '没钱': '穷',
 '世上': '世界',
 '世间': '世界',
 '人世间': '世界',
 '人间': '世界',
 '讨厌': '不喜欢',
 '有钱': '钱',
 '花钱': '钱',
 '攒钱': '钱',
 '赚钱': '钱',
 '存钱': '钱',
 '挣钱': '钱'}

In [18]:
def replace_rules(tokens,replacement_rules):
    for i in range(len(tokens)):
        if tokens[i] in replacement_rules:
            tokens[i] = replacement_rules[tokens[i]]   
        else:
            continue
    return tokens

In [19]:
comments_tokens1 = [replace_rules(comment, replacement_rules) for comment in comments_tokens]

In [20]:
comments_tokens1 = [replace_rules(comment, megatoken) for comment in comments_tokens1]

In [21]:
from collections import Counter
from itertools import chain
import csv

In [22]:
flat_tokens = list(chain.from_iterable(comments_tokens1))
token_counts = Counter(flat_tokens)
sorted_tokens = sorted(token_counts.items(), key=lambda x: (len(x[0]), -x[1]))

csv_file = 'token0201.txt'

with open(csv_file, mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['Token', 'Length', 'Frequency'])
    # Write each token, its length, and frequency
    for token, freq in sorted_tokens:
        writer.writerow([token, len(token), freq])

In [23]:
stokens = [token_w_count[0] for token_w_count in sorted_tokens if token_w_count[1]>1]

In [24]:
labeled_tokens = [(token.word, token.flag) for token in pseg.cut(' '.join(stokens)) if token.word!=' ']

In [25]:
txt_file = 'single_char_tokens.txt' 
single_char_tokens = [token for token, freq in sorted_tokens if freq>1 and len(token) == 1] 
with open(txt_file, mode='w', encoding='utf-8') as file: 
    for token in single_char_tokens: file.write(token + '\n')

In [26]:
keep1c = {'why'}
with open("keep_1c_token.txt", 'r', encoding='utf-8') as f:
    for line in f:
        word = line.strip()
        keep1c.update(word)

In [27]:
final_tokens = []
coin = 0
for num in range(len(labeled_tokens)):
    if len(labeled_tokens[num][0])==1 and coin==0:
        if labeled_tokens[num][1] in {'a'} and sorted_tokens[num][1]<=120:
            final_tokens.append(labeled_tokens[num][0])
        elif labeled_tokens[num][0] in keep1c:
            final_tokens.append(labeled_tokens[num][0])
    
    if len(labeled_tokens[num][0])==2:
        coin+=1
        if labeled_tokens[num][1] in {'a','v','n','vn', 'an','nr'}:
            final_tokens.append(labeled_tokens[num][0])
    
final_tokens= final_tokens+['我妈','我爸','童年','不懂','不爱','不多','没钱']

In [28]:
threenfour = [token_w_count[0] for token_w_count in sorted_tokens if len(token_w_count[0]) in {3,4} and token_w_count[1]>3]
final_tokens = final_tokens+threenfour

In [29]:
final_tokens = [token for token in final_tokens if token not in {'觉得','感觉','感到','不了','干嘛','觉得','是因为','可能','达成','造成','有点','大','想要','不想要'}]

In [30]:
final_tokens = [token for token in final_tokens if token not in {'是否','算是','像是','是从','是不是','不就是','可不是'}]

In [31]:
len(final_tokens)

1954

In [35]:
pd.DataFrame(final_tokens).to_csv('final_tokens0205.csv')

csv_file = 'parser_token_frequency.csv'
with open(csv_file, mode='w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['Token', 'Length', 'Frequency'])
    # Write each token, its length, and frequency
    for token, freq in sorted_tokens:
        writer.writerow([token, len(token), freq])

txt_file = 'single_char_tokens.txt'
single_char_tokens = [token for token, freq in sorted_tokens if len(token) == 1]
with open(txt_file, mode='w', encoding='utf-8') as file:
    for token in single_char_tokens:
        file.write(token + '\n')

In [32]:
filtered_comments = [
    [
        token for token in comment if token in final_tokens
    ]
    for comment in comments_tokens1
]

In [33]:
from gensim.models import Word2Vec

In [79]:
similar_words = Word2Vec(filtered_comments, vector_size=50, window=12, min_count=2, workers=4,sg=1,epochs=10)

In [83]:
similar_words.wv.most_similar('小孩子', topn=15)

[('可爱', 0.9343622922897339),
 ('不喜欢', 0.9229201674461365),
 ('喜欢', 0.9047781229019165),
 ('处理', 0.8838791847229004),
 ('哭', 0.8522584438323975),
 ('情绪', 0.8372146487236023),
 ('幼崽', 0.8321914076805115),
 ('照顾不好', 0.8274630308151245),
 ('怕', 0.8250404000282288),
 ('脾气', 0.8204468488693237),
 ('厌恶', 0.8194323778152466),
 ('不稳定', 0.8174711465835571),
 ('外甥', 0.8164674639701843),
 ('惹事', 0.8122392892837524),
 ('狗狗', 0.8067873120307922)]

In [36]:
similar_words.wv.similarity('婚','结婚')

0.8430261

In [47]:
from gensim import corpora, models

In [48]:
final_tokens1 = [token for token in final_tokens if token not in {'孩子','小孩子','生孩子','没有','不想','不生','问题','时候'}]
filtered_comments1 = [
    [
        token for token in comment if token in final_tokens1
    ]
    for comment in comments_tokens1
]

In [49]:
dictionary = corpora.Dictionary(filtered_comments1)
dictionary.filter_extremes(no_below=2, no_above=0.8, keep_n=500)
corpus = [dictionary.doc2bow(comment) for comment in filtered_comments1]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [61]:
lda_model = models.LdaModel(corpus_tfidf, num_topics=20, id2word=dictionary, passes=30)

In [87]:
for idx, topic in lda_model.print_topics():
    print(f"Topic: {idx+1} \nWords: {topic}\n")

Topic: 1 
Words: 0.036*"多" + 0.034*"领养" + 0.033*"原因" + 0.023*"玩" + 0.020*"共情" + 0.018*"家庭" + 0.018*"条件" + 0.017*"受罪" + 0.016*"希望" + 0.016*"意义"

Topic: 2 
Words: 0.055*"世界" + 0.053*"自私" + 0.048*"妈妈" + 0.034*"劝" + 0.032*"受苦" + 0.031*"来到" + 0.027*"不活" + 0.027*"体验" + 0.023*"明白" + 0.021*"自由"

Topic: 3 
Words: 0.048*"我妈" + 0.027*"信心" + 0.023*"经历" + 0.021*"决定" + 0.021*"负担" + 0.019*"情绪" + 0.018*"状态" + 0.018*"准备" + 0.016*"做好" + 0.015*"不稳定"

Topic: 4 
Words: 0.042*"责任" + 0.035*"承担" + 0.034*"照顾" + 0.031*"无所谓" + 0.030*"婚" + 0.022*"生不生" + 0.021*"愿意" + 0.021*"人家" + 0.021*"能力" + 0.019*"选择"

Topic: 5 
Words: 0.034*"想想" + 0.025*"操心" + 0.024*"提供" + 0.021*"事情" + 0.020*"无法" + 0.019*"拒绝" + 0.018*"低" + 0.018*"崩溃" + 0.018*"满足" + 0.017*"病"

Topic: 6 
Words: 0.077*"苦难" + 0.073*"养孩子" + 0.058*"喜欢" + 0.056*"怕" + 0.053*"催" + 0.052*"猫" + 0.047*"结婚" + 0.034*"养不好" + 0.026*"穷" + 0.021*"应该"

Topic: 7 
Words: 0.088*"穷" + 0.028*"家庭" + 0.023*"思想" + 0.019*"改变" + 0.017*"影响" + 0.017*"接受" + 0.016*"祸害" + 0.015*"年龄" + 0.015*"不知

In [63]:
doc_topic_dist = []
for doc in corpus:
    doc_topics = lda_model.get_document_topics(doc, minimum_probability=0.0)
    doc_topic_dist.append([topic[1] for topic in doc_topics])

In [64]:
doc_topic_df = pd.DataFrame(doc_topic_dist, columns=[f"Topic {i+1}" for i in range(lda_model.num_topics)])
doc_topic_df["Document"] = [comment for comment in comments]

In [90]:
top_docs_per_topic = {}
for topic_num in range(lda_model.num_topics):
    sorted_docs = doc_topic_df.sort_values(by=f"Topic {topic_num+1}", ascending=False)
    top_docs_per_topic[f"Topic {topic_num+1}"] = sorted_docs.head(7)  # Top 3 documents for this topic

In [91]:
for topic, top_docs in top_docs_per_topic.items():
    print(f"\n{topic}:")
    print(top_docs[["Document", f"{topic}"]])


Topic 1:
                                               Document   Topic 1
650   我不想因为自己没金钱没时间没能力还要生孩子然后让ta受苦让ta自卑，比起ta懂事听话地说不用...  0.932137
64          暂时不要，等我有能力赚到钱了，或者人口少到一定地步，买房子生孩子成本降低的时候再考虑生  0.881245
195   人活一辈子无非受苦受难真正开心的日子感觉不超过100天所以除非家里真的有矿要不就别生了苦孩子...  0.864284
1131  深有感触，所以作为老大的我，真的无法对我爸妈有太大的共情，当初决定生四胎的是他们，很大一定程...  0.864279
1309  生孩子的意义是什么呢？各有说法没有正答，我觉得能找到自己生孩子的意义很重要，但现实里可能是被...  0.864277
1713                 我选择不生的原因是因为现在社会已经够卷够苦了，何必生孩子让孩子受罪呢  0.841665
364       能力不行的人生孩子既毁了自己也毁了孩子。因为原生家庭的原因，我对结婚生儿育女没有任何期待。  0.841665

Topic 2:
                                               Document   Topic 2
447                  呃我活的太幸福了，我只能享受被爱，我不想付出我就是自私，自私活的开心  0.894440
827   其实自私在这方面也有不同的表现。。有人选择生孩子拖孩子下水，还有我这种怪人，做不到无私地利他...  0.881245
759        如果有机会做选择我不愿意来到这个世界上，既然我不想要为什么要让另一个人也体验一下[微笑]  0.864283
44    我只是小姨，陪外甥睡觉，小孩子晚上睡觉很不老实，经常踢被子，而且有时会睡着睡着哭起来，作为一...  0.864278
492                    孩子只会重复我经受的苦难，不让他来到这个世界是我对他最大的善意。  0.841665
1486                  我对我的孩子最大的爱就是别让他or她来到这个世界上受苦受难[笑哭] 

In [94]:
comments[1654]

'但凡看完生育的各种疾病风险、进一步再看几部论文或纪录片，还能坚定生孩子的女性，我是真的佩服，我是绝对不干这亏到姥姥家的事了，我不劝别人 别人也别劝我好吧。'

In [80]:
def kmeans_clustering(model, num_clusters):
    words = list(model.wv.index_to_key)
    word_vectors = model.wv.vectors
    
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', random_state=42)
    kmeans.fit(word_vectors)
    
    clusters = {}
    for i, word in enumerate(words):
        cluster = kmeans.labels_[i]
        if cluster not in clusters:
            clusters[cluster] = [word]
        else:
            clusters[cluster].append(word)
    
    return clusters

In [81]:
clusters = kmeans_clustering(similar_words, 10)

  super()._check_params_vs_input(X, default_n_init=10)


In [79]:
from sklearn.cluster import KMeans

In [82]:
clusters

{2: ['孩子',
  '没有',
  '生活',
  '不能',
  '能力',
  '条件',
  '我的孩子',
  '教育',
  '考虑',
  '时间',
  '环境',
  '我的父母',
  '童年',
  '保证',
  '足够',
  '给孩子',
  '受苦',
  '给不了',
  '家长',
  '办法',
  '快乐',
  '最好',
  '健康',
  '没办法',
  '成长',
  '下一代',
  '提供',
  '陪伴',
  '耐心',
  '拥有',
  '信心',
  '物质条件',
  '指望',
  '培养',
  '满足',
  '物质',
  '能够',
  '做到',
  '担心',
  '需求',
  '机会',
  '生活质量',
  '给予',
  '资源',
  '充足',
  '富足',
  '富裕',
  '金钱',
  '心态',
  '良好',
  '健全',
  '挣扎'],
 9: ['生孩子',
  '不生',
  '不会',
  '知道',
  '身体',
  '自私',
  '希望',
  '需要',
  '一个人',
  '经历',
  '苦难',
  '社会',
  '明白',
  '死',
  '肯定',
  '决定',
  '一辈子',
  '付出',
  '孤独',
  '领养',
  '最大',
  '作为',
  '不让',
  '带孩子',
  '简单',
  '基因',
  '风险',
  '重要',
  '性格',
  '影响',
  '想想',
  '看看',
  '支持',
  '美好',
  '理由',
  '告诉',
  '改变',
  '感受',
  '期待',
  '不稳定',
  '遗传',
  '普通人',
  '精神',
  '得到',
  '延续',
  '不确定',
  '婚姻',
  '优秀',
  '更何况',
  '疾病',
  '不幸福',
  '怕疼',
  '伟大',
  '清楚',
  '变得',
  '无所谓',
  '意义',
  '没能力',
  '状态',
  '行为',
  '善良',
  '照顾不好',
  '抚养',
  '心理',
  '脾气',
  '不适合',
  '实现',
  '老一辈',
  '才能',