# 2. segment
## 2.1 NLTK segment

In [None]:
import re
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import MWETokenizer
from nltk.corpus import wordnet
wnl = WordNetLemmatizer()
import pandas as pd


In [None]:

# Directory for result files.  It is not referenced again in the visible code;
# later cells hard-code "./result/..." paths instead.
output_path = './result'
# Load the workbook and cast every column to str (word_cut below calls
# str.replace on each post, so it needs string cells).
data = pd.read_excel("./seg.xlsx").astype(str)  # content type

In [None]:
def get_word_pos(tag):
    """Translate a POS tag (as produced by ``nltk.pos_tag``) to a WordNet POS.

    Only the start of the tag is inspected: ``J`` -> adjective, ``V`` -> verb,
    ``N`` -> noun, ``R`` -> adverb.  Any other tag returns ``None`` so the
    caller can choose its own default.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            # Resolved on demand, exactly like the branch-by-branch lookup.
            return getattr(wordnet, attr)
    return None


In [None]:
def word_cut(mytext):
    """Clean, tokenize and lemmatize one English document.

    Steps: replace every punctuation character except '-' with a space,
    tokenize, merge the custom multi-word expressions into single '-'-joined
    tokens, drop English stop words, POS-tag the remaining tokens and
    lemmatize each one using its WordNet part of speech (noun when the tag
    has no WordNet equivalent).

    Args:
        mytext: Raw text of one document (a ``str``).

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # De-punctuate in a single pass; '-' is deliberately left untouched.
    punct_to_space = str.maketrans(
        {c: ' ' for c in string.punctuation if c != '-'})
    mytext = mytext.translate(punct_to_space)

    # Word segmentation with the custom phrases merged into one token each.
    tokenizer = MWETokenizer(
        [('Python', 'programs'), ('a', 'little', 'bit'), ('a', 'lot')], separator='-')
    wordlist = tokenizer.tokenize(nltk.word_tokenize(mytext))

    # Load the stop-word list once per call and test membership against a set;
    # the previous form re-read the list for every single token.
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in wordlist if w not in stop_words]

    # Part-of-speech tagging followed by morphology reduction (lemmatization).
    refiltered = nltk.pos_tag(filtered)
    lemmas_sent = [
        wnl.lemmatize(token, pos=get_word_pos(tag) or wordnet.NOUN)
        for token, tag in refiltered
    ]
    return " ".join(lemmas_sent)


In [None]:
# Run word_cut over every value of the "post" column and store the cleaned,
# lemmatized text in a new "content_cutted" column.
data["content_cutted"] = data.post.apply(word_cut)

# Show the processed column as a quick sanity check.
print(data["content_cutted"])


Save the word segmentation results so that later runs can reuse them instead of segmenting again.

In [None]:
# NOTE: this writes to the same "./seg.xlsx" that was read at the start of the
# notebook, so the input workbook is overwritten with the added column.
data.to_excel("./seg.xlsx", index=False)


## 2.2 LDA

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print and collect the highest-weighted words of every topic.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per topic).
        feature_names: Indexable mapping from a column index to its word.
        n_top_words: Number of words to report per topic.

    Returns:
        A list with one space-joined string of top words per topic, in the
        order of ``model.components_``.
    """
    summaries = []
    for number, weights in enumerate(model.components_):
        print("Topic #%d:" % number)
        # Column indices from the largest weight down, cut to the top n.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = " ".join(feature_names[idx] for idx in ranked)
        summaries.append(words)
        print(words)
    return summaries


In [None]:
# 'age' was loaded as text; make it numeric so it can be compared below.
data['age'] = data['age'].astype(int)

n_features = 1000  # Extract 1000 feature words
# Bag-of-words counts: keep at most n_features terms and skip terms that occur
# in fewer than 10 documents or in more than half of them.
tf_vectorizer = CountVectorizer(
    max_features=n_features,
    min_df=10,
    max_df=0.5,
    stop_words='english',
    strip_accents='unicode',
)

# tf = tf_vectorizer.fit_transform(data.content_cutted)
# Only rows whose 'age' value is <= 20 are vectorized.
tf = tf_vectorizer.fit_transform(data.loc[data['age'].le(20), 'content_cutted'])


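- When building a topic model, in addition to the dictionary and corpus, we also need to specify model parameters such as the number of topics.
- The parameter names below follow gensim's `LdaModel`; the scikit-learn `LatentDirichletAllocation` used in the next cell names them differently.
- num_topics is the number of topics to generate (`n_components` in scikit-learn).
- chunksize is the number of documents used in each training chunk (`batch_size` in scikit-learn, used only with online learning).
- update_every determines how often the model parameters are updated.
- passes is the total number of training passes over the corpus (comparable to `max_iter` in scikit-learn).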


In [None]:
n_topics = 6  # topic number
# Batch variational LDA with fixed priors and a fixed seed for repeatability.
# NOTE(review): learning_offset is documented for online learning; with
# learning_method='batch' it presumably has no effect — confirm.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    doc_topic_prior=0.1,
    topic_word_prior=0.01,
    learning_method='batch',
    learning_offset=50,
    max_iter=50,
    random_state=0,
)
lda.fit(tf)

The LDA model above is built with six topics (`n_topics = 6`). Each topic is a combination of keywords, and each keyword carries a weight that reflects how much it contributes to that topic.

`n_top_words` is the number of keywords printed for each topic.

### 2.2.1 Output words for each topic

In [None]:
n_top_words = 20
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on
# releases that predate it.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
# Prints the top words of each topic and keeps them as one string per topic.
topic_word = print_top_words(lda, tf_feature_names, n_top_words)


### 2.2.2 Output each article corresponding to the topic

In [None]:
import numpy as np


In [None]:
# Topic distribution of every document in ``tf``: one row per document,
# one value per topic (the cell below picks the largest value in each row).
topics = lda.transform(tf)


In [None]:
# ``tf`` (and therefore ``topics``) only covers the rows with age <= 20.
# Assigning plain lists to the whole frame raises a length-mismatch error as
# soon as other rows exist, so the results are aligned on the subset's index.
# Rows that were not modelled are left empty (NaN) in both new columns.
modelled_index = data.index[data['age'] <= 20]

# Most probable topic per document (the first one wins on ties).
topic = ["Topic #" + str(best) for best in np.argmax(topics, axis=1)]
data['The topic number with the highest probability'] = pd.Series(
    topic, index=modelled_index)
# Full probability vector of each document, stored as one object per cell.
data['Each topic corresponds to a probability'] = pd.Series(
    list(topics), index=modelled_index)
data.to_excel("./result/data_topic_under6.xlsx", index=False)


### 2.2.3 Visualisation

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
# The pandas package version must be greater than 1.3.1


In [None]:
# Build the interactive topic visualisation, show it in the notebook and also
# export it as a standalone HTML page named after the number of topics.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# ``pyLDAvis.lda_model`` — confirm against the installed version.
pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.display(pic)
pyLDAvis.save_html(pic, 'lda_pass{}_below.html'.format(n_topics))

# The HTML file is written to the current working directory.


### 2.2.4 Perplex calculation

In [None]:
import matplotlib.pyplot as plt


In [None]:
n_max_topics = 10
plexs = []
scores = []
# Fit one model per candidate topic count (1 .. n_max_topics - 1) and record
# its perplexity and score on the training matrix.
# NOTE: the loop rebinds ``lda``, replacing the model fitted earlier.
for i in range(1, n_max_topics):
    print(i)
    lda = LatentDirichletAllocation(
        n_components=i,
        learning_method='batch',
        learning_offset=50,
        max_iter=50,
        random_state=0,
    ).fit(tf)
    plexs.append(lda.perplexity(tf))
    scores.append(lda.score(tf))


In [None]:
# Number of points to plot (the right end of the x axis).  It is clamped to
# the number of fitted models, i.e. n_max_topics - 1; asking for more would
# give x and y different lengths and make plt.plot fail.
n_t = min(9, len(plexs))
x = list(range(1, n_t + 1))
plt.plot(x, plexs[:n_t])
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.show()


In [None]:
# Print the plotted topic counts next to their perplexity values.
print(x, plexs[0:n_t])


# 3.k-means

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
#from .data_utils import *
import jieba
import matplotlib.pyplot as plt
import os
import pandas as pd
import string
os.chdir("./")


In [None]:
'''
    1. Load corpus and divide words
'''
# Load the raw posts; ``corpus`` will collect one processed string per post.
data = pd.read_excel("./dataForCode/output.xlsx")
corpus = []

# bigram
def segment_bigram(text):
    """Return the overlapping adjacent pairs of ``text`` joined by spaces.

    Every element is concatenated with its successor, so ``"abcd"`` becomes
    ``"ab bc cd"``.  Inputs with fewer than two elements give ``""``.
    """
    pairs = [left + right for left, right in zip(text, text[1:])]
    return " ".join(pairs)


# Strip surrounding whitespace from every post and run it through word_cut
# (which removes punctuation, tokenizes and lemmatizes).
# Alternative source:
#     data.loc[data['occupation'] == 'Student', 'post'].astype(str)
corpus.extend(map(word_cut, map(str.strip, data['post'].astype(str))))


On later runs, load the saved segmentation results (`seg.xlsx`) instead of segmenting again:

In [None]:
'''
    1. Load corpus and divide words
'''
data = pd.read_excel("./seg.xlsx")

# for line in data.content_cutted.astype(str):
data['age'] = data['age'].astype(int)

# The text in 'content_cutted' is already segmented, so it is used as-is;
# only rows whose 'age' value is <= 20 are kept.
corpus = data.loc[data['age'] <= 20, 'content_cutted'].astype(str).tolist()


In [None]:
# Preview the first rows of the loaded frame.
data.head()


In [None]:
'''
    2、Calculate tf-idf set as weight
'''

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# Term counts are re-weighted to tf-idf; only the first 10000 documents of the
# corpus are used.
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus[:10000]))

''' 
3. Obtain all word features in the word bag model
If the number of features is very large, the dimension can be reduced according to the weight
'''

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on
# releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    word = vectorizer.get_feature_names_out()
else:
    word = vectorizer.get_feature_names()
print("word feature length: {}".format(len(word)))

''' 
    4、The process of vectorizing text is achieved by exporting the weights here, and each row in the matrix is a vector representation of a document
'''
# Dense copy of the sparse tf-idf matrix (one row per document).
tfidf_weight = tfidf.toarray()


Plot `inertia_` against the number of clusters (elbow method)

In [None]:
import matplotlib.pyplot as plt
inertia = []
scores = []
n_max_topics = 10
# Fit k-means for k = 1 .. n_max_topics - 1 and record the inertia of each fit.
for i in range(1, n_max_topics):
    print(i)
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(tfidf_weight)
    inertia.append(kmeans.inertia_)

# Number of points to plot (the right end of the x axis).  It is clamped to
# the number of fitted models, i.e. n_max_topics - 1; asking for more would
# give x and y different lengths and make plt.plot fail.
n_t = min(9, len(inertia))
x = list(range(1, n_t + 1))
plt.plot(x, inertia[:n_t])
# The x axis counts k-means clusters, not LDA topics.
plt.xlabel("number of clusters")
plt.ylabel("inertia")
plt.show()


In [None]:
# Print the plotted cluster counts next to their inertia values.
print(x, inertia[0:n_t])


In [None]:
'''
    5、Clustering of vectors
'''

# Cluster the tf-idf document vectors into 3 groups.  No random_state is
# passed, so the model uses KMeans' default seeding.
kmeans = KMeans(n_clusters=3)
kmeans.fit(tfidf_weight)

# Print the centre of each cluster (one row per cluster, one column per
# tf-idf feature).
print("Center point coordinates:")
print(kmeans.cluster_centers_)
# Uncomment to list the cluster label assigned to every document:
# for index, label in enumerate(kmeans.labels_, 1):
#     print("index: {}, label: {}".format(index, label))




In [None]:
# Inertia is the sum of squared distances from each sample to its nearest
# cluster centre; smaller is better, and it can be used to compare values of
# the n_clusters hyperparameter.
print("Effect evaluation value:")
print("inertia: {}".format(kmeans.inertia_))

## Save the result to excel
if len(kmeans.labels_) == len(data):
    # Every row was clustered: a plain positional assignment is correct.
    data['label'] = kmeans.labels_
else:
    # The corpus was built from the rows with age <= 20 and then cut to the
    # first 10000 documents, so there are fewer labels than rows.  Align the
    # labels on the index of exactly those rows; all other rows stay empty.
    # The nullable integer dtype keeps the labels integral despite the gaps.
    clustered_index = data.index[data['age'] <= 20][:len(kmeans.labels_)]
    data['label'] = pd.Series(
        kmeans.labels_, index=clustered_index, dtype='Int64')
data.to_excel("./result/data_labeled_student.xlsx", index=False)


In [None]:
'''
    6、visualization
'''

# T-SNE algorithm is used to reduce the dimension of weights, which is more accurate than PCA algorithm, but takes a long time
tsne = TSNE(n_components=2)
decomposition_data = tsne.fit_transform(tfidf_weight)

# Split the 2-D embedding into its two coordinate lists.
x = list(decomposition_data[:, 0])
y = list(decomposition_data[:, 1])

fig = plt.figure(figsize=(10, 10))
ax = plt.axes()
# One marker per document, coloured by its k-means cluster label.
plt.scatter(x, y, c=kmeans.labels_, marker="x")
# Hide the tick marks on both axes.
plt.xticks(())
plt.yticks(())
# Save BEFORE show(): once show() returns the figure may already be closed,
# and saving afterwards writes an empty image.  The former ``aspect=1``
# argument is dropped because savefig() has no such keyword.
fig.savefig('./result/sample_below3.png')
plt.show()
