In [1]:
#_*_ 性别识别：启发式方法，姓名的最后几个字符可以界定性别特征 _*_#
import random
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy

# 提取输入单词的特征
def gender_features(word, num_letters=2):
    return {'feature': word[-num_letters:].lower()}

# 提取标记
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])

# 生成随机种子数，并搅乱训练数据
random.seed(7)
random.shuffle(labeled_names)

input_names = []
for i in range(1,5):
    print('\nNumbers of letters :',i)
    featuresets = [ (gender_features(n,i),gender) for (n, gender) in labeled_names]

    train_set, test_set = featuresets[500:], featuresets[:500]
    classifier = NaiveBayesClassifier.train(train_set)
    print('\nAccuracy =',str(100*nltk_accuracy(classifier, test_set)) + str('%'))

    for name in input_names:
        print(name,'-->',classifier.classify(gender_features(name, i)))

SyntaxError: unexpected EOF while parsing (<ipython-input-1-b6934ef8bfaf>, line 20)

In [7]:
#_*_ 情感分析：分析人对某个特定主题的看法（积极、消极、中性） _*_#
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

# NLTK分类器的数据需要以字典格式存储
def extract_features(word_list):
    return dict([(word, True) for word in word_list])

# load datasets
positive_fileids = movie_reviews.fileids('pos')
negative_fileids = movie_reviews.fileids('neg')

# 区分积极与消极评论
features_positive = [(extract_features(movie_reviews.words(fileids=[f])),'Positive')
                    for f in positive_fileids]
features_negative = [(extract_features(movie_reviews.words(fileids=[f])),'Negative') 
                     for f in negative_fileids]

threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(features_positive))
threshold_negative = int(threshold_factor * len(features_negative))

# extract features
features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]

print("\nNumber of training datapoints:",len(features_train))
print('\nNumber of testing datapoints:',len(features_test))

# training and valuate
classifier = NaiveBayesClassifier.train(features_train)
print('\nAccuracy of the classifier :',nltk.classify.util.accuracy(classifier, features_test))

# print
print('\nTop 10 most information words:')
for item in classifier.most_informative_features()[:10]:
    print(item[0])
    
# predict
input_reviews = [
"It is an amazing movie",
"This is a dull movie. I would never recommend it to anyone.",
"The cinematography is pretty great in this movie",
"The direction was terrible and the story was all over the place",
"outstanding insulting"
]

print("\nPredictions:")
for review in input_reviews:
    print('\nReview:',review)
    probdist = classifier.prob_classify(extract_features(review.split()))
    pred_sentiment = probdist.max()
    print("Predicted sentiment:",pred_sentiment)
    print("Probability:",round(probdist.prob(pred_sentiment), 2))


Number of training datapoints: 1600

Number of testing datapoints: 400

Accuracy of the classifier : 0.735

Top 10 most information words:
outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
affecting
animators

Predictions:

Review: It is an amazing movie
Predicted sentiment: Positive
Probability: 0.61

Review: This is a dull movie. I would never recommend it to anyone.
Predicted sentiment: Negative
Probability: 0.77

Review: The cinematography is pretty great in this movie
Predicted sentiment: Positive
Probability: 0.67

Review: The direction was terrible and the story was all over the place
Predicted sentiment: Negative
Probability: 0.63

Review: outstanding insulting
Predicted sentiment: Positive
Probability: 0.5


In [13]:
#_*_ 主题建模：识别一组文档的隐藏主题模式 _*_#
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from gensim import models, corpora
from nltk.corpus import stopwords

# load data
def load_data(input_file):
    data = []
    with open(input_file,'r') as f:
        for line in f.readlines():
            data.append(line[:-1])
    
    return data

# 预处理文本类：从输入文本中提取相关的特征
class Preprocessor(object):
    def __init__(self):
        # 默认使用正则表达式解析器
        self.tokenizer = RegexpTokenizer(r'\w+')
        
        # stop words
        self.stop_words_english = stopwords.words('english')
        
        # stemmer
        self.stemmer = SnowballStemmer('english')
        
    # 处理函数：标记解析、停用词去除、词干还原
    def process(self, input_text):
        # 标记解析
        tokens = self.tokenizer.tokenize(input_text.lower())
        # 移除停用词
        tokens_stopwords = [x for x in tokens if not x in self.stop_words_english]
        # 词干提取
        tokens_stemmed = [self.stemmer.stem(x) for x in tokens_stopwords]
        
        return tokens_stemmed
    
input_file = 'data_topic_modeling.txt'
data = load_data(input_file)
preprocessor = Preprocessor()

# 处理文本，并提取已处理好的标记
processed_tokens = [preprocessor.process(x) for x in data]

# 创建基于标记文档的字典
dict_tokens = corpora.Dictionary(processed_tokens)
print(dict_tokens)

# 字典转为文档-词矩阵
corpus = [dict_tokens.doc2bow(text) for text in processed_tokens]
print(corpus)

# 隐Dirichlet分布(LDA)做主题建模
num_topics = 2
num_words = 4
# fit
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dict_tokens, passes=25)

print('\nMost contributing words to the topics:')
for item in ldamodel.print_topics(num_topics=num_topics, num_words=num_words):
    print('\nTopic',item[0],' => ',item[1])
    

Dictionary(42 unique tokens: ['cryptographi', 'lot', 'spent', 'studi', 'time']...)
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(13, 1), (14, 1), (15, 1), (16, 1)], [(17, 1), (18, 1), (19, 1), (20, 1)], [(8, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1)], [(28, 1), (29, 1), (30, 1), (31, 1)], [(8, 1), (9, 1), (32, 1), (33, 1), (34, 1), (35, 1)], [(8, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1)]]

Most contributing words to the topics:

Topic 0  =>  0.070*"need" + 0.050*"order" + 0.030*"system" + 0.030*"understand"

Topic 1  =>  0.039*"promot" + 0.039*"talent" + 0.039*"train" + 0.039*"younger"


In [None]:
'''
主题建模，通过识别文档中最有意义、最能表征主题的词来实现主题分类

文本解析(分词) -> 去除停用词(除噪) -> 词干还原(归一化)

LDA是生成主题的模型：找出所有主题，再生成给定主题的文档
'''