In [None]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict

# Caixin hosts the 60 entries of "Fang Fang's Diary" on the topic page below.
base_url = r"http://m.app.caixin.com/m_topic_detail/1489.html"

# Collect the URL of every diary entry into a dict (handy for building a
# DataFrame later). Keys are the position of the <li> on the page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of parsing an error page.
response = requests.get(base_url, headers=headers, timeout=10)
response.raise_for_status()
base_page_content = response.text
soup = BeautifulSoup(base_page_content, "lxml")
diary_data = defaultdict(list)

# Per the original author's inspection of the page, the diary information lives
# inside <li></li> elements, so every <li> is examined.
for index, li in enumerate(soup.find_all('li')):
    date_tag, link = li.em, li.a
    # An <li> without both an <em> and an <a> cannot be a diary entry: skip it
    # explicitly rather than catching whatever exception the lookup would raise.
    if date_tag is None or link is None:
        continue
    title, href = link.get('title'), link.get('href')
    if title is None or href is None:
        continue
    # Store [publication date, title, URL] for this entry.
    diary_data[index] = [date_tag.text, title, href]

In [None]:
import time
# 定义一个函数，获取日记正文
def get_diary(url):
    """Download one diary page and return the text of its body.

    Parameters
    ----------
    url : str
        Address of a single diary page.

    Returns
    -------
    str
        Text of the page's ``<div class="blog_content">`` element, or an empty
        string when the download fails or the element is missing (the problem
        is printed in either case).
    """
    time.sleep(0.5)  # wait 0.5 s between requests to avoid hitting a rate limit
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        # The original built `headers` but never sent them with the request.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return ''
    soup = BeautifulSoup(response.text, 'lxml')
    body = soup.find('div', {'class': 'blog_content'})
    if body is None:
        # Previously this path printed an AttributeError and then crashed with
        # UnboundLocalError at the return statement.
        print('No blog_content element found in {}'.format(url))
        return ''
    return body.text

for key in diary_data:
    title, url = diary_data[key][1], diary_data[key][2]
    # Each page ends with a repeated trailer unrelated to the diary; cut it off.
    # NOTE(review): the original comment said 18 characters but the slice
    # removes 19 - confirm which one is intended.
    content = get_diary(url)[:-19]
    if not content:
        # Nothing usable was downloaded; do not leave an empty file behind.
        continue
    # Also save a text file encoded as GBK, because (per the original author)
    # the Baidu sentiment API only accepts GBK. errors='ignore' drops the odd
    # character GBK cannot represent instead of aborting the whole download.
    # The extension is '.txt': the original '\.txt' put a literal backslash in
    # the file name, which is a path separator on Windows.
    with open(title + '.txt', 'w', encoding='gbk', errors='ignore') as f:
        f.write(content)

接下来是清理数据，改文件名。因为发表日期和日记日期不一定相同，而且方方有时候用公历，有时候用旧历，需要一些手动的操作。最后文件名统一格式为"nn-yyyymmdd.txt"。

In [None]:
# 连接百度情感倾向分析API
from aip import AipNlp

import os

# Read the Baidu credentials from environment variables so real keys never get
# saved inside the notebook. The original placeholders remain as fallbacks, so
# editing them in place still works.
APP_ID = os.environ.get('BAIDU_APP_ID', 'yourid')
API_KEY = os.environ.get('BAIDU_API_KEY', 'your api key')
SECRET_KEY = os.environ.get('BAIDU_SECRET_KEY', 'your secret key')

# Client object used by the sentiment-analysis cells below.
client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

In [None]:
# 定义两个函数，用于获取情感倾向值

import collections
from statistics import mean

def get_sentiment(senti_list):
    """Reduce a list of per-chunk sentiment labels to one label for the text.

    The label is nominal data, so it cannot be averaged; the most frequent
    label is returned instead. When labels 0, 1 and 2 all occur equally often
    (including not at all), the result is 1.
    """
    tally = collections.Counter(senti_list)
    # A set collapses equal counts, so one element means a three-way tie.
    if len({tally[0], tally[1], tally[2]}) == 1:
        return 1
    ranked = tally.most_common()
    top_label, _ = ranked[0]
    return top_label
        
def _classify_chunk(chunk):
    """Send one piece of text to the Baidu client and return its first result item."""
    result = client.sentimentClassify(chunk)
    print(result['items'])
    return result['items'][0]


def analyzeSentiment(text, chunk_size=500):
    """Compute the sentiment values of one diary file.

    Parameters
    ----------
    text : str
        Path of a GBK-encoded text file (despite its name, this is the file
        path, not the text itself).
    chunk_size : int, optional
        Maximum number of characters sent per request; must be positive. Per
        the original author's note the service accepts at most 2047 bytes per
        call, and 500 GBK characters (at most two bytes each) stay below that.

    Returns
    -------
    list
        ``[positive_prob, negative_prob, sentiment, confidence]``. For a file
        longer than ``chunk_size`` the probabilities and the confidence are the
        arithmetic means over all chunks and the sentiment is chosen by
        ``get_sentiment``.
    """
    with open(text, 'r', encoding='gbk') as f:
        content = f.read()

    if len(content) <= chunk_size:
        chunks = [content]
    else:
        k = len(content) // chunk_size
        tail = content[k * chunk_size:]
        heads = [content[i * chunk_size:(i + 1) * chunk_size] for i in range(k)]
        # Same order as before: the short tail first, then the k full chunks.
        # When the length is an exact multiple of chunk_size the tail is empty;
        # the original still sent that empty string to the API.
        chunks = ([tail] if tail else []) + heads

    items = []
    for position, chunk in enumerate(chunks):
        if position:
            time.sleep(0.6)  # pause between consecutive requests
        items.append(_classify_chunk(chunk))

    if len(items) == 1:
        only = items[0]
        return [only['positive_prob'], only['negative_prob'],
                only['sentiment'], only['confidence']]

    positive_prob = mean(item['positive_prob'] for item in items)
    negative_prob = mean(item['negative_prob'] for item in items)
    sentiment = get_sentiment([item['sentiment'] for item in items])
    confidence = mean(item['confidence'] for item in items)
    return [positive_prob, negative_prob, sentiment, confidence]

In [None]:
# 读取文本，获取情感倾向值

import time
import glob
import os
import pandas as pd
import numpy as np

os.chdir(r'your directory')
# Only files whose name contains a dash are picked up (the cleaned files are
# named nn-yyyymmdd.txt). glob returns names in no guaranteed order, so they are
# sorted to keep the rows, and the later line plot, in diary order.
diaries = sorted(diary for diary in glob.glob('*.txt') if '-' in diary)

columns = ['number', 'date', 'positive_prob', 'negative_prob', 'sentiment', 'confidence']

# Collect one record per diary and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and passing a mixed row through
# np.array turned every value into a string.
records = []
for diary in diaries:
    time.sleep(0.6)  # the Baidu API allows two calls per second, so wait 0.6 s
    print('Processing '+diary)
    # File names look like nn-yyyymmdd.txt: diary number, dash, diary date.
    number, date = os.path.splitext(diary)[0].split('-', 1)
    senti_values = analyzeSentiment(diary)
    records.append([number, date, senti_values[0], senti_values[1], senti_values[2], senti_values[3]])

df = pd.DataFrame(records, columns=columns)

In [None]:
# Use the diary number as the index.
df = df.set_index('number')

# The frame was built without explicit dtypes, so cast every column here:
# 'date' becomes a datetime column and each remaining column becomes numeric.
converters = {
    column: pd.to_datetime if column == 'date' else pd.to_numeric
    for column in df.columns
}
for column, convert in converters.items():
    df[column] = convert(df[column])


In [None]:
# 现在可以看一下积极、消极概率的折线图了
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [20, 8]

number = df.index
pos_prob = df['positive_prob']
# Deliberately not called `np`: the original rebound the numpy alias to this
# Series, which broke every later `np.` call in the notebook.
neg_prob = df['negative_prob']

# Plot against explicit positions and label the ticks with the diary numbers.
# The original `plt.xticks = number` replaced the pyplot function with the
# index instead of setting any ticks.
positions = list(range(len(number)))
plt.plot(positions, pos_prob, color='red', label='positive_prob')
plt.plot(positions, neg_prob, color='green', label='negative_prob')
plt.xticks(positions, list(number))
plt.legend()

plt.title('Sentiment Analysis', fontsize=20)
plt.xlabel('diary', fontsize=15)
plt.ylabel('probability', fontsize=15)
plt.show()

In [None]:
# Bar labels in label order 0, 1, 2 (the 'negtive' typo is corrected).
sentiment = ('negative', 'neutral', 'positive')

# Count the diaries per sentiment label straight from the data instead of the
# hard-coded [41, 1, 18] (presumably the counts from the author's own run).
# reindex keeps the bars in label order and fills a missing label with 0.
# NOTE(review): assumes 0/1/2 mean negative/neutral/positive - confirm against
# the Baidu API documentation.
freq = df['sentiment'].value_counts().reindex([0, 1, 2], fill_value=0).tolist()

# The unused `y_pos = np.arange(...)` is dropped: `np` is rebound to a Series
# by the line-plot cell, so that line failed before the chart was drawn.
plt.bar(sentiment, freq, align='center', alpha=0.5)
plt.ylabel('freq')
plt.title('Sentiment Analysis')
plt.show()


In [None]:
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Diary files to turn into word clouds, in the order they are displayed.
filenames = [
    '01-20200125.txt',
    '23-20200216.txt',
    '32-20200225.txt',
    '15-20200208.txt',
    '27-20200220.txt',
    '58-20200322.txt',
]
# Font file passed to WordCloud as font_path.
fontpath = 'msyh.ttc'

def make_cloud(file):
    """Build a word cloud from one diary file.

    Parameters
    ----------
    file : str
        Path of a GBK-encoded diary text file.

    Returns
    -------
    WordCloud
        Cloud generated from the jieba-segmented text with every word listed
        in ``cn_stopwords.txt`` removed, rendered with the font at ``fontpath``.
    """
    with open('cn_stopwords.txt', 'r', encoding='utf8') as f:
        # A set gives constant-time membership tests for every token.
        stopwords = {line.strip() for line in f}
    # The diary files are written and read as GBK elsewhere in this notebook;
    # without an explicit encoding the platform default decides and may fail.
    with open(file, 'r', encoding='gbk') as f:
        text = f.read()
    text = " ".join(word for word in jieba.cut(text) if word not in stopwords)
    return WordCloud(font_path=fontpath).generate(text)
    

for file in filenames:
    wordcloud = make_cloud(file)
    print('wordcloud for {}'.format(file))
    %pylab inline
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()