In [4]:
import os
import time
import cmfd 
import jieba  
import threading
import numpy as np 
import pandas as pd
import moralstrength
from functools import partial
from tqdm.notebook import tqdm
from multiprocessing import Pool
from moralstrength import lexicon_use
from nltk.tokenize import word_tokenize
from concurrent.futures import ThreadPoolExecutor 
from moralstrength.moralstrength import estimate_morals

## 情感词典

### 样例数据

#### 大连理工中文词典

In [4]:
# 加载数据
df_chn = pd.read_excel('text_analysis_weibo_sample.xlsx', index_col = 0)
df_chn.head()

Unnamed: 0,index,标题/微博内容,点赞,转发,评论,账号昵称UID加密,粉丝数,关注数,地域
0,34121,国债：地产行业重磅利好提振风险偏好，期债低开低走 国债期货全线收跌，10年期主力...,0,0,0,e5df796860e68f403bcf9651bab4d42e,0,0,其他
1,40230,#喜迎二十大 忠诚保平安#,0,0,0,6e35cb69ad52f20de5e28197b2e85306,405444,252,广西
2,7714,注意！事关明日教资考试！福建省教育考试院发布补充公告 福建省2022年下半年全国中小学教师...,0,0,0,e6953217442e6c06a7af23eee5e185f2,53264,2177,福建
3,27378,近日，“千年大计”雄安新区迎来五周岁生日。从“一张白纸...,0,0,0,,0,0,北京
4,15435,樊振东牛逼！,0,0,0,344af41eac516375c04dee6325e763cc,8,51,山东


In [5]:
# 加载词典
dlut = pd.read_excel('DLUT-Emotion-Lexicon.xlsx', usecols=['词语', '词性种类', '情感分类', '强度'])
dlut.head()

Unnamed: 0,词语,词性种类,情感分类,强度
0,脏乱,adj,NN,7
1,糟报,adj,NN,5
2,早衰,adj,NE,5
3,责备,verb,NN,5
4,贼眼,noun,NN,5


In [6]:
# 整理情感词典
Happy, Good, Surprise, Anger, Sad, Fear, Disgust  = [], [], [], [], [], [], []

for idx, row in dlut.iterrows():
    if row['情感分类'] in ['PA', 'PE']:
        Happy.append(row['词语'])
    if row['情感分类'] in ['PD', 'PH', 'PG', 'PB', 'PK']:
        Good.append(row['词语']) 
    if row['情感分类'] in ['PC']:
        Surprise.append(row['词语'])     
    if row['情感分类'] in ['NA']:
        Anger.append(row['词语'])    
    if row['情感分类'] in ['NB', 'NJ', 'NH', 'PF']:
        Sad.append(row['词语'])
    if row['情感分类'] in ['NI', 'NC', 'NG']:
        Fear.append(row['词语'])
    if row['情感分类'] in ['NE', 'ND', 'NN', 'NK', 'NL']:
        Disgust.append(row['词语'])
Positive = Happy + Good + Surprise
Negative = Anger + Sad + Fear + Disgust

In [7]:
emo_dlut = pd.DataFrame(columns=['length_dlut', 'positive_dlut', 'negative_dlut',
                                'anger_dlut', 'disgust_dlut', 'fear_dlut', 'good_dlut',
                                'sadness_dlut', 'surprise_dlut', 'happy_dlut'])

for dc in df_chn.index:
    positive, negative, anger, disgust, fear, sad, surprise, good, happy = 0, 0, 0, 0, 0, 0, 0, 0, 0
    wordlist = list(jieba.cut(df_chn['标题/微博内容'][dc]))
    wordset = set(wordlist)
    wordfreq = []
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Sad:
            sad += freq
        if word in Surprise:
            surprise += freq
        if word in Good:
            good += freq
        if word in Happy:
            happy += freq
            
    emotion_info = {
        'length_dlut': len(wordlist),
        'positive_dlut': positive,
        'negative_dlut': negative,
        'anger_dlut': anger,
        'disgust_dlut': disgust,
        'fear_dlut': fear,
        'good_dlut': good,
        'sadness_dlut': sad,
        'surprise_dlut': surprise,
        'happy_dlut': happy
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_dlut = pd.concat([emo_dlut, emo_info], ignore_index=True)
    
emo_dlut.head()

Building prefix dict from C:\Users\lenovo\anaconda3\Lib\site-packages\jieba\dict.txt ...
Dumping model to file cache C:\Users\lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 2.9429116249084473 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,length_dlut,positive_dlut,negative_dlut,anger_dlut,disgust_dlut,fear_dlut,good_dlut,sadness_dlut,surprise_dlut,happy_dlut
0,1947,58,13,0,8,3,45,2,0,13
1,7,1,0,0,0,0,1,0,0,0
2,418,16,1,0,1,0,14,0,0,2
3,2351,120,3,0,1,1,103,1,0,17
4,4,0,0,0,0,0,0,0,0,0


In [10]:
df_chn = pd.concat([df_chn, emo_dlut], axis=1)
# df_chn.head(1)

#### NRC情感词典（NRC Emotion Lexicon）

In [11]:
# 加载数据
df_eng = pd.read_excel('text_analysis_twitter_sample.xlsx', index_col = 0)
df_eng.head()

Unnamed: 0,index,id,screen_name,time,link,text,source
0,49374,890587249372524544,auctnr1,2017-07-27T10:58:41-04:00,https://www.twitter.com/Reuters/statuses/89058...,"RT @Reuters MORE: Top U.S. general says, given...",Twitter for iPhone
1,83246,899354463055618048,SenatorTester,2017-08-20T15:36:27-04:00,https://www.twitter.com/SenatorTester/statuses...,T-minus 2 days until our first-ever Last Best ...,Twitter Web Client
2,100988,903272105738985472,KeithRothfus,2017-08-31T11:03:46-04:00,https://www.twitter.com/KeithRothfus/statuses/...,Please know that help is available. Visit http...,Twitter Web Client
3,193395,921001114409021440,HASCRepublicans,2017-10-19T09:12:31-04:00,https://www.twitter.com/HASCRepublicans/status...,Literally flying the wings off the A-10 in fig...,Twitter Web Client
4,12662,884911451449774080,SteveKnight25,2017-07-11T19:05:05-04:00,https://www.twitter.com/SteveKnight25/statuses...,Today the House unanimously passed my bill #HR...,Twitter Web Client


In [13]:
# 加载词典
nrc = pd.read_excel('NRC-Emotion-Lexicon.xlsx', usecols='A,AP:AY')
nrc = nrc.rename(columns={'English Word':'Engword'})
nrc.head()

  warn(msg)


Unnamed: 0,Engword,Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust
0,aback,0,0,0,0,0,0,0,0,0,0
1,abacus,0,0,0,0,0,0,0,0,0,1
2,abandon,0,1,0,0,0,1,0,1,0,0
3,abandoned,0,1,1,0,0,1,0,1,0,0
4,abandonment,0,1,1,0,0,1,0,1,1,0


In [14]:
# 构建词典
Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust = [], [], [], [], [], [], [], [], [], []

for idx, row in nrc.iterrows():
    if row['Positive'] == 1:
        Positive.append(row['Engword'])
    if row['Negative'] == 1:
        Negative.append(row['Engword'])
    if row['Anger'] == 1:
        Anger.append(row['Engword'])
    if row['Anticipation'] == 1:
        Anticipation.append(row['Engword'])
    if row['Disgust'] == 1:
        Disgust.append(row['Engword'])
    if row['Fear'] == 1:
        Fear.append(row['Engword'])
    if row['Joy'] == 1:
        Joy.append(row['Engword'])
    if row['Sadness'] == 1:
        Sadness.append(row['Engword'])
    if row['Surprise'] == 1:
        Surprise.append(row['Engword'])
    if row['Trust'] == 1:
        Trust.append(row['Engword'])

In [15]:
# 计算情感
emo_nrc_eng = pd.DataFrame(columns=['length_nrc', 'positive_nrc', 'negative_nrc',
                                    'anger_nrc', 'anticipation_nrc', 'disgust_nrc', 'fear_nrc',
                                    'joy_nrc', 'sadness_nrc', 'surprise_nrc', 'trust_nrc'])

for de in df_eng.index:
    positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    text = df_eng['text'][de].lower()
    wordlist = text.split()
    wordset = set(wordlist)
    wordfreq = []
    
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive += freq
        if word in Negative:
            negative += freq
        if word in Anger:
            anger += freq
        if word in Anticipation:
            anticipation += freq
        if word in Disgust:
            disgust += freq
        if word in Fear:
            fear += freq
        if word in Joy:
            joy += freq
        if word in Sadness:
            sadness += freq
        if word in Surprise:
            surprise += freq
        if word in Trust:
            trust += freq
            
    emotion_info = {
        'length_nrc': len(wordlist),
        'positive_nrc': positive,
        'negative_nrc': negative,
        'anger_nrc': anger,
        'anticipation_nrc': anticipation,
        'disgust_nrc': disgust,
        'fear_nrc': fear,
        'joy_nrc': joy,
        'sadness_nrc': sadness,
        'surprise_nrc': surprise,
        'trust_nrc': trust
    }
    
    emo_info = pd.DataFrame([emotion_info])
    emo_nrc_eng = pd.concat([emo_nrc_eng, emo_info], ignore_index=True)
    
emo_nrc_eng.head()

Unnamed: 0,length_nrc,positive_nrc,negative_nrc,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,22,2,0,0,1,0,0,0,0,0,2
1,18,1,0,0,1,0,0,1,0,1,1
2,16,1,0,0,0,0,0,0,0,0,0
3,22,1,1,1,0,0,2,0,0,0,0
4,20,0,1,1,0,1,1,0,1,1,1


In [17]:
df_eng = pd.concat([df_eng, emo_nrc_eng], axis=1)
df_eng.head(2)

Unnamed: 0,index,id,screen_name,time,link,text,source,length_nrc,positive_nrc,negative_nrc,...,positive_nrc.1,negative_nrc.1,anger_nrc,anticipation_nrc,disgust_nrc,fear_nrc,joy_nrc,sadness_nrc,surprise_nrc,trust_nrc
0,49374,890587249372524544,auctnr1,2017-07-27T10:58:41-04:00,https://www.twitter.com/Reuters/statuses/89058...,"RT @Reuters MORE: Top U.S. general says, given...",Twitter for iPhone,22,2,0,...,2,0,0,1,0,0,0,0,0,2
1,83246,899354463055618048,SenatorTester,2017-08-20T15:36:27-04:00,https://www.twitter.com/SenatorTester/statuses...,T-minus 2 days until our first-ever Last Best ...,Twitter Web Client,18,1,0,...,1,0,0,1,0,0,1,0,1,1


### 个人数据—使用NRC情感词典

In [18]:
## 加载数据
df_american = pd.read_csv("American_story_sample_100.csv")
df_american.head(2)

Unnamed: 0,Article_id,Date,Page_number,Newspaper_name,Headline,Author,Article_body
0,4_1920-01-09_p1_sn99062049_00415624992_1920010...,1920-01-09,p1,,Presbyterian ChurchDemobilizes Service Flag,,The services at the PresbyterianChurch Sunday ...
1,23_1920-01-05_p1_sn91066782_00415627932_192001...,1920-01-05,p1,,Lumber Exports VS.. Production.,,"Only Al per, cent of the total lum.ber product..."


In [19]:
# 构建情感词列表
def build_emtion_list(lexion_content):
    Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust = [[] for _ in range(10)]

    # 遍历每一行，将情感值为1的词语添加到相应的情感列表中
    lines = lexion_content.split('\n')
    for line in lines:
        parts = line.split('\t')
        if len(parts) == 3:  # 确保行包含三个部分
            word, emotion, value = parts
            if value == '1':
                if emotion == 'positive':
                    Positive.append(word)
                elif emotion == 'negative':
                    Negative.append(word)
                elif emotion == 'anger':
                    Anger.append(word)
                elif emotion == 'anticipation':
                    Anticipation.append(word)
                elif emotion == 'disgust':
                    Disgust.append(word)
                elif emotion == 'fear':
                    Fear.append(word)
                elif emotion == 'joy':
                    Joy.append(word)
                elif emotion == 'sadness':
                    Sadness.append(word)
                elif emotion == 'surprise':
                    Surprise.append(word)
                elif emotion == 'trust':
                    Trust.append(word)
        else:
            print(line)
            
    return Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust

In [20]:
# 计算一条文本的情感词频，返回series
def emotion_caculate(text,Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust):
    positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust = [0 for i in range(10)]
    
    text = text.lower()
    wordlist = word_tokenize(text)
    wordset = set(wordlist)
    wordfreq = []
    for word in wordset:
        freq = wordlist.count(word)
        if word in Positive:
            positive+=freq
        if word in Negative:
            negative+=freq
        if word in Anger:
            anger+=freq  
        if word in Anticipation:
            anticipation+=freq
        if word in Disgust:
            disgust+=freq
        if word in Fear:
            fear+=freq
        if word in Joy:
            joy+=freq
        if word in Sadness:
            sadness+=freq
        if word in Surprise:
            surprise+=freq
        if word in Trust:
            trust+=freq
            
    emotion_info = {
        'positive': positive,
        'negative': negative,
        'anger': anger,
        'anticipation': anticipation,
        'disgust': disgust,
        'fear':fear,
        'joy':joy,
        'sadness':sadness,
        'surprise':surprise,
        'trust':trust,
        'length':len(wordlist)
    }
    indexs = ['length', 'positive', 'negative', 'anger', 'anticipation','disgust','fear','joy','sadness','surprise','trust']
    return pd.Series(emotion_info, index=indexs)

In [21]:
# 大规模计算情感值，返回数据中全部月中每一条新闻的情感值
def main():
    with open(r'D:\研一下 课堂\文本分析\词典\emtion_analysis\NRC-Emotion-Lexicon\NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', 'r',encoding = 'utf-8') as file:
        lexion_content = file.read()
        Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust = build_emtion_list(lexion_content)
    
    df = pd.read_csv("American_story_sample_100.csv")
    df['Article_body'] = df['Article_body'].fillna('')
    #year_month = os.path.splitext(os.path.basename(file_path))[0]
    emotion_series_list = df['Article_body'].apply(lambda text: emotion_caculate(text, Positive, Negative, Anger, Anticipation, Disgust, Fear, Joy, Sadness, Surprise, Trust))
    
    df_merged = pd.concat([df,emotion_series_list],axis = 1)
    #emotion_series_list.to_csv(f"D:\\American Stories\\the civil war\\emotion_score\\{year_month}.csv", index=False)
    
    return df_merged 

In [22]:
# 运行
if __name__ == "__main__":
    df_merged = main()




In [24]:
df_merged.head(2)

Unnamed: 0,Article_id,Date,Page_number,Newspaper_name,Headline,Author,Article_body,length,positive,negative,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,4_1920-01-09_p1_sn99062049_00415624992_1920010...,1920-01-09,p1,,Presbyterian ChurchDemobilizes Service Flag,,The services at the PresbyterianChurch Sunday ...,273,7,11,5,2,3,5,1,4,1,3
1,23_1920-01-05_p1_sn91066782_00415627932_192001...,1920-01-05,p1,,Lumber Exports VS.. Production.,,"Only Al per, cent of the total lum.ber product...",272,9,6,1,3,1,3,0,1,0,4


## 道德词典

### 样例数据

#### 中文道德词典（cmfd)

In [6]:
# 加载数据
df_chn = pd.read_excel('text_analysis_weibo_sample.xlsx', index_col = 0)
df_chn.head(2)

Unnamed: 0,index,标题/微博内容,点赞,转发,评论,账号昵称UID加密,粉丝数,关注数,地域
0,34121,国债：地产行业重磅利好提振风险偏好，期债低开低走 国债期货全线收跌，10年期主力...,0,0,0,e5df796860e68f403bcf9651bab4d42e,0,0,其他
1,40230,#喜迎二十大 忠诚保平安#,0,0,0,6e35cb69ad52f20de5e28197b2e85306,405444,252,广西


In [8]:
chn_moral = pd.read_csv(r'https://raw.githubusercontent.com/CivicTechLab/CMFD/main/cmfd_civictech.csv')
chn_moral.head()

Unnamed: 0,chinese,foundation
0,同情,care
1,一臂之力,care
2,一见倾心,care
3,三个代表,care
4,上阵杀敌,care


In [11]:
moral_dict = chn_moral.groupby('foundation')['chinese'].apply(list).to_dict()
# moral_dict

In [12]:
def moral_quantity(text):
    
    if isinstance(text, str):
        moral_word_total = 0
        moral_word = {}
        moral_num = {}

        for key in moral_dict.keys():
            moral_word[key] = []
        for word in jieba.cut(text):
            for key in moral_dict.keys():
                if word in moral_dict[key]:
                    moral_word[key].append(word)

        for key in moral_word.keys():
            moral_word_total += len(moral_word[key])
        if moral_word_total == 0:
            return None

        for key in moral_word.keys():
            moral_num[key] = len(moral_word[key]) / moral_word_total

    return moral_num

In [13]:
df_chn['chn_moral'] = df_chn['标题/微博内容'].apply(moral_quantity)
df_chn.head()

Unnamed: 0,index,标题/微博内容,点赞,转发,评论,账号昵称UID加密,粉丝数,关注数,地域,chn_moral
0,34121,国债：地产行业重磅利好提振风险偏好，期债低开低走 国债期货全线收跌，10年期主力...,0,0,0,e5df796860e68f403bcf9651bab4d42e,0,0,其他,"{'altr': 0.0, 'auth': 0.35, 'care': 0.13333333..."
1,40230,#喜迎二十大 忠诚保平安#,0,0,0,6e35cb69ad52f20de5e28197b2e85306,405444,252,广西,"{'altr': 0.0, 'auth': 0.0, 'care': 0.0, 'dili'..."
2,7714,注意！事关明日教资考试！福建省教育考试院发布补充公告 福建省2022年下半年全国中小学教师...,0,0,0,e6953217442e6c06a7af23eee5e185f2,53264,2177,福建,"{'altr': 0.0, 'auth': 0.5833333333333334, 'car..."
3,27378,近日，“千年大计”雄安新区迎来五周岁生日。从“一张白纸...,0,0,0,,0,0,北京,"{'altr': 0.023809523809523808, 'auth': 0.45238..."
4,15435,樊振东牛逼！,0,0,0,344af41eac516375c04dee6325e763cc,8,51,山东,


In [16]:
chn_moral_df = pd.DataFrame(columns=['altr', 'auth', 'care', 'dili', 'fair', 'general', 'libe', 'loya', 'mode', 'resi', 'sanc', 'wast'])

for dc in df_chn.index:
    if df_chn['chn_moral'][dc] == None:
        chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
    else:
        chn_moral_df.loc[len(chn_moral_df.index)] = list(df_chn['chn_moral'][dc].values())
        
df_chn = pd.concat([df_chn, chn_moral_df], axis=1)
        
df_chn.head(2)

  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df.loc[len(chn_moral_df.index)] = [None] * 12
  chn_moral_df

Unnamed: 0,index,标题/微博内容,点赞,转发,评论,账号昵称UID加密,粉丝数,关注数,地域,chn_moral,...,care,dili,fair,general,libe,loya,mode,resi,sanc,wast
0,34121,国债：地产行业重磅利好提振风险偏好，期债低开低走 国债期货全线收跌，10年期主力...,0,0,0,e5df796860e68f403bcf9651bab4d42e,0,0,其他,"{'altr': 0.0, 'auth': 0.35, 'care': 0.13333333...",...,0.133333,0.0,0.133333,0.016667,0.0,0.25,0.0,0.066667,0.05,0.0
1,40230,#喜迎二十大 忠诚保平安#,0,0,0,6e35cb69ad52f20de5e28197b2e85306,405444,252,广西,"{'altr': 0.0, 'auth': 0.0, 'care': 0.0, 'dili'...",...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
df_chn.columns

Index(['index', '标题/微博内容', '点赞', '转发', '评论', '账号昵称UID加密', '粉丝数', '关注数', '地域',
       'chn_moral', 'altr', 'auth', 'care', 'dili', 'fair', 'general', 'libe',
       'loya', 'mode', 'resi', 'sanc', 'wast', 'altr', 'auth', 'care', 'dili',
       'fair', 'general', 'libe', 'loya', 'mode', 'resi', 'sanc', 'wast',
       'altr', 'auth', 'care', 'dili', 'fair', 'general', 'libe', 'loya',
       'mode', 'resi', 'sanc', 'wast'],
      dtype='object')

#### 英文道德词典（moralstrength）

In [18]:
# 加载数据
df_eng = pd.read_excel('text_analysis_twitter_sample.xlsx', index_col = 0)
df_eng.head()

Unnamed: 0,index,id,screen_name,time,link,text,source
0,49374,890587249372524544,auctnr1,2017-07-27T10:58:41-04:00,https://www.twitter.com/Reuters/statuses/89058...,"RT @Reuters MORE: Top U.S. general says, given...",Twitter for iPhone
1,83246,899354463055618048,SenatorTester,2017-08-20T15:36:27-04:00,https://www.twitter.com/SenatorTester/statuses...,T-minus 2 days until our first-ever Last Best ...,Twitter Web Client
2,100988,903272105738985472,KeithRothfus,2017-08-31T11:03:46-04:00,https://www.twitter.com/KeithRothfus/statuses/...,Please know that help is available. Visit http...,Twitter Web Client
3,193395,921001114409021440,HASCRepublicans,2017-10-19T09:12:31-04:00,https://www.twitter.com/HASCRepublicans/status...,Literally flying the wings off the A-10 in fig...,Twitter Web Client
4,12662,884911451449774080,SteveKnight25,2017-07-11T19:05:05-04:00,https://www.twitter.com/SteveKnight25/statuses...,Today the House unanimously passed my bill #HR...,Twitter Web Client


In [20]:
# 计算道德分数
lexicon_use.select_version('latest')

df_eng_morals = estimate_morals(df_eng['text'].tolist(), process=True)
df_eng = pd.concat([df_eng, df_eng_morals], axis=1)

df_eng.head(3)



Unnamed: 0,index,id,screen_name,time,link,text,source,care,fairness,loyalty,authority,purity,care.1,fairness.1,loyalty.1,authority.1,purity.1
0,49374,890587249372524544,auctnr1,2017-07-27T10:58:41-04:00,https://www.twitter.com/Reuters/statuses/89058...,"RT @Reuters MORE: Top U.S. general says, given...",Twitter for iPhone,,,,,,,,,,
1,83246,899354463055618048,SenatorTester,2017-08-20T15:36:27-04:00,https://www.twitter.com/SenatorTester/statuses...,T-minus 2 days until our first-ever Last Best ...,Twitter Web Client,,,,,,,,,,
2,100988,903272105738985472,KeithRothfus,2017-08-31T11:03:46-04:00,https://www.twitter.com/KeithRothfus/statuses/...,Please know that help is available. Visit http...,Twitter Web Client,,,,,,,,,,


### 个人数据

In [22]:
# 加载数据
df_american = pd.read_csv(r'C:\Users\Administrator\Desktop\词典\American_story_sample_100.csv')
df_american.head(3)

Unnamed: 0,Article_id,Date,Page_number,Newspaper_name,Headline,Author,Article_body
0,4_1920-01-09_p1_sn99062049_00415624992_1920010...,1920-01-09,p1,,Presbyterian ChurchDemobilizes Service Flag,,The services at the PresbyterianChurch Sunday ...
1,23_1920-01-05_p1_sn91066782_00415627932_192001...,1920-01-05,p1,,Lumber Exports VS.. Production.,,"Only Al per, cent of the total lum.ber product..."
2,16_1920-01-09_p1_sn86063730_00332894857_192001...,1920-01-09,p1,,WILSON AND BRYANfUIESSAGESSURPRISES OF JACKSON...,,"with Germany. was thus disclosed asa fact, alt..."


In [23]:
# 计算道德分数
lexicon_use.select_version('latest')

df_american_morals = estimate_morals(df_american['Article_body'].tolist(), process=True)
df_merged = pd.concat([df_american,df_american_morals], axis=1)

df_merged.head(3)



Unnamed: 0,Article_id,Date,Page_number,Newspaper_name,Headline,Author,Article_body,care,fairness,loyalty,authority,purity
0,4_1920-01-09_p1_sn99062049_00415624992_1920010...,1920-01-09,p1,,Presbyterian ChurchDemobilizes Service Flag,,The services at the PresbyterianChurch Sunday ...,1.666667,,7.257143,7.525,5.166667
1,23_1920-01-05_p1_sn91066782_00415627932_192001...,1920-01-05,p1,,Lumber Exports VS.. Production.,,"Only Al per, cent of the total lum.ber product...",,4.833333,6.5,4.8,
2,16_1920-01-09_p1_sn86063730_00332894857_192001...,1920-01-09,p1,,WILSON AND BRYANfUIESSAGESSURPRISES OF JACKSON...,,"with Germany. was thus disclosed asa fact, alt...",,,,8.8,
