### 预置环境并导入模型

In [9]:
import pickle
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

# Load the pre-trained trait classifiers and their text vectorizers.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file, so these paths must only ever point at trusted model files.
def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes, instead of being left for the garbage collector.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# One classifier per trait (names follow the EXT/NEU/AGR/CON/OPN labels
# used in the result columns further down).
cEXT = _load_pickle("data/models/cEXT.p")
cNEU = _load_pickle("data/models/cNEU.p")
cAGR = _load_pickle("data/models/cAGR.p")
cCON = _load_pickle("data/models/cCON.p")
cOPN = _load_pickle("data/models/cOPN.p")
# predict_personality feeds vectorizer_30 features to cNEU and
# vectorizer_31 features to the other four classifiers.
vectorizer_31 = _load_pickle("data/models/vectorizer_31.p")
vectorizer_30 = _load_pickle("data/models/vectorizer_30.p")

# Sentence boundary: one or more spaces that follow '.', '!' or '?'.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?]) +")


def predict_personality(text):
    """Score *text* on five personality traits.

    The text is split into sentences, each sentence is vectorized, and every
    trait classifier's ``predict_proba`` output is reduced with ``.max()``.

    Args:
        text: The text to score. ``None`` or a float NaN (what pandas yields
            for a blank spreadsheet cell) is treated as missing.

    Returns:
        A list of five values in the order EXT, NEU, AGR, CON, OPN. Each is
        the largest entry of that classifier's ``predict_proba`` output, or
        NaN for all five when *text* is missing.

    Raises:
        TypeError: If *text* is some other non-string value (raised by the
            regex split, as before).
    """
    # Blank cells would otherwise make the regex split raise TypeError and
    # abort a whole batch run; report them as "no score" instead.
    if text is None or (isinstance(text, float) and text != text):
        return [float("nan")] * 5

    sentences = _SENTENCE_BOUNDARY.split(text)

    # Two feature spaces: cNEU uses vectorizer_30, the others vectorizer_31.
    features_31 = vectorizer_31.transform(sentences)
    features_30 = vectorizer_30.transform(sentences)

    trait_inputs = (
        (cEXT, features_31),
        (cNEU, features_30),
        (cAGR, features_31),
        (cCON, features_31),
        (cOPN, features_31),
    )

    # NOTE(review): .max() has no axis, so it takes the single largest
    # probability across every sentence and every class. It does not say
    # which class won, and one confident sentence dominates a long text.
    # Confirm this is the intended aggregation.
    return [model.predict_proba(features).max() for model, features in trait_inputs]

#### 根据gvkey和year对content进行predict

In [12]:
# Score every row of merged_gvkeyandyear.xls; the workbook must provide
# 'content', 'gvkey' and 'year' columns.
file_path = 'merged_gvkeyandyear.xls'
df = pd.read_excel(file_path)

# Column labels for the five scores, in the order predict_personality returns them.
trait_labels = ('EXT', 'NEU', 'AGR', 'CON', 'OPN')

# One output record per input row: identifying keys first, then the trait scores.
results = []
for row_id, record in df.iterrows():
    scores = predict_personality(record['content'])
    entry = {'gvkey': record['gvkey'], 'year': record['year']}
    entry.update(zip(trait_labels, scores))
    results.append(entry)

# Collect the records into a table and write it out without the index column.
results_df = pd.DataFrame(results)

output_file = 'personality_gvkeyandyear.xls'
results_df.to_excel(output_file, index=False)

print(f"预测结果已保存到 {output_file}")

预测结果已保存到 personality_gvkeyandyear.xls


#### 根据datacqtr和gvkey对content进行predict

In [14]:
# Score every row of merged_datacqtrandgvkey.xls; the workbook must provide
# 'content', 'datacqtr' and 'gvkey' columns.
file_path = 'merged_datacqtrandgvkey.xls'
df = pd.read_excel(file_path)

# Column labels for the five scores, in the order predict_personality returns them.
trait_labels = ('EXT', 'NEU', 'AGR', 'CON', 'OPN')

# One output record per input row: identifying keys first, then the trait scores.
results = []
for row_id, record in df.iterrows():
    scores = predict_personality(record['content'])
    entry = {'datacqtr': record['datacqtr'], 'gvkey': record['gvkey']}
    entry.update(zip(trait_labels, scores))
    results.append(entry)

# Collect the records into a table and write it out without the index column.
results_df = pd.DataFrame(results)

output_file = 'personality_datacqtrandgvkey.xls'
results_df.to_excel(output_file, index=False)

print(f"预测结果已保存到 {output_file}")

预测结果已保存到 personality_datacqtrandgvkey.xls


In [5]:
# text = 'It is important to note that each of the five personality factors represents a range between two extremes. For example, extraversion represents a continuum between extreme extraversion and extreme introversion. In the real world, most people lie somewhere in between the two polar ends of each dimension. These five categories are usually described as follows.'

In [6]:
# Extraversion='I feel an overwhelming surge of excitement when surrounded by a crowd; my energy is contagious, and I thrive on the vibrant atmosphere of social gatherings.'

In [7]:
# predictions = predict_personality(text)
# print("predicted personality:", predictions)
# df = pd.DataFrame(dict(r=predictions, theta=['EXT','NEU','AGR', 'CON', 'OPN']))
# fig = px.line_polar(df, r='r', theta='theta', line_close=True)
# # # fig.show()
# # fig.write_image("personality_plot.png", format="png")

predicted personality: [0.5636255180008869, 0.58, 0.6629771396018606, 0.554767820751249, 0.7000641385621587]
