### 预制环境和导入模型

In [18]:
import pickle
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

# Load pre-trained models and vectorizers
cEXT = pickle.load(open("data/models/cEXT.p", "rb"))
cNEU = pickle.load(open("data/models/cNEU.p", "rb"))
cAGR = pickle.load(open("data/models/cAGR.p", "rb"))
cCON = pickle.load(open("data/models/cCON.p", "rb"))
cOPN = pickle.load(open("data/models/cOPN.p", "rb"))
vectorizer_31 = pickle.load(open("data/models/vectorizer_31.p", "rb"))
vectorizer_30 = pickle.load(open("data/models/vectorizer_30.p", "rb"))

def predict_personality(text):
    # Split text into sentences
    sentences = re.split("(?<=[.!?]) +", text)  # 更正变量名，文本内容通过参数传递
    
    # Transform sentences into feature vectors
    text_vector_31 = vectorizer_31.transform(sentences)
    text_vector_30 = vectorizer_30.transform(sentences)
    
    # Predict probabilities for each personality dimension
    prob_EXT = cEXT.predict_proba(text_vector_31)
    prob_NEU = cNEU.predict_proba(text_vector_30)
    prob_AGR = cAGR.predict_proba(text_vector_31)
    prob_CON = cCON.predict_proba(text_vector_31)
    prob_OPN = cOPN.predict_proba(text_vector_31)
    
    # Extract the confidence scores (probability of the predicted class)
    confidence_EXT = prob_EXT.max()
    confidence_NEU = prob_NEU.max()
    confidence_AGR = prob_AGR.max()
    confidence_CON = prob_CON.max()
    confidence_OPN = prob_OPN.max()
    
    return [confidence_EXT, confidence_NEU, confidence_AGR, confidence_CON, confidence_OPN]

#### 根据gvkey和year对content进行predict

In [19]:
# 读取 merged_content.xls 文件
file_path = 'merged_big_gvkeyandyear.csv'
df = pd.read_csv(file_path)
total_rows = len(df)  # 获取总行数
# 打印前 5行数据
print("\n前 10 行数据：")
print(df.head(5))


前 10 行数据：
   gvkey  year                                            content
0   1004  2010  Well, thank you very much for participating to...
1   1004  2011  Probably '07, possibly '06. Tyler, we have rec...
2   1004  2012  Well, do I see it in possibly the future, does...
3   1004  2013  No, actually commercial sales, we just checked...
4   1004  2014  So we're thinking that the low point is, this ...


In [20]:


# 创建一个列表来存储预测结果
results = []
print("开始预测，共", total_rows, "行数据...")

# 遍历每一行并进行个性预测
for index, row in df.iterrows():
     # 打印进度
    if (index + 1) % 100 == 0:
        print(f"已完成: {index + 1}/{total_rows} 行，当前进度: {((index + 1) / total_rows) * 100:.2f}%")
    
    
    text = row['content']  # 假设文本内容存储在 'content' 列
    predictions = predict_personality(text)
    
    # 打印预测结果
    if (index + 1) % 100 == 0:
        print(f"第 {index + 1} 行预测结果:", predictions)
    
    # 将预测结果和其他信息保存到结果列表
    results.append({
        'gvkey': row['gvkey'],  # 假设存在 'gvkey' 列
        'year': row['year'],    # 假设存在 'year' 列
        'EXT': predictions[0],
        'NEU': predictions[1],
        'AGR': predictions[2],
        'CON': predictions[3],
        'OPN': predictions[4]
    })

print("预测完成！")

# 将结果转换为 DataFrame
results_df = pd.DataFrame(results)

# 保存结果到新的 Excel 文件
output_file = 'personality_gvkeyandyear.xls'
results_df.to_excel(output_file, index=False)

print(f"预测结果已保存到 {output_file}")

开始预测，共 29627 行数据...
已完成: 100/29627 行，当前进度: 0.34%
第 100 行预测结果: [0.9009009265016866, 0.61, 0.8091341589545745, 0.8288344685607988, 0.9084824918471001]
已完成: 200/29627 行，当前进度: 0.68%
第 200 行预测结果: [0.9573696648215348, 0.64, 0.909744968694741, 0.8533844953070694, 0.9616953175213956]
已完成: 300/29627 行，当前进度: 1.01%
第 300 行预测结果: [0.8993664537166223, 0.6, 0.8874757399825542, 0.8570539642461943, 0.8935001618054078]
已完成: 400/29627 行，当前进度: 1.35%
第 400 行预测结果: [0.9200630317193603, 0.61, 0.9039210953698531, 0.9479091679991624, 0.9469731333508107]
已完成: 500/29627 行，当前进度: 1.69%
第 500 行预测结果: [0.9651511722921946, 0.63, 0.9484976033424057, 0.9185623016977692, 0.9511065483806512]
已完成: 600/29627 行，当前进度: 2.03%
第 600 行预测结果: [0.9355355754031426, 0.62, 0.9149058604118447, 0.8758430999530087, 0.9589917660400848]
已完成: 700/29627 行，当前进度: 2.36%
第 700 行预测结果: [0.9249940252716001, 0.61, 0.9359501255089919, 0.9380647916113939, 0.970280012398194]
已完成: 800/29627 行，当前进度: 2.70%
第 800 行预测结果: [0.8403695908073561, 0.6, 0.9053703829

#### 根据datacqtr和gvkey对content进行predict

In [2]:
import pickle
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

# Load pre-trained models and vectorizers
cEXT = pickle.load(open("data/models/cEXT.p", "rb"))
cNEU = pickle.load(open("data/models/cNEU.p", "rb"))
cAGR = pickle.load(open("data/models/cAGR.p", "rb"))
cCON = pickle.load(open("data/models/cCON.p", "rb"))
cOPN = pickle.load(open("data/models/cOPN.p", "rb"))
vectorizer_31 = pickle.load(open("data/models/vectorizer_31.p", "rb"))
vectorizer_30 = pickle.load(open("data/models/vectorizer_30.p", "rb"))

def predict_personality(text):
    # Split text into sentences
    sentences = re.split("(?<=[.!?]) +", text)  # 更正变量名，文本内容通过参数传递
    
    # Transform sentences into feature vectors
    text_vector_31 = vectorizer_31.transform(sentences)
    text_vector_30 = vectorizer_30.transform(sentences)
    
    # Predict probabilities for each personality dimension
    prob_EXT = cEXT.predict_proba(text_vector_31)
    prob_NEU = cNEU.predict_proba(text_vector_30)
    prob_AGR = cAGR.predict_proba(text_vector_31)
    prob_CON = cCON.predict_proba(text_vector_31)
    prob_OPN = cOPN.predict_proba(text_vector_31)
    
    # Extract the confidence scores (probability of the predicted class)
    confidence_EXT = prob_EXT.max()
    confidence_NEU = prob_NEU.max()
    confidence_AGR = prob_AGR.max()
    confidence_CON = prob_CON.max()
    confidence_OPN = prob_OPN.max()
    
    return [confidence_EXT, confidence_NEU, confidence_AGR, confidence_CON, confidence_OPN]

# 读取 merged_content.xls 文件
file_path = 'merged_big_dataacqtrandgvkey.csv'
df1 = pd.read_csv(file_path)
total_rows = len(df1)  # 获取总行数
# 打印前 10 行数据
print("\n前 10 行数据：")
print(df1.head(10))


前 10 行数据：
  datacqtr  gvkey                                            content
0   2010Q1   1004  Well, thank you very much for participating to...
1   2010Q1   1559  Lenny, this is Dr. Bates. My interpretation, a...
2   2010Q1   1585  Well, 110 would be -- start occurring each mon...
3   2010Q1   1633  Thank you, Dalton. You're talking about the --...
4   2010Q1   1706  Yes, we've been hiring in Mexico where we move...
5   2010Q1   2184  And I'd also just call out that we are not sor...
6   2010Q1   2269  Yes, probably it's in the broader category of ...
7   2010Q1   2316  Yes, we definitely see a more positive trend a...
8   2010Q1   2435  No, they were up about 1%. Maybe Don and I wil...
9   2010Q1   2802  That's still very much part of the plan. I'm p...


In [3]:


# 创建一个列表来存储预测结果
results = []
print("开始预测，共", total_rows, "行数据...")

# 遍历每一行并进行个性预测
for index, row in df1.iterrows():
     # 打印进度
    if (index + 1) % 1000 == 0:
        print(f"已完成: {index + 1}/{total_rows} 行，当前进度: {((index + 1) / total_rows) * 100:.2f}%")
    
    text = row['content']  # 假设文本内容存储在 'content' 列
    predictions = predict_personality(text)
    
    # 打印预测结果
    if (index + 1) % 1000 == 0:
        print(f"第 {index + 1} 行预测结果:", predictions)
    
    # 将预测结果和其他信息保存到结果列表
    results.append({
        'datacqtr': row['datacqtr'],  # 假设存在 'datacqtr' 列
        'gvkey': row['gvkey'],    # 假设存在 'gvkey' 列
        'EXT': predictions[0],
        'NEU': predictions[1],
        'AGR': predictions[2],
        'CON': predictions[3],
        'OPN': predictions[4]
    })

print("预测完成！")

# 将结果转换为 DataFrame
results_df = pd.DataFrame(results)

# 保存结果到新的 Excel 文件
output_file = 'personality_predictions.xls'
results_df.to_excel(output_file, index=False)

print(f"预测结果已保存到 {output_file}")

开始预测，共 96390 行数据...
已完成: 1000/96390 行，当前进度: 10.37%
第 1000 行预测结果: [0.7265893892991122, 0.57, 0.7851471003834809, 0.7172840460148477, 0.8505939430362677]
已完成: 2000/96390 行，当前进度: 20.75%
第 2000 行预测结果: [0.8809165165345221, 0.64, 0.8977125739518264, 0.8941849357713617, 0.9278056078314424]
已完成: 3000/96390 行，当前进度: 31.12%
第 3000 行预测结果: [0.9454343842965985, 0.6, 0.8426315539657571, 0.8934765052910029, 0.9237062772943455]
已完成: 4000/96390 行，当前进度: 41.50%
第 4000 行预测结果: [0.9345657793101493, 0.64, 0.879452393402092, 0.8245586872214659, 0.91635571532277]
已完成: 5000/96390 行，当前进度: 51.87%
第 5000 行预测结果: [0.8566897513616483, 0.56, 0.7938349138598756, 0.6883474317139149, 0.9010119911121205]
已完成: 6000/96390 行，当前进度: 62.25%
第 6000 行预测结果: [0.7583484377920748, 0.6, 0.7281740436153002, 0.6584559996748549, 0.8581527305402024]
已完成: 7000/96390 行，当前进度: 72.62%
第 7000 行预测结果: [0.9270665626812369, 0.61, 0.898609385432595, 0.8602060668079526, 0.9649914756277632]
已完成: 8000/96390 行，当前进度: 83.00%
第 8000 行预测结果: [0.92209907302469

ValueError: row index was 65536, not allowed by .xls format

In [5]:
# 将结果转换为 DataFrame
results_df = pd.DataFrame(results)

# 保存结果到新的 Excel 文件
output_file = 'personality_datacqtrandgvkey.csv'
results_df.to_csv(output_file, index=False)

print(f"预测结果已保存到 {output_file}")

预测结果已保存到 personality_datacqtrandgvkey.csv
