In [2]:
# 1. Google Driveのマウント
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# 2. 必要なライブラリのインストールとインポート
!pip install nltk rouge sacrebleu janome matplotlib
!pip install bert-score
!pip install sacrebleu[ja]
!pip install mecab-python3
!pip install unidic-lite

Collecting rouge
  Using cached rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting janome
  Using cached Janome-0.5.0-py2.py3-none-any.whl (19.7 MB)
Installing collected packages: janome, rouge
Successfully installed janome-0.5.0 rouge-1.0.1
Collecting unidic-lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic-lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658817 sha256=4b699bb0dce1b77d1d3505de85905a1e71bd732f27bc159e45df202553534857
  Stored in directory: /root/.cache/pip/wheels/89/e8/68/f9ac36b8cc6c8b3c96888cd57434abed96595d444f42243853
Successfully built unidic-lite
Installing collected packages: unidic-lite
Successfully installed unidic-lite-1.0.8


In [4]:
from janome.tokenizer import Tokenizer
from rouge import Rouge
from sacrebleu.metrics import BLEU, CHRF, TER
import matplotlib.pyplot as plt
import csv
from bert_score import score as bert_score

In [5]:
# 3. テキストファイルを読み込むための関数
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# 日本語のテキストのトークン化
def tokenize_japanese(text):
    tokenizer = Tokenizer()
    return " ".join(token.surface for token in tokenizer.tokenize(text))

# 4. スコア計算関数
def calculate_scores(original_text, generated_text):
    original_text_tokenized = tokenize_japanese(original_text)
    generated_text_tokenized = tokenize_japanese(generated_text)

    # BERTScoreの計算
    P, R, F1 = bert_score([generated_text_tokenized], [original_text_tokenized], lang="ja", verbose=True)
    bertscore = F1.mean().item()

    # 他のスコアの計算
    chrf = CHRF()
    bleu = BLEU(tokenize='none')
    # bleu = BLEU(tokenize='ja-mecab')  # ここを変更
    rouge = Rouge()
    chrf_plus = CHRF(word_order=1)
    chrf_plusplus = CHRF(word_order=2)

    chrf_score = chrf.corpus_score([generated_text], [original_text]).score
    # bleu_score = bleu.corpus_score([generated_text], [[original_text]]).score  # トークン化されていないテキストを使用
    # rouge_scores = rouge.get_scores(generated_text, original_text)[0]  # トークン化されていないテキストを使用
    bleu_score = bleu.corpus_score([generated_text_tokenized], [[original_text_tokenized]]).score
    rouge_scores = rouge.get_scores(original_text_tokenized, generated_text_tokenized)[0]
    chrf_plus_score = chrf_plus.corpus_score([generated_text], [original_text]).score
    chrf_plusplus_score = chrf_plusplus.corpus_score([generated_text], [original_text]).score

    return chrf_score, bleu_score, rouge_scores, bertscore, chrf_plus_score, chrf_plusplus_score




In [6]:
dir_path = "/content/drive/MyDrive/TMLlab/Users/2023/髙須賀/2023_12_調査ノート/"

In [10]:
dates = ['2022_01_01', '2022_02_01', '2022_03_01', '2022_04_01', '2022_05_01', '2022_06_01', '2022_07_01', '2022_08_01', '2022_09_01', '2022_10_01', '2022_11_01', '2022_12_01', '2022_09_17', '2022_06_21', '2022_01_06']

gpt_versions = ['v1', 'v2', 'v3', 'v4']
gemini_versions = ['v1', 'v2']
generate_ai = ['gpt', 'Gemini']

In [13]:
ai = generate_ai[0]

version = gpt_versions[0]

date = dates[3]

In [17]:
original_text = read_text_file(f'{dir_path}コメント/元のコメント/{date}_original.txt')
generated_text_path = f'{dir_path}コメント/生成したコメント/{version}_{ai}_生成されたコメント/{date}_{ai}_generate_{version}.txt'
generated_text = read_text_file(generated_text_path)

print(original_text)
print(generated_text)

中国大陸からの高気圧に覆われ西～東日本は西から次第に晴れ。北日本は気圧の谷や上空寒気の影響で雪や雨。鹿児島市、大分市、下関市、鳥取市、神戸市、前橋市でサクラ満開。
高気圧支配の晴天域、日本海側の気圧の谷による雲多い空、南岸を進む低気圧接近に伴う降水帯の発達。北部の冬型の気圧配置、各地で天気差の顕著な状況。


In [49]:
import sacrebleu
ref = "これは参照文です。"
hyp = "これは評価対象の文です。"
chrf_score = sacrebleu.sentence_chrf(hyp, [ref]).score
print(f"CHRFスコア: {chrf_score}")


CHRFスコア: 30.95451554799898


In [50]:
sacrebleu.sentence_chrf(original_text, [generated_text]).score

7.813835166295141

In [18]:
original_text_tokenized = tokenize_japanese(original_text)
generated_text_tokenized = tokenize_japanese(generated_text)

print(original_text_tokenized)

print(generated_text_tokenized)

中国 大陸 から の 高気圧 に 覆わ れ 西 ～ 東日本 は 西 から 次第に 晴れ 。 北 日本 は 気圧 の 谷 や 上空 寒気 の 影響 で 雪 や 雨 。 鹿児島 市 、 大分 市 、 下関 市 、 鳥取 市 、 神戸 市 、 前橋 市 で サクラ 満開 。
高気圧 支配 の 晴天 域 、 日本海 側 の 気圧 の 谷 による 雲 多い 空 、 南岸 を 進む 低 気圧 接近 に 伴う 降水 帯 の 発達 。 北部 の 冬型 の 気圧 配置 、 各地 で 天気 差 の 顕著 な 状況 。


In [34]:
preds = ['the cat is on the mat']
target = ['there is a cat on the mat']

In [38]:
chrf_plusplus.corpus_score(preds,target)

chrF2++ = 12.50

In [36]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/806.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/806.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m460.8/806.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/806.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.1


In [47]:
from torchmetrics.text import CHRFScore

chrf_torch = CHRFScore()
chrf_torch(preds, target)

  total_n_grams[n] = tensor(sum(n_grams_counts[n].values()))
  matching_n_grams[n] = tensor(


tensor(0.4942)

In [48]:
chrf_torch([generated_text_tokenized], [original_text_tokenized])

tensor(0.0900)

In [43]:
chrf = CHRF(char_order=1)
chrf_plus = CHRF(word_order=1)
chrf_plusplus = CHRF(word_order=2)

In [46]:
chrf_score = chrf.corpus_score([generated_text_tokenized], [original_text_tokenized])
chrf_score

chrF2 = 0.00

In [26]:
chrf_plus.corpus_score([generated_text_tokenized], [original_text_tokenized]).score

0.0

In [27]:
chrf_plusplus.corpus_score([generated_text_tokenized], [original_text_tokenized]).score

0.0

In [19]:
calculate_scores(original_text, generated_text)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.33 seconds, 0.75 sentences/sec


(0.0,
 3.587017484362102,
 {'rouge-1': {'r': 0.22857142857142856,
   'p': 0.2222222222222222,
   'f': 0.2253521076770483},
  'rouge-2': {'r': 0.045454545454545456,
   'p': 0.04081632653061224,
   'f': 0.043010747702625154},
  'rouge-l': {'r': 0.17142857142857143,
   'p': 0.16666666666666666,
   'f': 0.16901407950803427}},
 0.7021981477737427,
 0.0,
 0.0)

In [None]:
# 5. CSVファイルに結果を保存

dates = ['2022_01_01', '2022_02_01', '2022_03_01', '2022_04_01', '2022_05_01', '2022_06_01', '2022_07_01', '2022_08_01', '2022_09_01', '2022_10_01', '2022_11_01', '2022_12_01', '2022_09_17', '2022_06_21', '2022_01_06']

gpt_versions = ['v1', 'v2', 'v3', 'v4']
gemini_versions = ['v1', 'v2']
generate_ai = ['gpt', 'Gemini']
averages = {ai: {version: {'CHRF': [], 'BLEU': [], 'ROUGE-1': [], 'ROUGE-2': [], 'ROUGE-L': [], 'BERTScore': [], 'CHRFP+': [], 'CHRFP++': []} for version in (gpt_versions if ai == 'gpt' else gemini_versions)} for ai in generate_ai}

with open('/content/drive/My Drive/研究用/2023_12_調査ノート/コメント/コメント自動評価結果.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['AI', 'Date', 'Version', 'CHRF', 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore', 'CHRFP+', 'CHRFP++'])

    for ai in generate_ai:
        versions = gpt_versions if ai == 'gpt' else gemini_versions
        for version in versions:
            for date in dates:
                original_text = read_text_file(f'/content/drive/My Drive/研究用/2023_12_調査ノート/コメント/元のコメント/{date}_original.txt')
                generated_text_path = f'/content/drive/My Drive/研究用/2023_12_調査ノート/コメント/生成したコメント/{version}_{ai}_生成されたコメント/{date}_{ai}_generate_{version}.txt'
                generated_text = read_text_file(generated_text_path)

                chrf_score, bleu_score, rouge_scores, bertscore, chrf_plus_score, chrf_plusplus_score = calculate_scores(original_text, generated_text)

                # CSVファイルとColabに結果を出力
                csvwriter.writerow([ai, date, version, chrf_score, bleu_score, rouge_scores['rouge-1']['f'], rouge_scores['rouge-2']['f'], rouge_scores['rouge-l']['f'], bertscore, chrf_plus_score, chrf_plusplus_score])

                # スコアの集計
                averages[ai][version]['CHRF'].append(chrf_score)
                averages[ai][version]['BLEU'].append(bleu_score)
                averages[ai][version]['ROUGE-1'].append(rouge_scores['rouge-1']['f'])
                averages[ai][version]['ROUGE-2'].append(rouge_scores['rouge-2']['f'])
                averages[ai][version]['ROUGE-L'].append(rouge_scores['rouge-l']['f'])
                averages[ai][version]['BERTScore'].append(bertscore)
                averages[ai][version]['CHRFP+'].append(chrf_plus_score)
                averages[ai][version]['CHRFP++'].append(chrf_plusplus_score)

    # 平均スコアの計算と出力
    for ai in generate_ai:
        versions = gpt_versions if ai == 'gpt' else gemini_versions
        for version in versions:
            avg_scores = {metric: sum(scores) / len(scores) for metric, scores in averages[ai][version].items()}
            csvwriter.writerow([ai, f'{version} Average', '', avg_scores['CHRF'], avg_scores['BLEU'], avg_scores['ROUGE-1'], avg_scores['ROUGE-2'], avg_scores['ROUGE-L'], avg_scores['BERTScore'], avg_scores['CHRFP+'], avg_scores['CHRFP++']])
            print(f"{ai} {version} Average, CHRF: {avg_scores['CHRF']}, BLEU: {avg_scores['BLEU']}, ROUGE-1: {avg_scores['ROUGE-1']}, ROUGE-2: {avg_scores['ROUGE-2']}, ROUGE-L: {avg_scores['ROUGE-L']}, BERTScore: {avg_scores['BERTScore']}, CHRFP+: {avg_scores['CHRFP+']}, CHRFP++: {avg_scores['CHRFP++']}")


# 6. スコアの視覚的要約を画像で表示

# 評価指標のリストを定義（BERTScoreを追加）
metrics = ['CHRF', 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore', 'CHRFP+', 'CHRFP++']

#各指標のスコアを日付ごとに表示する関数
def plot_scores1(averages, generate_ai, gpt_versions, gemini_versions):
    for metric in metrics:
        plt.figure(figsize=(10, 5))
        for ai in generate_ai:
            versions = gpt_versions if ai == 'gpt' else gemini_versions
            for version in versions:
                plt.plot(dates, averages[ai][version][metric], label=f'{ai} {version} {metric}')
        plt.xlabel('Date')
        plt.ylabel(metric)
        plt.title(f'Comparison of {metric} Scores')
        plt.xticks(rotation=45)
        plt.legend()
        plt.show()

plot_scores1(averages, generate_ai, gpt_versions, gemini_versions)

# 各指標のベストスコアを追加する関数
def plot_scores2(averages, generate_ai, gpt_versions, gemini_versions, dates):
    # chrF+ と chrF++ のベストスコアを追加
    best_scores = {'CHRF': 100, 'BLEU': 100, 'ROUGE-1': 1, 'ROUGE-2': 1, 'ROUGE-L': 1, 'BERTScore': 1, 'CHRFP+': 100, 'CHRFP++': 100}

    for metric in metrics:
        plt.figure(figsize=(10, 5))
        for ai in generate_ai:
            versions = gpt_versions if ai == 'gpt' else gemini_versions
            for version in versions:
                plt.plot(dates, averages[ai][version][metric], label=f'{ai} {version} {metric}')

        # ベストスコアのラインを描画する
        if metric in best_scores:
            plt.axhline(y=best_scores[metric], color='r', linestyle='--', label='Best Score')

        plt.xlabel('Date')
        plt.ylabel(metric)
        plt.title(f'Comparison of {metric} Scores')
        plt.xticks(rotation=45)
        plt.legend()
        plt.show()

plot_scores2(averages, generate_ai, gpt_versions, gemini_versions, dates)

#各AIとバージョンごとの平均スコアをプロットする関数
def plot_all_averages(averages, generate_ai, gpt_versions, gemini_versions):
    plt.figure(figsize=(15, 10))
    for ai in generate_ai:
        versions = gpt_versions if ai == 'gpt' else gemini_versions
        for version in versions:
            for metric in metrics:
                avg_score = sum(averages[ai][version][metric]) / len(averages[ai][version][metric])
                plt.bar(f'{ai} {version} {metric}', avg_score)

    plt.xlabel('Model and Metric')
    plt.ylabel('Average Score')
    plt.title('Average Scores of All Models and Metrics')
    plt.xticks(rotation=90)
    plt.show()

plot_all_averages(averages, generate_ai, gpt_versions, gemini_versions)

#各AIとバージョンごとに全指標の平均スコアを計算して表示する関数
def plot_combined_average_scores(averages, generate_ai, gpt_versions, gemini_versions):
    combined_averages = {}
    plt.figure(figsize=(10, 5))
    for ai in generate_ai:
        versions = gpt_versions if ai == 'gpt' else gemini_versions
        for version in versions:
            total_score = 0
            count = 0
            for metric in metrics:
                total_score += sum(averages[ai][version][metric])
                count += len(averages[ai][version][metric])
            combined_average = total_score / count if count > 0 else 0
            combined_averages[f'{ai} {version}'] = combined_average

    plt.bar(combined_averages.keys(), combined_averages.values())
    plt.xlabel('Model and Version')
    plt.ylabel('Combined Average Score')
    plt.title('Combined Average Scores for Each Model and Version')
    plt.xticks(rotation=45)
    plt.show()

plot_combined_average_scores(averages, generate_ai, gpt_versions, gemini_versions)



# 総合的な性能を評価して最も優れたモデルを選択
best_model = None
best_model_avg_score = 0

for ai in generate_ai:
    versions = gpt_versions if ai == 'gpt' else gemini_versions
    for version in versions:
        # 最新の日付における平均スコアを計算
        avg_score = sum(averages[ai][version][metric][-1] for metric in metrics) / len(metrics)
        if avg_score > best_model_avg_score:
            best_model_avg_score = avg_score
            best_model = f'{ai} {version}'

print(f'総合的に最も優れたモデルは {best_model} です。')


# 選択されたモデルの平均スコアのグラフを表示
ai, version = best_model.split()
def plot_average_scores(averages, ai, version, metrics):
    plt.figure(figsize=(10, 5))
    for metric in metrics:
        plt.plot(dates, averages[ai][version][metric], label=f'{metric}')
    plt.xlabel('Date')
    plt.ylabel('Score')
    plt.title(f'Average Scores of {ai} {version}')
    plt.xticks(rotation=45)
    plt.legend()
    plt.show()

plot_average_scores(averages, ai, version, metrics)