### Calculate the sentiment (positive/negative) of a text from the scores of the dictionary words it contains

In [None]:
# Mount Google Drive at /content/drive so that the data files referenced
# below (all located under /content/drive/MyDrive/...) are reachable from
# this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory on the mounted drive to use as the working directory.
path = '/content/drive/MyDrive/Kenkyu/Finance/Data/'
# IPython magics (not plain Python): change into `path`, then list its
# contents. Keep comments on their own lines; text after a magic command is
# passed to the command as arguments.
%cd $path
%ls

In [None]:
import csv
import re
from collections import defaultdict

def load_words_and_scores_from_csv(file_path):
    """Load a word-polarity dictionary from a CSV file.

    The file must start with a header row containing an unnamed first column
    (header is the empty string; its value is stored as the word id), a
    ``word`` column and a ``score`` column.

    Args:
        file_path: Path of the comma-separated dictionary file (UTF-8, with
            or without a byte-order mark).

    Returns:
        A dict mapping each word to ``{'id': <id string>, 'score': <float>}``.
        If a word appears on several rows, the last row wins.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError: If a score cannot be converted to ``float``.
    """
    words_and_scores = {}
    # 'utf-8-sig' transparently drops a leading BOM (common in CSVs exported
    # from Excel); with plain 'utf-8' the BOM would become part of the first
    # header name and the row[''] lookup below would fail.
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires.
    with open(file_path, mode='r', encoding='utf-8-sig', newline='') as file:
        reader = csv.DictReader(file, delimiter=',')
        for row in reader:
            word_id = row['']
            word = row['word']
            score = float(row['score'])
            words_and_scores[word] = {'id': word_id, 'score': score}
    return words_and_scores

def count_words_and_sum_scores_in_text(words_and_scores, text):
    """Score ``text`` against a polarity dictionary, longest words first.

    Dictionary words are searched as plain substrings, in order of decreasing
    length. Every occurrence of a matched word is masked out of the working
    copy of the text before shorter words are tried, so a short word that is
    part of an already counted longer word is not counted again.

    Args:
        words_and_scores: Dict mapping each word to a dict that has at least
            a numeric ``'score'`` entry. Words are assumed not to contain the
            NUL character, which is used as the mask.
        text: The text to analyse.

    Returns:
        A 6-tuple ``(total_positive_score, total_negative_score,
        positive_word_count, negative_word_count, total_score,
        matched_words_and_scores)``. The word counts are numbers of
        occurrences; ``total_score`` is the sum of the positive and negative
        totals; ``matched_words_and_scores`` lists ``(word, score)`` once per
        distinct matched word, in matching order (zero-score words included).
    """
    total_positive_score = 0
    total_negative_score = 0
    positive_word_count = 0
    negative_word_count = 0
    matched_words_and_scores = []

    sorted_words = sorted(words_and_scores, key=len, reverse=True)

    # Matched words are replaced by this sentinel rather than deleted:
    # deleting would join the text on both sides of the match and could
    # create occurrences of other words that are not in the original text.
    mask = "\0"

    remaining_text = text
    for word in sorted_words:
        if not word:
            # An empty word "occurs" at every position; never count it.
            continue
        count = remaining_text.count(word)
        if count == 0:
            continue

        score = words_and_scores[word]['score']
        matched_words_and_scores.append((word, score))

        word_total = count * score
        if word_total > 0:
            total_positive_score += word_total
            positive_word_count += count
        elif word_total < 0:
            total_negative_score += word_total
            negative_word_count += count

        remaining_text = remaining_text.replace(word, mask)

    return (
        total_positive_score,
        total_negative_score,
        positive_word_count,
        negative_word_count,
        total_positive_score + total_negative_score,
        matched_words_and_scores,
    )

#### テキスト分析

In [None]:
def analyze_csv(input_csv, words_and_scores_csv, output_csv, matched_words_output_csv):
    """Score every text in ``input_csv`` and write two result CSV files.

    The first line of ``input_csv`` is skipped as a header. For each
    remaining row, column 0 is used as the name, column 2 as the report date
    and column 4 as the text to score.

    Args:
        input_csv: Path of the UTF-8 CSV file holding the texts.
        words_and_scores_csv: Path of the polarity dictionary, loaded with
            ``load_words_and_scores_from_csv``.
        output_csv: Path of the per-text summary to write (one row per input
            row: word counts and score totals).
        matched_words_output_csv: Path of the detail file to write (one row
            per distinct dictionary word matched in each text).

    Raises:
        IndexError: If a non-blank data row has fewer than five columns.
    """
    words_and_scores = load_words_and_scores_from_csv(words_and_scores_csv)

    # Fixed prefix (it reads like a report section heading, followed by a
    # space) that is stripped from the start of a text before scoring.
    remove_string = "３【経営者による財政状態、経営成績及びキャッシュ・フローの状況の分析】 "

    # newline='' on every file, as the csv module requires, so that newlines
    # inside quoted fields are handled by the csv module itself.
    with open(input_csv, mode='r', encoding='utf-8', newline='') as infile, \
            open(output_csv, mode='w', encoding='utf-8', newline='') as outfile, \
            open(matched_words_output_csv, mode='w', encoding='utf-8', newline='') as matched_file:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        matched_writer = csv.writer(matched_file)

        writer.writerow(['name', '報告日', '正の単語数', '正のスコア合計', '負の単語数', '負のスコア合計', 'スコア合計'])
        matched_writer.writerow(['name', '報告日', 'word', 'score'])

        # Skip the header line. The default keeps an empty input file from
        # raising StopIteration; the outputs then contain only their headers.
        next(reader, None)

        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; nothing to score.
                continue

            name = row[0]
            date = row[2]
            text = row[4]

            if text.startswith(remove_string):
                text = text[len(remove_string):]

            (
                total_positive_score,
                total_negative_score,
                positive_word_count,
                negative_word_count,
                total_score,
                matched_words_and_scores,
            ) = count_words_and_sum_scores_in_text(words_and_scores, text)

            writer.writerow([
                name,
                date,
                positive_word_count,
                total_positive_score,
                negative_word_count,
                total_negative_score,
                total_score,
            ])

            for word, score in matched_words_and_scores:
                matched_writer.writerow([name, date, word, score])

In [None]:
# Inputs: the CSV holding the texts to score and the polarity dictionary.
input_csv = '/content/drive/MyDrive/Kenkyu/Finance/2024/data/nikkei225_2021_2022.csv'
words_and_scores_csv = '/content/drive/MyDrive/Kenkyu/Finance/2024/data/polarity_dic_News.csv'
# Outputs: the per-text summary and the list of matched words per text.
# Both files are opened in 'w' mode by analyze_csv, so existing files at
# these paths are overwritten.
output_csv = '/content/drive/MyDrive/Kenkyu/Finance/2024/data/04_sentiment_dic.csv'
matched_words_output_csv = '/content/drive/MyDrive/Kenkyu/Finance/2024/data/04_matched_words.csv'

# Run the analysis and report where the results were written.
analyze_csv(input_csv, words_and_scores_csv, output_csv, matched_words_output_csv)
print(f"Analysis complete. Results are saved in {output_csv} and {matched_words_output_csv}")