#### Dictionary-based sentiment analysis of Japanese text.
#### For each row of an input CSV, this script tokenizes the text with SudachiPy, counts the occurrences of
#### positive and negative words from a predefined polarity dictionary, and calculates an overall sentiment score.
#### The per-row word counts and scores are then written to an output CSV.

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the CSV file names used by this notebook are relative paths, so they are
# resolved against the current working directory rather than this mount point — confirm
# the files are reachable from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the SudachiPy tokenizer and its core dictionary.
# This line is an IPython shell escape, so it only runs inside a notebook cell.
!pip install sudachipy sudachidict_core

In [None]:
import pandas as pd
import re
from sudachipy import tokenizer, dictionary

# Sudachi tokenizer setup. These module-level objects are shared by analyze_text.
# A single tokenizer instance is created here and reused for every call.
tokenizer_obj = dictionary.Dictionary().create()
# Split mode C. NOTE(review): per the SudachiPy documentation this is the
# longest-unit segmentation — confirm it is the granularity the polarity
# dictionary entries were written for.
mode = tokenizer.Tokenizer.SplitMode.C
# Maximum number of characters handed to the tokenizer in one call; analyze_text
# slices longer text into pieces of this size.
# NOTE(review): presumably chosen to stay below Sudachi's input-size limit — confirm.
MAX_TEXT_LENGTH = 10000

def load_word_list(word_list_csv):
    """Load the polarity dictionary from a CSV file.

    Args:
        word_list_csv: Location of the CSV (anything ``pandas.read_csv``
            accepts). The file must contain a ``word`` column and a
            ``score`` column; other columns are ignored.

    Returns:
        A dict mapping each value of the ``word`` column to the value of the
        ``score`` column on the same row.
    """
    polarity_table = pd.read_csv(word_list_csv)
    # Index the score column by word, then convert it to a plain dict.
    scores_by_word = polarity_table.set_index('word')['score']
    return scores_by_word.to_dict()

def preprocess_text(text):
    """Clean one line of text before it is tokenized.

    The first character of ``text`` is always discarded, after which every
    bracketed ``【...】`` span (matched non-greedily, within a single line)
    is removed.

    NOTE(review): the first character is dropped unconditionally, whatever it
    is — presumably the input lines start with an indent or marker character;
    confirm against the data set.

    Args:
        text: A single line of text.

    Returns:
        The cleaned string.
    """
    without_first_char = text[1:]
    return re.sub(r'【.*?】', '', without_first_char)

def chunk_text(text, max_length):
    """Split ``text`` into consecutive pieces of at most ``max_length`` characters.

    The pieces are cut at fixed character offsets, so a cut can fall in the
    middle of a word.

    Args:
        text: The string to split.
        max_length: Size of every piece except possibly the last one.

    Returns:
        A list of the pieces in order; an empty list when ``text`` is empty.
    """
    pieces = []
    for start in range(0, len(text), max_length):
        pieces.append(text[start:start + max_length])
    return pieces

def analyze_text(text, word_list):
    """Count how often each dictionary word appears in ``text``.

    The text is cut into pieces of at most ``MAX_TEXT_LENGTH`` characters,
    each piece is tokenized with the module-level Sudachi tokenizer, and a
    token is counted when its surface form is a key of ``word_list``.

    Args:
        text: The string to analyze.
        word_list: Mapping whose keys are the words to look for.

    Returns:
        A dict mapping each matched surface form to its number of occurrences.
    """
    tally = {}
    for piece in chunk_text(text, MAX_TEXT_LENGTH):
        for morpheme in tokenizer_obj.tokenize(piece, mode):
            surface = morpheme.surface()
            # Only words present in the polarity dictionary are counted.
            if surface not in word_list:
                continue
            tally[surface] = tally.get(surface, 0) + 1
    return tally

def analyze_line_by_line(text, word_list):
    """Count dictionary words over a multi-line text, one line at a time.

    The text is split on ``'\\n'``; every line is cleaned with
    ``preprocess_text`` and counted with ``analyze_text``, and the per-line
    counts are summed.

    Args:
        text: The full text, possibly containing several lines.
        word_list: Mapping whose keys are the words to look for.

    Returns:
        A dict mapping each matched word to its total count over all lines.
    """
    totals = {}
    for raw_line in text.split('\n'):
        per_line = analyze_text(preprocess_text(raw_line), word_list)
        # Fold this line's counts into the running totals.
        for term, hits in per_line.items():
            totals[term] = totals.get(term, 0) + hits
    return totals

def calculate_scores(word_count, word_list):
    """Aggregate word counts into positive, negative and total scores.

    Each word contributes ``word_list[word] * count``. Contributions above
    zero go to the positive totals, contributions below zero to the negative
    totals, and anything else is ignored.

    Args:
        word_count: Mapping of word to number of occurrences.
        word_list: Mapping of word to polarity score; must contain every key
            of ``word_count``.

    Returns:
        A tuple ``(pos_count, pos_score, neg_count, neg_score, total_score)``
        where ``total_score`` is ``pos_score + neg_score``.
    """
    positive = {'count': 0, 'score': 0}
    negative = {'count': 0, 'score': 0}
    for term, occurrences in word_count.items():
        weighted = word_list[term] * occurrences
        if weighted > 0:
            bucket = positive
        elif weighted < 0:
            bucket = negative
        else:
            # Neither above nor below zero: contributes nothing.
            continue
        bucket['count'] += occurrences
        bucket['score'] += weighted
    combined = positive['score'] + negative['score']
    return (
        positive['count'],
        positive['score'],
        negative['count'],
        negative['score'],
        combined,
    )

def main(word_list_csv, text_csv, output_csv):
    """Score every text in ``text_csv`` and write the results to ``output_csv``.

    For each row of the input CSV, the text in the sixth column (position 5)
    is analyzed line by line against the polarity dictionary. The columns at
    positions 0-4, 6 and 7 are copied to the output unchanged, followed by
    the positive word count, positive score sum, negative word count,
    negative score sum and total score.

    Args:
        word_list_csv: Path of the polarity dictionary CSV (``word`` and
            ``score`` columns).
        text_csv: Path of the input CSV; it needs at least eight columns.
        output_csv: Path the result CSV is written to (without the index).
    """
    word_list = load_word_list(word_list_csv)
    df = pd.read_csv(text_csv)

    # Column positions: the text to analyze, and the columns copied through.
    text_position = 5
    passthrough_positions = (0, 1, 2, 3, 4, 6, 7)

    results = []
    for i, (_, row_data) in enumerate(df.iterrows()):
        # Rows are indexed by column label, so positional access must go
        # through .iloc; plain integer keys are treated as labels by current
        # pandas and no longer fall back to positions.
        raw_text = row_data.iloc[text_position]
        # An empty CSV cell is read as NaN (a float); score it as empty text
        # instead of failing on the string operations downstream.
        text = '' if pd.isna(raw_text) else str(raw_text)
        word_count = analyze_line_by_line(text, word_list)
        pos_count, pos_score, neg_count, neg_score, total_score = calculate_scores(word_count, word_list)
        results.append(
            [row_data.iloc[position] for position in passthrough_positions]
            + [pos_count, pos_score, neg_count, neg_score, total_score]
        )
        if i % 100 == 0:
            print(f"Processing row {i}")

    # English header for the output CSV:
    # ['name', 'ticker', 'nkcode', 'Reporting Date', 'Closing Date', 'year', 'month',
    #  'Positive Word Count', 'Total Positive Score', 'Negative Word Count', 'Total Negative Score', 'Total Score']
    result_df = pd.DataFrame(results, columns=[
        'name', 'ticker', 'nkcode', '報告日', '決算日', 'year', 'month',
        '正の単語数', '正のスコア合計', '負の単語数', '負のスコア合計', 'スコア合計'
    ])
    result_df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")

In [None]:
# File names for this run (relative paths).
# Polarity dictionary CSV; load_word_list reads its 'word' and 'score' columns.
word_list_csv = 'polarity_dic_News.csv'
# Input CSV; main analyzes the text stored in its sixth column.
text_csv = 'MDA_DataSet_2014_2022_TSE1.csv'
# Destination CSV for the per-row counts and scores.
output_csv = 'MDA_DataSet_2014_2022_TSE1_dic.csv'

# Run the whole pipeline: load the dictionary, score every row, write the results.
main(word_list_csv, text_csv, output_csv)