In [37]:
import os
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import MeCab
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense

# データセット準備

## 入力値

In [25]:
# 有価証券報告書のテキストデータを持ってくる
code_and_return_and_contetn = pd.read_csv('../datasets/datasets_for_training.csv')
docs = code_and_return_and_contetn['経営方針'].tolist()
stock_return = code_and_return_and_contetn['return'].tolist()

In [5]:
# mecabで分かち書きに
def wakachi(docs):
    processed_docs = []

    for doc in docs:
        m = MeCab.Tagger("-Ochasen")

        # 形態素解析の結果を格納するリスト
        nouns = []

        # 形態素解析の結果を解析
        node = m.parse(doc).split("\n")
        for i in node:
            if i == "EOS" or i == "":
                continue
            else:
                chunk = i.split("\t")
                pos = chunk[3].split("-")[0]  # 品詞情報がある部分を取得
                if pos == "名詞":  # 名詞のみを取得
                    nouns.append(chunk[0])
        nouns = " ".join(nouns)

        processed_docs.append(nouns)

    return processed_docs

In [51]:
def wordnet_tfidf(processed_docs):
    # モデルの生成
    vectorizer = TfidfVectorizer(smooth_idf = False)
    # TF-IDFの計算
    values = vectorizer.fit_transform(processed_docs).toarray()
    # 特徴量ラベルの取得
    words = vectorizer.get_feature_names()

    #結果のプリント
    print(values)
    print(type(values))
    print(values.shape)
    print(words)
    print(type(words))

    
    # CSVファイルのパスを指定
    input_dir = os.path.join("..", "datasets")
    input_file = os.path.join(input_dir, "keywords_word2vec.csv") # keywords_mecabもしくはkeywords_wordnetを選ぶ

    # CSVファイルを読み込み、リストに変換
    keywords = []

    with open(input_file, 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            keywords.append(row[0])

    tfidf_before_keywords = pd.DataFrame(values, columns=words)

    tfidf = pd.DataFrame()

    for col in keywords:
        if col in tfidf_before_keywords.columns:
            tfidf[col] = tfidf_before_keywords[col]
        else:
            tfidf[col] = 0

    tfidf = tfidf.fillna(0)

    return tfidf
    

In [52]:
wakachi_docs = wakachi(docs)
wordnet_tfidf = wordnet_tfidf(wakachi_docs)

[[0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 ...
 [0.        0.1164114 0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]]
<class 'numpy.ndarray'>
(567, 13705)
['00', '000', '0000', '0001', '000100', '00011', '00013', '000136', '00014', '000141', '000145', '000147', '00016', '000162', '000170', '000180', '000185', '00019', '0002', '000200', '00022', '00023', '00027', '0003', '00032', '000360', '000367', '0004', '00040', '000409', '00041', '00042', '00045', '000472', '00049', '0005', '000500', '00056', '0006', '00060', '0008', '000800', '00082', '001', '002', '003', '0030', '004', '006', '0083', '01', '010', '011', '0112', '012', '0122', '012274', '0132', '01341', '014', '0147', '015', '0151', '017', '018',

  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = 0
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = 0
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]
  tfidf[col] = tfidf_before_keywords[col]


In [55]:
wordnet_tfidf

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,...,280,281,282,283,284,285,288,290,295,298
0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.079549,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.047186,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
563,0.014643,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
564,0.000000,0.000000,0.000000,0.0,0.034374,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
565,0.000000,0.161726,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
wordnet_tfidf = wordnet_tfidf.loc[:, (wordnet_tfidf != 0).any(axis=0)]

## 目標値

In [26]:
stock_return = [1 if i > 0 else 0 for i in stock_return]
stock_return = np.array(stock_return)
stock_return = pd.DataFrame(stock_return)
# 一列目のカラム名を取得
first_column_name = stock_return.columns[0]
# 一列目のカラム名を'return'に変更
stock_return = stock_return.rename(columns={first_column_name: 'return'})

In [27]:
stock_return

Unnamed: 0,return
0,1
1,1
2,0
3,1
4,1
...,...
562,0
563,1
564,1
565,1


# データセットの調整

## データセットを上昇下降半分半分にする

In [56]:
# 0と1のそれぞれの個数をカウント
count_0 = stock_return[stock_return['return'] == 0].shape[0]
count_1 = stock_return[stock_return['return'] == 1].shape[0]

# 小さい方の個数に合わせてランダムサンプリング
min_count = min(count_0, count_1)

sampled_0 = stock_return[stock_return['return'] == 0].sample(n=min_count)
sampled_1 = stock_return[stock_return['return'] == 1].sample(n=min_count)

# ランダムサンプリングしたデータを結合
balanced_stock_return = pd.concat([sampled_0, sampled_1]).sort_index()
balanced_stock_return = balanced_stock_return.reset_index(drop=True)

# wordnet_tfidfも同じインデックスのものだけを抜き出す
balanced_wordnet_tfidf = wordnet_tfidf.loc[balanced_stock_return.index]
balanced_wordnet_tfidf = balanced_wordnet_tfidf.reset_index(drop=True)

## 最終調整

In [57]:
x_train, x_test, y_train, y_test = train_test_split(balanced_wordnet_tfidf, balanced_stock_return, test_size=0.3, random_state=1) 

# 学習・予測

## モデル学習

In [58]:
xgb_model = XGBClassifier()
xgb_model.fit(x_train, y_train)
Y_pred_xgb = xgb_model.predict(x_test)
print(accuracy_score(y_test, Y_pred_xgb))

0.5042016806722689


In [59]:
# モデルの定義
model = Sequential()
model.add(Dense(8, input_dim=x_train.shape[1], activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))  # 出力層　回帰では恒等関数、２クラス分類ではシグモイド、多クラス分類ではソフトマックス

# モデルのコンパイル
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 訓練
model.fit(x_train, y_train, epochs=1, batch_size=10)

# 評価
loss, accuracy = model.evaluate(x_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

predictions = model.predict(x_test)
predicted_labels = np.argmax(predictions, axis=1)
print(predicted_labels)

Loss:  0.0
Accuracy:  0.5210084319114685
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]


## 実際に予測

In [None]:
datasets_for_predicting = pd.read_csv('../datasets/datasets_for_predicting.csv')

In [None]:
datasets_for_predicting = datasets_for_predicting.drop_duplicates(subset='name')

In [None]:
datasets_for_predicting = datasets_for_predicting.reset_index(drop=True)

In [None]:
pred_docs = datasets_for_predicting['経営方針'].tolist()

In [None]:
processed_pred_docs = []

for doc in pred_docs:
    m = MeCab.Tagger("-Ochasen")

    # 形態素解析の結果を格納するリスト
    nouns = []

    # 形態素解析の結果を解析
    node = m.parse(doc).split("\n")
    for i in node:
        if i == "EOS" or i == "":
            continue
        else:
            chunk = i.split("\t")
            pos = chunk[3].split("-")[0]  # 品詞情報がある部分を取得
            if pos == "名詞":  # 名詞のみを取得
                nouns.append(chunk[0])
    # 重要！！！！　freaquencyの時は下の１行を消す。tfidfの時は下の１行必要！！！doc2vecの時も必要！！テキストがカンマで区切られるか、それとも半角スペースで区切られるか
    nouns = " ".join(nouns)

    processed_pred_docs.append(nouns)

print(processed_pred_docs[0])

１ 経営 方針 経営 環境 対処 課題 等 文中 将来 事項 有価 証券 報告 書 提出 日 現在 当社 グループ 当社 連結 子会社 判断 もの ( 1 ) 会社 経営 基本 方針 当社 グループ 創造 工夫 前進 品質 ため 創造 奉仕 顧客 利益 社会 建設 ため 奉仕 協力 私 達 幸福 心 結び つき ため 協力 経営 理念 実現 事業 目的 世界 未来 企業 ビジョン 当社 グループ 世界 未来 企業 ため 企業 社会 人 調和 中 存在 認識 もと 地域 社会 国際 社会 発展 貢献 地球 環境 保全 事業 活動 推進 全て ステーク ホルダー 期待 企業 価値 最大 化 こと 経営 方針 タダノ グループ ＣＳＲ 憲章 ）( 2 ) 経営 環境 当社 グループ 当社 2022 年 ６月 24 日 開催 74 回 定時 株主 総会 定款 一部 変更 件 承認 こと 今期 決算 期 事業 年度 末日 ３月 31 日 12 月 31 日 変更 決算 期 統一 連結 会計 年度 決算 期 変更 経過 期間 当社 ３月 決算 連結 対象 子会社 ９ か月 間 2022 年 ４月 １ 日 ～ 2022 年 12 月 31 日 12 月 決算 連結 対象 子会社 12 か月 間 2022 年 １月 １ 日 ～ 2022 年 12 月 31 日 連結 対象 期間 変則 決算 連結 会計 年度 わが国 経済 新型 コロナ ウイルス 感染 拡大 防止 行動 制限 緩和 経済 活動 正常 化 こと 持ち直し 動き 海外 経済 活動 再開 段階 的 景気 緩やか 回復 ロシア ウクライナ 問題 長期 化 中国 ロック ダウン 急激 インフレ ・ 円 安 進行 原材料 価格 高騰 調達 物流 環境 悪化 私 ども 業界 調達 環境 悪化 生産 影響 出荷 遅れ 日本 大型 公共 工事 中心 順調 稼働 生産 出荷 遅れ 需要 減少 海外 緩やか 景気 回復 背景 全て 地域 需要 増加 傾向 よう 経営 環境 中 当社 グループ 調達 環境 悪化 影響 最小限 よう 販売 価格 見直し 経費 節減 等 電動 化 環境 対応 はじめ 製品 開発 DX 推進 注力 2022 年 暦年 建設 用 クレーン 地域 別 需要 台数 新型 コロナ ウイルス 感染 症 影響 前 推移 以下

In [None]:
# tfidf

# モデルの生成
vectorizer = TfidfVectorizer(smooth_idf = False)
 
# TF-IDFの計算
pred_values = vectorizer.fit_transform(processed_pred_docs).toarray()
 
# 特徴量ラベルの取得
pred_words = vectorizer.get_feature_names()


#結果のプリント
print(pred_values)
print(type(pred_values))
print(pred_values.shape)
print(pred_words)
print(type(pred_words))

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.01790739 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
<class 'numpy.ndarray'>
(2412, 29563)
<class 'list'>


In [None]:
# CSVファイルのパスを指定
input_dir = os.path.join("..", "datasets")
input_file = os.path.join(input_dir, "keywords_wordnet.csv") # keywords_mecabもしくはkeywords_wordnetを選ぶ

# CSVファイルを読み込み、リストに変換
keywords = []

with open(input_file, 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        keywords.append(row[0])

In [None]:
pred_tfidf_before_keywords = pd.DataFrame(pred_values, columns=pred_words)

pred_tfidf = pd.DataFrame()

for col in keywords:
    if col in pred_tfidf_before_keywords.columns:
        pred_tfidf[col] = pred_tfidf_before_keywords[col]
    else:
        pred_tfidf[col] = 0

pred_tfidf = pred_tfidf.fillna(0)
# tfidf = tfidf.values
# print(type(tfidf))
# print(tfidf.shape)
# tfidf[np.isnan(tfidf)] = 0

In [None]:
pred_tfidf

In [None]:
pred_xgb = xgb_model.predict(pred_tfidf)

In [None]:
indices = [i for i, value in enumerate(pred_xgb) if value == 1]
print(indices)

[0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96, 98, 99, 100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 152, 153, 154, 155, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 194, 195, 197, 198, 199, 201, 202, 203, 204, 205, 206, 207, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 232, 233, 234, 235, 236, 238, 239, 240, 243, 244, 245,

In [None]:
print(len(indices))
index = indices

2142


In [None]:
first_selected_company = datasets_for_predicting.loc[index]

In [None]:
first_selected_company.to_csv('first_selected_company.csv')

In [None]:
print(first_selected_company['name'].to_list())