In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt') # https://www.nltk.org/api/nltk.tokenize.punkt.html
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from scipy.spatial.distance import squareform, pdist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import gensim
import logging
logging.basicConfig(format='(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import lightgbm as lgb

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nakatayuki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Import Data

In [3]:
train_df = pd.read_csv(f'data/train.csv')
test_df = pd.read_csv(f'data/test.csv')
sample_submit_df = pd.read_csv(f'data/sample_submit.csv')

In [4]:
"""
特定の記号を空白に置換し、英数字以外を除去して小文字に統一するためのテキストクリーニング関数
"""
def clean_text(text):
    list_of_cleaning_signs = ['\x0c', '\n']
    for sign in list_of_cleaning_signs:
        text = text.replace(sign, ' ')

    # `re.sub()` では
    # 第一引数に正規表現パターン、
    # 第二引数に置換先文字列、
    # 第三引数に処理対象の文字列を指定する。
    clean_text = re.sub('[^a-zA-Z]+', ' ', text)
    return clean_text.lower()

"""
文にトークン化し、さらに単語にトークン化して、
英字のみをフィルタし、それぞれの単語を基本形に変換する

re.searchの利用例：

```
s = 'aaa@xxx.com bbb@yyy.net ccc@zzz.org'

print(re.search(r'[a-z]+@[a-z]+\.net', s))
# <re.Match object; span=(12, 23), match='bbb@yyy.net'>

print(re.search(r'[a-z]+@[a-z]+\.[a-z]+', s))
# <re.Match object; span=(0, 11), match='aaa@xxx.com'>
```

"""
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


"""
TF-IDFスコアが高い上位N個の特徴を選択する
"""
def top_tfidf_feats(row, terms, top_n=25):
    top_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [terms[i] for i in top_ids]
    return top_feats


"""
テキストからTF-IDFによるキーワード抽出を行う。
ここでトークナイザーとしてステミング関数を使用
"""
def extract_tfidf_keywords(texts, top_n=25):
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.95, max_features=2000000,
        min_df=0.05, stop_words="english",
        use_idf=True, tokenizer=tokenize_and_stem,
        ngram_range=(1,3)
    )

    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    terms = tfidf_vectorizer.get_feature_names_out()

    arr = []
    for i in range(0, tfidf_matrix.shape[0]):
        row = np.squeeze(tfidf_matrix[i].toarray())
        feats = top_tfidf_feats(row, terms, top_n)
        arr.append(feats)
    return arr, tfidf_vectorizer


def create_tfidf_features_df(text):
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.95, max_features=2000000,
        min_df=0.05, stop_words="english",
        use_idf=True, tokenizer=tokenize_and_stem,
        ngram_range=(1,3)
    )

    tfidf_matrix = tfidf_vectorizer.fit_transform(papers_data['Abstract_clean'])
    terms = tfidf_vectorizer.get_feature_names_out()
    df_features = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out()
    )

    return df_features

def document_vector(doc):
    """文書に含まれる単語のベクトルの平均を計算"""
    return np.mean([word2vec_model.wv[word] for word in doc if word in word2vec_model.wv], axis=0)


## Pre-Processing

In [5]:
papers_data = train_df

papers_data['abstract'] = papers_data['abstract'].fillna('')

# clean text
papers_data['Title_clean'] = papers_data['title'].apply(lambda x:clean_text(x))
papers_data['Abstract_clean'] = papers_data['abstract'].apply(lambda x:clean_text(x))

# tf-idf
abstract2kw, abstract2kw_vectorizer = extract_tfidf_keywords(papers_data['Abstract_clean'], 20)
title2kw, title2kw_vectorizer = extract_tfidf_keywords(papers_data['Title_clean'], 20)

# 文書のベクトル表現を特徴量化する
abstract2kw_features_df = create_tfidf_features_df(papers_data['Abstract_clean'])
title2kw_features_df = create_tfidf_features_df(papers_data['Title_clean'])



In [None]:
"""
step two:
word2vec representation
"""
word2vec_model = gensim.models.Word2Vec(abstract2kw+title2kw, vector_size=100, window=5, workers=4)

(asctime)s : INFO : collecting all words and their counts
(asctime)s : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
(asctime)s : INFO : PROGRESS: at sentence #10000, processed 200000 words, keeping 324 word types
(asctime)s : INFO : PROGRESS: at sentence #20000, processed 400000 words, keeping 324 word types
(asctime)s : INFO : PROGRESS: at sentence #30000, processed 600000 words, keeping 324 word types
(asctime)s : INFO : PROGRESS: at sentence #40000, processed 800000 words, keeping 324 word types
(asctime)s : INFO : PROGRESS: at sentence #50000, processed 1000000 words, keeping 324 word types
(asctime)s : INFO : collected 324 word types from a corpus of 1085800 raw words and 54290 sentences
(asctime)s : INFO : Creating a fresh vocabulary
(asctime)s : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 324 unique words (100.00% of original 324, drops 0)', 'datetime': '2024-02-14T09:15:28.704711', 'gensim': '4.3.2', 'python': '3.9.18 | pac

In [None]:
"""
step three:
average top-n keywords vectors and compute similarities

このコードは、論文の抽象要旨（abstract）とタイトル（title）から抽出したキーワードを用いて、各論文のベクトル表現（doc2vecs）を生成し、それらの間の類似度を計算しています。プロセスは以下のステップに分けられます：

1. キーワードベクトルの平均化:
  - 各論文について、抽象要旨とタイトルから抽出したキーワードリスト（abstract2kwとtitle2kw）があります。
  - それぞれのキーワードについて、gensimのWord2Vecモデル（word2vec_model）を使用して、単語の埋め込みベクトルを取得します。このモデルは、単語を多次元空間上のベクトルとして表現し、単語の意味的な類似性を捉えます。
  - キーワードがWord2Vecモデルの語彙に含まれている場合、そのベクトルを取得し、論文のベクトル表現を計算するために使用します。論文のベクトル表現は、抽象要旨とタイトルのキーワードベクトルの平均で表されます。
  - 初期ベクトル（vec）は、100次元のゼロベクトルで始まり、キーワードのベクトルが見つかるたびに加算されます。この例では、ベクトルの次元数が100であると仮定していますが、実際の次元数はWord2Vecモデルによって異なります。

2. 類似度の計算:
  - すべての論文についてベクトル表現を計算した後、scipyライブラリのpdist関数を使用して、論文間のコサイン類似度を計算します。pdist関数は、与えられたベクトルのペア間の距離（この場合は類似度の逆）を計算します。
  - squareform関数を使用して、pdistから得られる距離ベクトルを距離行列に変換します。この行列の各要素は、対応する論文ペア間の類似度を表します。
  - このコードの主な目的は、論文の内容を基にして類似した論文を特定することです。キーワードの意味的な情報を利用することで、論文のトピックや内容が似ている度合いを定量的に評価することができます。
"""

doc2vecs = []
for i in range(len(abstract2kw)):
    vec = np.zeros(100)  # ベクトルの初期化をNumPy配列で行う
    for word in abstract2kw[i]:
        if word in word2vec_model.wv.key_to_index:  # 単語がモデルの語彙に含まれているかチェック
            vec += word2vec_model.wv[word]  # 単語ベクトルを加算

    for word in title2kw[i]:
        if word in word2vec_model.wv.key_to_index:  # 単語がモデルの語彙に含まれているかチェック
            vec += word2vec_model.wv[word]  # 単語ベクトルを加算

    doc2vecs.append(vec / (len(abstract2kw[i]) + len(title2kw[i])))  # 平均ベクトルを計算してリストに追加

similarities = squareform(pdist(doc2vecs, 'cosine'))

In [None]:
# word2vecsの出力を特徴量として利用するためにDataframeにする
df_features = pd.DataFrame(doc2vecs)

# 全てのdataframeを結合する
all_features_df = pd.concat([df_features, abstract2kw_features_df, title2kw_features_df], axis=1)

## LightGBM実装 with train Data



In [None]:
# 元データと、論文のベクトル化した値とtf-idfの結果を結合
lgb_train_concat_df = pd.concat([all_features_df, papers_data], axis=1)

# lightGBMにて学習する際に不要なカラムを削除
lgb_train_df = lgb_train_concat_df.drop(["title", "Title_clean", "abstract", "Abstract_clean", "id"], axis=1)


In [None]:
# 重複する列を削除する
lgb_train_df = lgb_train_df.loc[:, ~lgb_train_df.columns.duplicated()]

In [None]:

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(lgb_train_df, lgb_train_df["judgement"], test_size=0.2, random_state=42)

# LightGBMのデータセットに変換
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# パラメータの設定
params = {
    'objective': 'binary',  # 2クラス分類の場合
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}

# モデルの訓練
verbose_eval = 0  # この数字を1にすると学習時のスコア推移がコマンドライン表示される
bst = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(stopping_rounds=10,
                  verbose=True), # early_stopping用コールバック関数
                  lgb.log_evaluation(verbose_eval)] # コマンドライン出力用コールバック関数
    )

# テストデータでの予測
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
# 予測結果を二値（0または1）に変換
y_pred_binary = np.where(y_pred > 0.5, 1, 0)

# 精度の評価
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy}')

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.000278031
Accuracy: 1.0


# test_dfの分類

## Pre-Process test data

In [None]:
# test_papers_data = test_df

# test_papers_data['abstract'] = test_papers_data['abstract'].fillna('')

# # clean text
# test_papers_data['Title_clean'] = test_papers_data['title'].apply(lambda x:clean_text(x))
# test_papers_data['Abstract_clean'] = test_papers_data['abstract'].apply(lambda x:clean_text(x))

# # tf-idf
# test_abstract2kw = extract_tfidf_keywords(test_papers_data['Abstract_clean'], 20)
# test_title2kw = extract_tfidf_keywords(test_papers_data['Title_clean'], 20)

# # 文書のベクトル表現を特徴量化する
# test_abstract2kw_features_df = create_tfidf_features_df(test_papers_data['Abstract_clean'])
# test_title2kw_features_df = create_tfidf_features_df(test_papers_data['Title_clean'])

# test_doc2vecs = []
# for i in range(len(test_abstract2kw)):
#     vec = np.zeros(100)  # ベクトルの初期化をNumPy配列で行う
#     for word in test_abstract2kw[i]:
#         if word in word2vec_model.wv.key_to_index:  # 単語がモデルの語彙に含まれているかチェック
#             vec += word2vec_model.wv[word]  # 単語ベクトルを加算

#     for word in test_title2kw[i]:
#         if word in word2vec_model.wv.key_to_index:  # 単語がモデルの語彙に含まれているかチェック
#             vec += word2vec_model.wv[word]  # 単語ベクトルを加算

#     test_doc2vecs.append(vec / (len(test_abstract2kw[i]) + len(test_title2kw[i])))  # 平均ベクトルを計算してリストに追加


# # word2vecsの出力を特徴量として利用するためにDataframeにする
# test_df_features = pd.DataFrame(test_doc2vecs)

# # 全てのdataframeを結合する
# test_all_features_df = pd.concat([test_df_features, test_abstract2kw_features_df, test_title2kw_features_df], axis=1)

# # test_all_features_df.drop(["title", "Title_clean", "abstract", "Abstract_clean", "id"], axis=1)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

# 模擬的な関数定義
def clean_text(text):
    # ここでは簡単なテキストクリーニングを想定
    return text.lower().replace('[^a-zA-Z0-9]', ' ')

def extract_tfidf_keywords(vectorizer, texts, n_keywords=20):
    tfidf_matrix = vectorizer.transform(texts)
    keywords_list = []
    for row in tfidf_matrix:
        row = row.toarray().flatten()
        top_n_idxs = row.argsort()[-n_keywords:]
        keywords = [vectorizer.get_feature_names_out()[i] for i in top_n_idxs]
        keywords_list.append(keywords)
    return keywords_list

def create_tfidf_features_df(vectorizer, texts):
    tfidf_matrix = vectorizer.transform(texts)
    return pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

def compute_average_document_vectors(abstract_keywords, title_keywords, model):
    doc_vectors = []
    for abstract_kw, title_kw in zip(abstract_keywords, title_keywords):
        vec = np.zeros(100)  # ベクトルの初期化
        total_keywords = abstract_kw + title_kw  # abstractとtitleのキーワードを結合
        for word in total_keywords:
            if word in model.wv.key_to_index:
                vec += model.wv[word]  # 単語ベクトルを加算
        if len(total_keywords) > 0:  # ゼロ除算を避ける
            vec = vec / len(total_keywords)  # 平均ベクトルを計算
        doc_vectors.append(vec)
    return pd.DataFrame(doc_vectors)

In [None]:
# テキストのクリーニング
test_df['Title_clean'] = test_df['title'].apply(clean_text)
test_df['Abstract_clean'] = test_df['abstract'].fillna('').apply(clean_text)

# TF-IDFによるキーワード抽出
test_abstract2kw = extract_tfidf_keywords(abstract2kw_vectorizer, test_df['Abstract_clean'])
test_title2kw = extract_tfidf_keywords(title2kw_vectorizer, test_df['Title_clean'])

In [None]:

"""_summary_
文書（この場合は特定のテキストデータ、例えば学術論文の要旨やタイトルなど）の集合から、それぞれの文書を表すベクトルを計算しています。
このベクトルは、文書に含まれるキーワード（単語）のword2vecモデルによるベクトル表現の平均を使って表されます。
"""
av_docs_vectors = compute_average_document_vectors(test_abstract2kw, test_title2kw, word2vec_model)


In [None]:
# TF-IDF特徴量のDataFrameを生成
test_abstract2kw_features_df = create_tfidf_features_df(abstract2kw_vectorizer, test_df['Abstract_clean'])
test_title2kw_features_df = create_tfidf_features_df(title2kw_vectorizer, test_df['Title_clean'])

In [None]:
# 全てのdataframeを結合
test_all_features_df = pd.concat([av_docs_vectors, test_abstract2kw_features_df, test_title2kw_features_df], axis=1)


In [None]:
test_all_features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,imag,infect,pandem,patient,s,s diseas,sar,sar cov,studi,use
0,0.745421,-0.321927,-0.171053,-0.624917,-0.138151,0.374303,-0.178311,0.043034,0.183868,-0.345037,...,0.000000,0.000000,1.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
1,1.125790,-0.406097,-0.320660,-1.540032,-0.284056,0.372809,-0.082244,-0.346677,0.228191,-0.190758,...,0.000000,0.804634,0.0,0.593771,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.905572,-0.171291,-0.179164,-1.134469,0.031615,0.163562,-0.017225,0.010576,-0.005821,-0.208857,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.723309,-0.202652,-0.250788,-0.928157,0.057504,0.110192,0.035143,0.128007,0.073718,-0.145347,...,0.702299,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.903870,-0.040816,-0.220616,-0.893634,-0.357550,0.222926,-0.085565,0.163929,0.304022,-0.602310,...,0.000000,0.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40829,0.582124,-0.309731,-0.076677,-0.293893,-0.151358,0.419562,-0.312761,-0.012050,0.185578,-0.368718,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.000000
40830,0.886113,-0.022179,-0.406186,-0.779884,-0.176329,0.171071,-0.048075,0.216161,0.283598,-0.484630,...,0.000000,0.000000,0.0,0.415273,0.0,0.0,0.0,0.0,0.0,0.590885
40831,0.880154,-0.163144,-0.290660,-1.163871,-0.270422,0.024451,0.014113,-0.052489,0.091535,-0.244365,...,0.693962,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.720011
40832,1.125790,-0.406097,-0.320660,-1.540032,-0.284056,0.372809,-0.082244,-0.346677,0.228191,-0.190758,...,1.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000


In [None]:
# テストデータでの予測
y_pred = bst.predict(
    test_all_features_df, 
    num_iteration=bst.best_iteration,
    predict_disable_shape_check=True # lightGBMに使用されたデータと、test_all_features_dfのカラム数が違うのでエラーが出るため一旦無視する
)
# 予測結果を二値（0または1）に変換
y_pred_binary = np.where(y_pred > 0.5, 1, 0)

y_pred_binary.to_csv()

NameError: name 'bst' is not defined

In [None]:
# 学習データとテストデータのカラムセットを取得
train_columns = set(train_df.columns)
test_columns = set(test_all_features_df.columns)

# テストデータに不足しているカラムを見つける
missing_columns = train_columns - test_columns

# 不足しているカラムを表示
print("不足しているカラム:")
for column in missing_columns:
    print(column)

不足しているカラム:
id
Abstract_clean
Title_clean
judgement
abstract
title
