# Setup

In [None]:
# one of the NLU services
from ibm_watson import NaturalLanguageUnderstandingV1
# access control by IAM
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import pandas as pd

In [None]:
# Credentials for the NLU service instance.
# Replace the masked values below with your own API key and service URL
# (the two commented lines show the expected placeholders).
# multi_api_key = "[INSERT YOUR API KEY HERE]"
# multi_url = "[INSERT YOUR NLU URL HERE]"
multi_api_key = "*************************"
multi_url = "********************"

In [None]:
# Build the NLU client: wrap the API key in an IAM authenticator, create the
# client for the given API version date, then point it at the service URL.
multi_auth = IAMAuthenticator(multi_api_key)
multi_nlu = NaturalLanguageUnderstandingV1(
    version="2022-08-10", authenticator=multi_auth
)
multi_nlu.set_service_url(multi_url)

# Loading Training Data

In [None]:
# Load the training data. The CSV has no header row, so pandas numbers the
# columns 0, 1, ... (0: category, 1: text).
# NOTE: `sheet_name` is an option of `pd.read_excel`; `pd.read_csv` does not
# accept it and raises a TypeError, so it must not be passed here.
# train_df = pd.read_csv("./NLU_TrainData_ver1.csv", header=None)
train_df = pd.read_csv("学習データ.csv", header=None)

In [None]:
# Keep only the two columns used for training (0: category, 1: text).
# Take an explicit copy so that the in-place edits in the following cells
# (dropna(inplace=True), .loc assignment) operate on an independent frame
# instead of a selection derived from the original one.
train_df = train_df[[0, 1]].copy()

In [None]:
# Drop every row that contains a missing value (modifies train_df in place)
train_df.dropna(inplace=True)

In [None]:
# Truncate each text (column 1) to its first 100 characters, converting the
# value to str first; the column is overwritten with the truncated text.
train_df.loc[:, 1] = train_df[1].apply(lambda x: str(x)[:100])
# Display the resulting frame as the cell output
train_df

In [None]:
# Show how many training rows each category (column 0) has.
# The top-level pd.value_counts() helper is deprecated in recent pandas
# releases; the Series method gives the same result.
train_df[0].value_counts()

In [None]:
# Append a second copy of every row to the training data (for small datasets)
train_df = pd.concat([train_df] * 2)

In [None]:
# Convert the dataframe into a list of dicts, one per row:
# {"text": <column 1>, "labels": [<column 0>]}
training_data = [
    {"text": row[1], "labels": [row[0]]}
    for _, row in train_df.iterrows()
]

In [None]:
# Inspect the converted training data: a list of {"text": ..., "labels": [...]} dicts
training_data

In [None]:
# Write the training data to disk as a UTF-8 encoded JSON file
import json

training_data_filename = "training_data.json"

# ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes
with open(training_data_filename, mode="w", encoding="utf-8") as f:
    f.write(json.dumps(training_data, indent=4, ensure_ascii=False))

# Create Model


In [None]:
# Train a multi-label classifications model from the saved training file and
# print the service's response.
# The file is opened in binary mode so the UTF-8 bytes written earlier are
# uploaded unchanged. Text mode without an explicit encoding would decode them
# with the platform default, which can fail on or corrupt non-ASCII text.
with open(training_data_filename, "rb") as file:
    multi_label_model = multi_nlu.create_classifications_model(
        language="ja",
        training_data=file,
        training_parameters={"model_type": "multi_label"},
        training_data_content_type="application/json",
        # training_data_content_type='application/csv',
        name="MyMultiLabelClassificationsModel",
        model_version="1.0.1",
    ).get_result()
    print(json.dumps(multi_label_model, indent=4))

# Information about the created Model


In [None]:
# Fetch the metadata of a trained model by ID and print it as JSON.
# The model ID is hard-coded here; the commented line shows how to take it
# from the response of the create-model cell instead.
# multi_model_id = multi_label_model['model_id']
multi_model_id = "3abc43b1-cfd2-4436-84d7-d96c6c4c1586"
multi_model_to_view = multi_nlu.get_classifications_model(
    model_id=multi_model_id
).get_result()
print(json.dumps(multi_model_to_view, indent=4))

{
    "name": "MyMultiLabelClassificationsModel",
    "user_metadata": null,
    "language": "ja",
    "description": null,
    "model_version": "1.0.1",
    "version": "1.0.1",
    "workspace_id": null,
    "version_description": null,
    "status": "available",
    "notices": [],
    "model_id": "3abc43b1-cfd2-4436-84d7-d96c6c4c1586",
    "features": [
        "classifications"
    ],
    "created": "2024-02-27T05:52:59Z",
    "last_trained": "2024-02-27T05:52:59Z",
    "last_deployed": "2024-02-27T05:55:12Z"
}


In [None]:
# import the classes needed to request classifications
from ibm_watson.natural_language_understanding_v1 import (
    Features,
    ClassificationsOptions,
)

In [None]:
# import necessary functions for model evaluation
from sklearn.metrics import f1_score, accuracy_score

# Loading Test Data


In [None]:
# Load the test data. The CSV has no header row, so pandas numbers the
# columns 0, 1, ...
# NOTE: `sheet_name` is an option of `pd.read_excel`; `pd.read_csv` does not
# accept it and raises a TypeError, so it must not be passed here.
# test_df = pd.read_csv("./Tuningdata_11month_day17~30.csv", header=None)
test_df = pd.read_csv("./テストデータ.csv", header=None)

In [None]:
# Drop every row that contains a missing value (modifies test_df in place).
# NOTE(review): unlike the training data, the test frame is not first reduced
# to columns 0 and 1, so a missing value in any other column also drops the
# row — confirm this is intended.
test_df.dropna(inplace=True)

In [None]:
# Show how many test rows each category (column 0) has.
# The top-level pd.value_counts() helper is deprecated in recent pandas
# releases; the Series method gives the same result.
test_df[0].value_counts()

In [None]:
# Truncate each text (column 1) to its first 100 characters, converting the
# value to str first; the column is overwritten with the truncated text.
test_df.loc[:, 1] = test_df[1].apply(lambda x: str(x)[:100])
# Display the resulting frame as the cell output
test_df

In [None]:
# Send one analyze request per test row, passing the second column as the
# text, and collect the raw responses.
# from datetime import datetime
# s = datetime.now()
test_preds = [
    multi_nlu.analyze(
        text=row.iloc[1],
        features=Features(
            classifications=ClassificationsOptions(model=multi_model_id)
        ),
    ).get_result()
    for _, row in test_df.iterrows()
]
# print(datetime.now() - s)
# Show the first response as a quick check
test_preds[0]

# output example
# {'usage': {'text_units': 1, 'text_characters': 100, 'features': 1},
#  'language': 'ja',
#  'classifications': [{'confidence': 0.973921, 'class_name': 'カテゴリ１'},
#   {'confidence': 0.010586, 'class_name': 'カテゴリ２'},
#   {'confidence': 0.009018, 'class_name': 'カテゴリ３'}]}

In [None]:
# output the category corresponding to the prediction with the highest confidence
def max_confidence(preds_list):
    """Return the class name of the prediction with the highest confidence.

    Args:
        preds_list: Iterable of dicts, each holding a "confidence" number and
            a "class_name" string.

    Returns:
        The "class_name" of the first entry with the highest confidence, or
        an empty string when the input is empty or no confidence is above 0.
    """
    best = max(preds_list, key=lambda pred: pred["confidence"], default=None)
    # Only a strictly positive confidence counts as a prediction.
    if best is None or not best["confidence"] > 0:
        return ""
    return best["class_name"]

In [None]:
# output final results
def compute_final_predictions(model_preds):
    """Reduce raw model responses to one predicted label per sample.

    Args:
        model_preds: Iterable of response dicts; each must have a
            "classifications" entry holding the list of per-class
            predictions for that sample.

    Returns:
        A list with one single-element list per response, containing the
        class name that ``max_confidence`` selects from that response's
        classifications. No confidence threshold is applied here.
    """
    return [
        [max_confidence(response["classifications"])]
        for response in model_preds
    ]

In [None]:
# NOTE(review): `threshold` is not passed to compute_final_predictions and is
# not referenced by any other visible cell — confirm whether it is still needed.
threshold = 0
# Reduce the raw responses to one predicted label per test sample
final_preds = compute_final_predictions(test_preds)
# Display the final predictions
final_preds

In [None]:
# Correct labels of the test data (column 0), each converted to str and
# wrapped in a one-element list
test_labels = [[str(label)] for label in test_df[0]]
test_labels

# Model Evaluation

In [None]:
# Fit a binarizer that converts lists of labels into binary indicator rows
from sklearn.preprocessing import MultiLabelBinarizer

# Category labels the binarizer is fitted on
# (replace these with the category labels used in your own data)
label_names = [
    [
        "カテゴリ１",
        "カテゴリ２",
        "カテゴリ３",
        "カテゴリ４",
        "カテゴリ５",
        "カテゴリ６",
        "カテゴリ７",
        "カテゴリ８",
        "カテゴリ９",
    ]
]
# label_names is a list containing one list, i.e. a single sample carrying
# every label, which is the input shape fit() is given here.
MLB = MultiLabelBinarizer().fit(label_names)

In [None]:
# Binarize both the true labels and the model predictions with the fitted MLB
y_true = MLB.transform(test_labels)
y_pred = MLB.transform(final_preds)

In [None]:
# Accuracy of the binarized predictions against the binarized true labels
accuracy_score(y_true, y_pred)