<a href="https://colab.research.google.com/github/zahrahrp/multilabel-class/blob/master/Ensemble_Learning_Class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Multilabel Classification of Doctor's Specialties on Medical Inquiry Documents**

## Import Module

In [None]:
!pip install --upgrade --no-cache-dir gdown
!pip install PySastrawi
!pip install xmltodict
import pandas as pd
import numpy as np
import xmltodict,json
import re

Collecting gdown
  Downloading gdown-4.5.3.tar.gz (14 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: gdown
  Building wheel for gdown (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-4.5.3-py3-none-any.whl size=14821 sha256=cb90ce79ff96ba7e6d0e5bb6f592f3da184eda2a11bc3afc9c0de22e37135aa3
  Stored in directory: /private/var/folders/hj/zwmhvfpx0nd1_rjl0lcpymjc0000gn/T/pip-ephem-wheel-cache-gkj82_td/wheels/2e/bf/38/838f7a301971b6fa2915069198ae7b48a21833d156ef170960
Successfully built gdown
Installing collected packages: gdown
Successfully installed gdown-4.5.3
Collecting PySastrawi
  Using cached PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
Installing collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


## Read dataset

### Helper Method

In [None]:
def open_xml_file(filename, encoding='utf-8'):
    """Parse an XML corpus file and return the content of its KORPUS/DOK node.

    Args:
        filename: Path of the XML file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        Whatever xmltodict produced for the ``DOK`` entry under the root
        ``KORPUS`` element (``None`` if ``KORPUS`` has no ``DOK`` child).

    Raises:
        AttributeError: If the parsed document has no ``KORPUS`` root entry.
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename, encoding=encoding) as xml_file:
        parsed = xmltodict.parse(xml_file.read())

    corpus = parsed.get('KORPUS')
    return corpus.get('DOK')

In [None]:
def convert_xml_to_json(data):
  """Round-trip `data` through a JSON string and return the decoded result.

  Serializing and re-parsing turns mapping subclasses into plain dicts and
  tuples into lists, leaving only built-in JSON-compatible types.
  """
  return json.loads(json.dumps(data))

In [None]:
def print_json(json_object):
    """Print `json_object` as JSON text indented by two spaces per level."""
    print(json.dumps(json_object, indent=2))

In [None]:
def preprocess(document):
    """Join, per sentence, the tokens that begin with an ASCII letter.

    Args:
        document: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A list with one space-joined string per sentence. Tokens that do not
        start with at least one ASCII letter are left out.
    """
    leading_letters = re.compile("[a-zA-Z]+")
    return [
        " ".join(word for word in sentence if leading_letters.match(word))
        for sentence in document
    ]

In [None]:
def make_dataframe(json_data):
  """Turn parsed corpus records into a DataFrame of text and label flags.

  Args:
    json_data: Iterable of record dicts. Each record needs an 'ID' entry and a
      'PERTANYAAN' dict with 'JUDUL' and 'ISI'. A record may carry a 'CLASS'
      dict whose 'LABEL' is a single label string or a list of label strings.
      A single record dict is treated as a one-element list, and None as an
      empty list.

  Returns:
    A DataFrame with columns ID, JUDUL, ISI. If at least one record has a
    'CLASS' key, the columns class1..class16 are added as 0/1 flags, one per
    known specialty label; records lacking 'CLASS' get missing values there.
    Empty input yields an empty frame with the three text columns.
  """
  # Column of each known specialty label; insertion order defines class1..class16.
  label_to_column = {
      'Kebidanan dan Kandungan': 'class1',
      'Penyakit Dalam': 'class2',
      'Kesehatan Anak': 'class3',
      'Kesehatan Kulit dan Kelamin': 'class4',
      'Kesehatan Gizi': 'class5',
      'Kesehatan Telinga, Hidung dan Tenggorokan (THT)': 'class6',
      'Gigi': 'class7',
      'Kesehatan Mata': 'class8',
      'Bedah': 'class9',
      'Kesehatan Jiwa': 'class10',
      'Ortopedi (Tulang)': 'class11',
      'Jantung dan Pembuluh Darah': 'class12',
      'Urologi': 'class13',
      'Saraf': 'class14',
      'Pulmonologi (Paru)': 'class15',
      'Umum': 'class16',
  }
  text_columns = ['ID', 'JUDUL', 'ISI']
  label_columns = list(label_to_column.values())

  if json_data is None:
    json_data = []
  elif isinstance(json_data, dict):
    # A lone record arrives as a dict instead of a list of dicts.
    json_data = [json_data]

  rows = []
  has_labels = False
  for json_obj in json_data:
    question = json_obj['PERTANYAAN']
    row = {'ID': json_obj['ID'], 'JUDUL': question['JUDUL'], 'ISI': question['ISI']}

    if 'CLASS' in json_obj:
      has_labels = True
      # An empty CLASS element is parsed as None; treat it as "no labels".
      labels = (json_obj['CLASS'] or {}).get('LABEL')
      if labels is None:
        labels = []
      elif isinstance(labels, str):
        labels = [labels]

      row.update(dict.fromkeys(label_columns, 0))
      for item in labels:
        # Unknown or non-string labels are ignored, as in the original chain.
        column = label_to_column.get(item) if isinstance(item, str) else None
        if column is not None:
          row[column] = 1

    rows.append(row)

  # The schema is decided from all records, not from the last one iterated.
  columns = text_columns + label_columns if has_labels else text_columns
  return pd.DataFrame(rows, columns=columns)

### Open dataset

##### Training Dataset

###### Machine Annotated Labeled Data

In [None]:
# Load the KORPUS/DOK records of the machine-annotated training corpus.
# NOTE(review): absolute, user-specific path; this cell only runs where the
# file exists at exactly this location.
data_file = open_xml_file('/Users/zahrahputri/Downloads/Dataset/machine_annotated_labeled_data_v1.xml')

In [None]:
# Round-trip the parsed records through JSON (see convert_xml_to_json).
data = convert_xml_to_json(data_file)

In [None]:
# Build the training dataframe (ID, JUDUL, ISI plus class1..class16 flags)
# and preview its first rows.
df_data = make_dataframe(data)
df_data.head()

Unnamed: 0,ID,JUDUL,ISI,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12,class13,class14,class15,class16
0,AD-10,gatal-gatal dan bintik merah di kelamin,dok saya cowok ko alat kelamin saya gatal ya ....,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
1,AD-15,sering buang air kecil,selamat siang dok . sudah beberapa minggu ini ...,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0
2,AD-28,penyebab nyeri dada,assalammu alaikum . salam sejahtera bagi kita ...,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1
3,AD-33,penanganan luka digigit anjing,"malam dok , tadi saya digigit anjing tetapi an...",0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,AD-36,insomnia karena banyak pikiran,"malam dok , sebelum nya knlin dok nma saya riw...",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [None]:
# Check for missing values: null count per column, then the frame's shape.
print(df_data.isnull().sum())
print(df_data.shape)

ID         0
JUDUL      0
ISI        4
class1     0
class2     0
class3     0
class4     0
class5     0
class6     0
class7     0
class8     0
class9     0
class10    0
class11    0
class12    0
class13    0
class14    0
class15    0
class16    0
dtype: int64
(11526, 19)


In [None]:
# Clean a copy so the cleaning steps below do not modify df_data itself.
df_data_clean = df_data.copy()

In [None]:
# Replace missing cells with empty strings, modifying df_data_clean in place.
df_data_clean.replace(np.nan, '', inplace=True)

In [None]:
# Re-check null counts and shape after the replacement.
print(df_data_clean.isnull().sum())
print(df_data_clean.shape)

ID         0
JUDUL      0
ISI        0
class1     0
class2     0
class3     0
class4     0
class5     0
class6     0
class7     0
class8     0
class9     0
class10    0
class11    0
class12    0
class13    0
class14    0
class15    0
class16    0
dtype: int64
(11526, 19)


###### Human annotated labeled data

In [None]:
# Load the KORPUS/DOK records of the human-annotated training corpus.
# NOTE(review): absolute, user-specific path; not portable to other machines.
data_file_human = open_xml_file('/Users/zahrahputri/Downloads/Dataset/human_annotated_labeled_data.xml')

In [None]:
# Round-trip the parsed records through JSON (see convert_xml_to_json).
data_human = convert_xml_to_json(data_file_human)

In [None]:
# Build the human-annotated dataframe and preview its first rows.
df_data_human = make_dataframe(data_human)
df_data_human.head()

Unnamed: 0,ID,JUDUL,ISI,class1,class2,class3,class4,class5,class6,class7,class8,class9,class10,class11,class12,class13,class14,class15,class16
0,DS-1,mengapa keringat badan sangat berlebihan ?,selamat malam dokter. ! saya fathurrosi umur 1...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,DS-23,suka menghayal dan berhalusinasi,"selamat siang dokter , saya sering kali berkha...",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,DS-87,feses warna kuning di sertai darah apakah ada ...,"selamat sore dokter , feses saya warna kuning ...",0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,DS-176,apakah setiap nyeri perut kanan bawah itu adal...,"selamat sore dokter , saya merasakan nyeri per...",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,DS-277,10 kebiasaan buruk yang dapat merusak otak,"saya perokok berat dok , mengenal rokok sejak ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [None]:
# Null count per column, then the frame's shape.
print(df_data_human.isnull().sum())
print(df_data_human.shape)

ID          0
JUDUL       0
ISI        28
class1      0
class2      0
class3      0
class4      0
class5      0
class6      0
class7      0
class8      0
class9      0
class10     0
class11     0
class12     0
class13     0
class14     0
class15     0
class16     0
dtype: int64
(1775, 19)


In [None]:
# Clean a copy so df_data_human itself is left unmodified.
df_data_clean_human = df_data_human.copy()

In [None]:
# Replace missing cells with empty strings, in place.
df_data_clean_human.replace(np.nan, '', inplace=True)

In [None]:
# Re-check null counts and shape after the replacement.
print(df_data_clean_human.isnull().sum())
print(df_data_clean_human.shape)

ID         0
JUDUL      0
ISI        0
class1     0
class2     0
class3     0
class4     0
class5     0
class6     0
class7     0
class8     0
class9     0
class10    0
class11    0
class12    0
class13    0
class14    0
class15    0
class16    0
dtype: int64
(1775, 19)


###### Concatenate the two dataframes

In [None]:
# Stack the machine- and human-annotated frames row-wise. ignore_index=True
# renumbers the rows 0..n-1; without it both inputs keep their own 0-based
# labels, so every label of the shorter frame would appear twice and
# label-based access (.loc) would become ambiguous.
df_vertical_stack = pd.concat([df_data_clean, df_data_clean_human], axis=0, ignore_index=True)

##### Testing Dataset

In [None]:
# Load the KORPUS/DOK records of the test corpus.
# NOTE(review): absolute, user-specific path; not portable to other machines.
testing_data_file = open_xml_file('/Users/zahrahputri/Downloads/Dataset/testing_data_v1.xml')

In [None]:
# Round-trip the parsed records through JSON (see convert_xml_to_json).
testing_data = convert_xml_to_json(testing_data_file)

In [None]:
# Build the test dataframe and preview it. The preview shows only ID, JUDUL
# and ISI, i.e. make_dataframe took its branch for records without 'CLASS'.
df_testing_data = make_dataframe(testing_data)
df_testing_data.head()

Unnamed: 0,ID,JUDUL,ISI
0,DS-45,kenapa telinga berdengung tidak bisa hilang ?,"selamat sore dokter , sudah 7 hari telinga say..."
1,DS-339,apabila anak anda lambat berbicara,"salam kenal team doktersehat , anak saya berum..."
2,AD-18,sakit perut setiap habis makan dan minum,"selamat malam dok , kurang lebih 2 minggu ini ..."
3,AD-66,cara mengobati vertigo berkepanjangan,selamat malam dok saya ingin brtnya seputar ve...
4,AD-73,imunisasi dapat dan polio,"malam dok , anak saya umur 5 tahun permpuan . ..."


In [None]:
# Load the gold-standard labels for the test questions.
# NOTE(review): absolute, user-specific path; not portable to other machines.
# NOTE(review): the loaded frame has an 'Unnamed: 0' column ahead of 'ID'.
# Later code picks the label columns by position (columns[4:] of the merged
# frame), so dropping that column here would shift that selection.
gold_standard_df = pd.read_csv("/Users/zahrahputri/Downloads/gold_standard.csv") 
# gold_standard_df = gold_standard_df.columns[1:]
gold_standard_df

Unnamed: 0.1,Unnamed: 0,ID,Kebidanan dan Kandungan,Penyakit Dalam,Kesehatan Anak,Kesehatan Kulit dan Kelamin,Kesehatan Gizi,"Kesehatan Telinga, Hidung dan Tenggorokan (THT)",Gigi,Kesehatan Mata,Bedah,Kesehatan Jiwa,Ortopedi (Tulang),Jantung dan Pembuluh Darah,Urologi,Saraf,Pulmonologi (Paru),Umum
0,0,KD-10780,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,1,KD-34954,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,2,KD-42528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,3,KD-25703,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,4,KD-11423,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2922,2922,KD-43404,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2923,2923,KD-43406,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2924,2924,KD-43411,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2925,2925,KD-43446,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Attach the gold-standard label columns to the test questions by 'ID'.
# pd.merge defaults to an inner join, so questions without a matching gold
# row are dropped from the result.
# NOTE(review): this overwrites df_testing_data, so re-running the cell merges
# the label columns onto an already merged frame.
df_testing_data = pd.merge(df_testing_data, gold_standard_df, on="ID")

In [None]:
# Preview the merged test frame (text columns followed by gold labels).
df_testing_data.head()

Unnamed: 0.1,ID,JUDUL,ISI,Unnamed: 0,Kebidanan dan Kandungan,Penyakit Dalam,Kesehatan Anak,Kesehatan Kulit dan Kelamin,Kesehatan Gizi,"Kesehatan Telinga, Hidung dan Tenggorokan (THT)",Gigi,Kesehatan Mata,Bedah,Kesehatan Jiwa,Ortopedi (Tulang),Jantung dan Pembuluh Darah,Urologi,Saraf,Pulmonologi (Paru),Umum
0,DS-45,kenapa telinga berdengung tidak bisa hilang ?,"selamat sore dokter , sudah 7 hari telinga say...",120,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,DS-339,apabila anak anda lambat berbicara,"salam kenal team doktersehat , anak saya berum...",40,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,AD-18,sakit perut setiap habis makan dan minum,"selamat malam dok , kurang lebih 2 minggu ini ...",500,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,AD-66,cara mengobati vertigo berkepanjangan,selamat malam dok saya ingin brtnya seputar ve...,501,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
4,AD-73,imunisasi dapat dan polio,"malam dok , anak saya umur 5 tahun permpuan . ...",502,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Null count per column, then the frame's shape.
print(df_testing_data.isnull().sum())
print(df_testing_data.shape)

ID                                                 0
JUDUL                                              0
ISI                                                6
Unnamed: 0                                         0
Kebidanan dan Kandungan                            0
Penyakit Dalam                                     0
Kesehatan Anak                                     0
Kesehatan Kulit dan Kelamin                        0
Kesehatan Gizi                                     0
Kesehatan Telinga, Hidung dan Tenggorokan (THT)    0
Gigi                                               0
Kesehatan Mata                                     0
Bedah                                              0
Kesehatan Jiwa                                     0
Ortopedi (Tulang)                                  0
Jantung dan Pembuluh Darah                         0
Urologi                                            0
Saraf                                              0
Pulmonologi (Paru)                            

In [None]:
# Clean a copy so df_testing_data itself is left unmodified.
df_testing_data_clean = df_testing_data.copy()

In [None]:
# Replace missing cells with empty strings, in place.
df_testing_data_clean.replace(np.nan, '', inplace=True)

In [None]:
# Re-check null counts and shape after the replacement.
print(df_testing_data_clean.isnull().sum())
print(df_testing_data_clean.shape)

ID                                                 0
JUDUL                                              0
ISI                                                0
Unnamed: 0                                         0
Kebidanan dan Kandungan                            0
Penyakit Dalam                                     0
Kesehatan Anak                                     0
Kesehatan Kulit dan Kelamin                        0
Kesehatan Gizi                                     0
Kesehatan Telinga, Hidung dan Tenggorokan (THT)    0
Gigi                                               0
Kesehatan Mata                                     0
Bedah                                              0
Kesehatan Jiwa                                     0
Ortopedi (Tulang)                                  0
Jantung dan Pembuluh Darah                         0
Urologi                                            0
Saraf                                              0
Pulmonologi (Paru)                            

In [None]:
# Preview the cleaned test frame.
df_testing_data_clean.head()

Unnamed: 0.1,ID,JUDUL,ISI,Unnamed: 0,Kebidanan dan Kandungan,Penyakit Dalam,Kesehatan Anak,Kesehatan Kulit dan Kelamin,Kesehatan Gizi,"Kesehatan Telinga, Hidung dan Tenggorokan (THT)",Gigi,Kesehatan Mata,Bedah,Kesehatan Jiwa,Ortopedi (Tulang),Jantung dan Pembuluh Darah,Urologi,Saraf,Pulmonologi (Paru),Umum
0,DS-45,kenapa telinga berdengung tidak bisa hilang ?,"selamat sore dokter , sudah 7 hari telinga say...",120,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,DS-339,apabila anak anda lambat berbicara,"salam kenal team doktersehat , anak saya berum...",40,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,AD-18,sakit perut setiap habis makan dan minum,"selamat malam dok , kurang lebih 2 minggu ini ...",500,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,AD-66,cara mengobati vertigo berkepanjangan,selamat malam dok saya ingin brtnya seputar ve...,501,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
4,AD-73,imunisasi dapat dan polio,"malam dok , anak saya umur 5 tahun permpuan . ...",502,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


### Text preprocessing

In [None]:
# Normalize a piece of text.

def normalize(text):
    """Lowercase `text`, drop digits and punctuation, and tidy whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased text without digits, with every character that is
        neither a word character nor whitespace replaced by a space, runs of
        whitespace collapsed to one space, and no leading/trailing whitespace.
    """
    res = text.lower()  # uppercase -> lowercase
    res = re.sub(r"\d", "", res)  # remove digits
    res = re.sub(r"[^\w\s]", " ", res)  # replace punctuation with a space
    # Collapse whitespace only after punctuation has been replaced, otherwise
    # the spaces inserted for punctuation would produce new runs of spaces.
    res = re.sub(r"\s+", " ", res)
    return res.strip()

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Start from the stop word list provided by Sastrawi.
stop_factory = StopWordRemoverFactory()
lst_stopword = stop_factory.get_stop_words()
# Extra words to drop for this corpus: ways of addressing the doctor,
# greetings, thanks, time words and request phrases, including common
# misspellings. ('salam' is listed twice; harmless for membership tests.)
lst_stopword2 = ['dok', 'dokter', 'doktr', 'dkter', 'selamat', 'met', 'selamet', 'slamat', 'pagi', 'pgi', 'siang',  'sore', 'malam', 'malem', 'mlam', 'mlem', 'malm',
                 'terima', 'makasih', 'terimakasih', 'kasih', 'halo', 'hai', 'tahun', 'usia', 'bulan', 'bln', 'minggu', 'yaa', 'thnks', 'thanks', 'jawabannya',
                 'jawaban', 'soreh', 'salam', 'nanya', 'sebelumnya', 'waktunya', 'waktu', 'misi', 'permisi', 'balasannya', 'ditunggu', 'saran', 'penjelasan',
                 'solusi', 'bantuan', 'bantuannya', 'penjelasannya', 'solusinya', 'mohon', 'sarannya', 'informasi', 'assalamualaikum', 'wassalam', 'wassalammualaikum', 
                 'wr', 'wb', 'salam', 'sejahtera', 'assalammu', 'alaikum', 'dear', 'syalom', 'shalom', 'kemarin', 'kemaren']
# Combined list used by remove_stopword.
lst_stopword = lst_stopword + lst_stopword2

# Remove stop words from a piece of text.
# Membership is checked against a set built once from lst_stopword; re-run
# this cell after changing lst_stopword so the set is rebuilt.
_stopword_set = frozenset(lst_stopword)

def remove_stopword(text):
    """Return `text` with every stop word token removed.

    Args:
        text: Whitespace-separated tokens as a single string.

    Returns:
        The remaining tokens joined by single spaces. Splitting on any run of
        whitespace means repeated spaces never produce empty tokens.
    """
    return " ".join(token for token in text.split() if token not in _stopword_set)

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create the Sastrawi stemmer instance that text_preprocessing calls.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

In [None]:
def text_preprocessing(text_list, column_name):
  """Normalize, stop-word-filter and stem one text column, in place.

  Args:
    text_list: DataFrame holding the text; its `column_name` column is
      overwritten with the processed text.
    column_name: Name of the column to process.

  Returns:
    None. `text_list` is modified in place.
  """
  def clean(text):
    # Missing cells (None/NaN) are processed as empty text.
    if pd.isna(text):
      text = ''
    return stemmer.stem(remove_stopword(normalize(text)))

  # Assigning a plain list writes by position, so the result is correct for
  # any index, not only a default 0..n-1 one, and avoids chained indexing.
  text_list[column_name] = [clean(text) for text in text_list[column_name]]

In [None]:
# Preprocess the title and body text of the machine-annotated training frame.
text_preprocessing(df_data_clean, 'JUDUL')
text_preprocessing(df_data_clean, 'ISI')

In [None]:
# Preprocess the title and body text of the test frame the same way.
text_preprocessing(df_testing_data_clean, 'JUDUL')
text_preprocessing(df_testing_data_clean, 'ISI')

In [None]:
# Inspect the text preprocessing result: title and body of the first five
# training rows.

for i in range(5):
    print(df_data_clean.iloc[i]['JUDUL'])
    print(df_data_clean.iloc[i]['ISI'])
    print("")

gatal gatal bintik merah kelamin
cowok ko alat kelamin gatal bintik merah pala bentol tadu si gatal paha jalar kelamin sakit obat

buang air
bolak buang air jam kamar mandi buang air periksa suruh tes urine glukosa hasil tes normal kendala infeksi mudah meriang infeksi ginjal uretra

sebab nyeri dada
anak laki laki umur konsultasi alami alami nyeri dada beda nyeri dada alami capai habis aktivitas nyeri nyeri dada alami nyeri belah kiri nyeri beda sakit jadi ulang selang alami nyeri dada belah kanan nyeri dada belah kanan alami

tangan luka gigit anjing
gigit anjing anjing vaksin akibat

insomnia pikir
knlin nma riwayat sinaga mngalami insomnia tahun llu smnjak keprgian brsamaan kjadian bsar mndera pkerjaan llu kartu keluarga msuk bui kasih ptus tnp alas jls



In [None]:
# Inspect the text preprocessing result: title and body of the first five
# test rows.

for i in range(5):
    print(df_testing_data_clean.iloc[i]['JUDUL'])
    print(df_testing_data_clean.iloc[i]['ISI'])
    print("")

telinga dengung hilang
telinga dengung dokter tht gendang telinga lubang kasih obat tetes ofloxacin dengung hilang dwi

anak lambat bicara
kenal team doktersehat anak umur bicara maem minum bilang maem binatang bilang au paham perintah laku masuk bunda rien kenal bunda rien skala dayley anak sama subyek predikat obyek contoh papa pergi denver ii anak sebut gambar kombinasi tubuh anak bicara lambat bicara laku temu spesialis anak periksa anak tangan speech therapy aju moga bantu mira della masali ked team doktersehat com

sakit perut habis makan minum
bangun minum air cc perut mulas makan nasi sendok minum obat darah tinggi resep beliau tidur bangun jam bangun tidur berani minum air minum air perut mulas buang air bentuk fasesnya encer badan gemetar tangan semut makan roti lembar minum air kondisi lapar badan gemetar makan badan semut jam beliau makan nasi sendok syaraf makan kenyang kenyang alami keluh gelisah badan gemetar kunjung umum spesialis syaraf hasil tegang milik tekan darah t

## Classification

#### Helper

In [None]:
from sklearn.metrics import accuracy_score,hamming_loss, f1_score, recall_score, precision_score, classification_report

# Gold label matrix for the test set: every column from position 4 onward.
# NOTE(review): positional selection; it relies on the merged frame starting
# with exactly ID, JUDUL, ISI, 'Unnamed: 0' before the label columns.
test_label = np.asarray(df_testing_data_clean[df_testing_data_clean.columns[4:]])

def evaluate_classifier_performance(predict_result, test_label=test_label):
  """Print accuracy, Hamming loss and macro/micro F1, recall and precision.

  Args:
    predict_result: Predicted label matrix to score.
    test_label: Reference label matrix. The default is the module-level
      `test_label` as it was when this function was defined.

  Returns:
    None. Each metric is printed on its own line.
  """
  reports = (
      ('Accuracy:', accuracy_score, {}),
      ('Hamming Loss:', hamming_loss, {}),
      ('F1 macro:', f1_score, {'average': 'macro'}),
      ('F1 micro:', f1_score, {'average': 'micro'}),
      ('Recall macro:', recall_score, {'average': 'macro'}),
      ('Recall micro:', recall_score, {'average': 'micro'}),
      ('Precision macro:', precision_score, {'average': 'macro'}),
      ('Precision micro:', precision_score, {'average': 'micro'}),
  )
  for caption, scorer, options in reports:
    print(caption, scorer(test_label, predict_result, **options))

In [None]:
def convert_to_csv(predict_df, pred_res, df_data):
  """Write 16 prediction columns and a 'JUDUL' column into `predict_df`.

  Args:
    predict_df: DataFrame to fill; it is modified in place and returned.
    pred_res: Sequence of per-row predictions, each indexable at 0..15.
    df_data: DataFrame with an 'ID' column, looked up by the index labels of
      `predict_df`.

  Returns:
    The same `predict_df` object with columns class1..class16 and 'JUDUL'.
  """
  class_columns = ('class1', 'class2', 'class3', 'class4', 'class5', 'class6', 'class7', 'class8',
                   'class9', 'class10', 'class11', 'class12', 'class13', 'class14', 'class15', 'class16')
  for position, column in enumerate(class_columns):
    predict_df[column] = [row[position] for row in pred_res]

  # NOTE(review): the 'JUDUL' column receives values from df_data's 'ID'
  # column; confirm whether the target column was meant to be 'ID'.
  predict_df['JUDUL'] = [(df_data.loc[[label]]['ID'].values)[0] for label in predict_df.index]
  return predict_df

#### Ensemble Learning 

##### Random Forest

###### Berdasarkan Judul Machine Annotated

In [None]:
# Title-only experiment: training text, training labels (every column after
# ID, JUDUL, ISI) and test text.
X_j = df_data_clean['JUDUL'] 
y_j = np.asarray(df_data_clean[df_data_clean.columns[3:]])
test_j = df_testing_data_clean['JUDUL'] 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: the vocabulary is fitted on the training titles only and
# reused to transform the test titles. Both results are converted to dense
# arrays.
vectorizer_j = TfidfVectorizer()

train_vector_j = vectorizer_j.fit_transform(X_j).toarray()
test_vector_j = vectorizer_j.transform(test_j).toarray()

In [None]:
# Build and fit the classification model (it is evaluated in a later cell).
from sklearn.ensemble import RandomForestClassifier

# Random forest with 250 trees and a fixed seed, fitted on the title features.
rfc_j = RandomForestClassifier(n_estimators=250, random_state=2022)
rfc_j.fit(train_vector_j,y_j)


In [None]:
# Predict the label matrix for the test titles.
pred_res_j = rfc_j.predict(test_vector_j)

In [None]:
# Print the metrics of the title-only random forest against the gold labels.
print("Berdasarkan Judul Machine Annotated")
evaluate_classifier_performance(pred_res_j)

Berdasarkan Judul Machine Annotated
Accuracy: 0.4670310898530919
Hamming Loss: 0.059062179706183805
F1 macro: 0.6412557136042739
F1 micro: 0.657164105106594
Recall macro: 0.5817162190637848
Recall micro: 0.5881961393388063
Precision macro: 0.7346410400036132
Precision micro: 0.7444538051109238


###### Berdasarkan Isi Machine Annotated

In [None]:
# Body-only experiment: training text, training labels (every column after
# ID, JUDUL, ISI) and test text.
X_i = df_data_clean['ISI']
y_i = np.asarray(df_data_clean[df_data_clean.columns[3:]])
test_i = df_testing_data_clean['ISI']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features fitted on the training bodies and reused for the test
# bodies; both results are converted to dense arrays.
vectorizer_i = TfidfVectorizer()

train_vector_i = vectorizer_i.fit_transform(X_i).toarray()
test_vector_i = vectorizer_i.transform(test_i).toarray()

In [None]:
# Build and fit the classification model (it is evaluated in a later cell).
from sklearn.ensemble import RandomForestClassifier

# Random forest with 250 trees and a fixed seed, fitted on the body features.
rfc_i = RandomForestClassifier(n_estimators=250, random_state=2022)
rfc_i.fit(train_vector_i,y_i)


In [None]:
# Predict the label matrix for the test bodies.
pred_res_i = rfc_i.predict(test_vector_i)

In [None]:
# Print the metrics of the body-only random forest against the gold labels.
print("Berdasarkan Isi Machine Annotated")
evaluate_classifier_performance(pred_res_i)

Berdasarkan Isi Machine Annotated
Accuracy: 0.3652203621455415
Hamming Loss: 0.06375982234369662
F1 macro: 0.5080676942166986
F1 micro: 0.5626830697129466
Recall macro: 0.3906163815286817
Recall micro: 0.4262258708675394
Precision macro: 0.823899974825145
Precision micro: 0.8276604911676002


###### Berdasarkan Judul dan Isi Machine Annotated

In [None]:
# Title + body experiment: each document is the title and body joined by a
# single space; labels are every column after ID, JUDUL, ISI.
X_ji = df_data_clean['JUDUL'] + ' ' + df_data_clean['ISI']
y_ji = np.asarray(df_data_clean[df_data_clean.columns[3:]])
test_ji = df_testing_data_clean['JUDUL'] + ' ' + df_testing_data_clean['ISI']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features fitted on the combined training text and reused for the
# combined test text; both results are converted to dense arrays.
vectorizer_ji = TfidfVectorizer()

train_vector_ji = vectorizer_ji.fit_transform(X_ji).toarray()
test_vector_ji = vectorizer_ji.transform(test_ji).toarray()

In [None]:
# Build and fit the classification model (it is evaluated in a later cell).
from sklearn.ensemble import RandomForestClassifier

# Random forest with 250 trees and a fixed seed, fitted on title + body features.
rfc_ji = RandomForestClassifier(n_estimators=250, random_state=2022)
rfc_ji.fit(train_vector_ji,y_ji)


In [None]:
# Predict the label matrix for the combined test text.
pred_res_ji = rfc_ji.predict(test_vector_ji)

In [None]:
# Print the metrics of the title + body random forest against the gold labels.
print("Berdasarkan Judul dan Isi Machine Annotated")
evaluate_classifier_performance(pred_res_ji)

Berdasarkan Judul dan Isi Machine Annotated
Accuracy: 0.38503587290741376
Hamming Loss: 0.061154765971984965
F1 macro: 0.540112304533507
F1 micro: 0.5889781859931114
Recall macro: 0.42113618329480806
Recall micro: 0.45529176836032836
Precision macro: 0.8377886996912831
Precision micro: 0.8338073953677367


###### Berdasarkan Judul dan n-estimator lainnya

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same title features and seed as rfc_j, but with 100 trees.
rfc_j_2 = RandomForestClassifier(n_estimators=100, random_state=2022)
rfc_j_2.fit(train_vector_j,y_j)

In [None]:
# Predict the test labels with the 100-tree model.
pred_res_j_2 = rfc_j_2.predict(test_vector_j)

In [None]:
# Print the metrics of the 100-tree title model.
evaluate_classifier_performance(pred_res_j_2)

Accuracy: 0.4677143833276392
Hamming Loss: 0.05914759139050222
F1 macro: 0.6421238691711775
F1 micro: 0.6564128007938477
Recall macro: 0.5812812672845298
Recall micro: 0.5870867539383182
Precision macro: 0.7367498283753289
Precision micro: 0.7443037974683544


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same title features and seed as rfc_j, but with 400 trees.
rfc_j_3 = RandomForestClassifier(n_estimators=400, random_state=2022)
rfc_j_3.fit(train_vector_j,y_j)

In [None]:
# Predict the test labels with the 400-tree model.
pred_res_j_3 = rfc_j_3.predict(test_vector_j)

In [None]:
# Print the metrics of the 400-tree title model.
evaluate_classifier_performance(pred_res_j_3)

Accuracy: 0.4683976768021865
Hamming Loss: 0.05891270925862658
F1 macro: 0.6416082083168464
F1 micro: 0.658158840292405
Recall macro: 0.5820785478117922
Recall micro: 0.5893055247392944
Precision macro: 0.7354481572960109
Precision micro: 0.745230078563412


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same title features and seed as rfc_j, but with 500 trees.
rfc_j_4 = RandomForestClassifier(n_estimators=500, random_state=2022)
rfc_j_4.fit(train_vector_j,y_j)

In [None]:
# Predict the test labels with the 500-tree model.
pred_res_j_4 = rfc_j_4.predict(test_vector_j)

In [None]:
# Print the metrics of the 500-tree title model.
evaluate_classifier_performance(pred_res_j_4)

Accuracy: 0.46908097027673384
Hamming Loss: 0.05880594465322856
F1 macro: 0.6420096824680224
F1 micro: 0.6583974199950384
Recall macro: 0.5820378156006043
Recall micro: 0.5888617705790992
Precision macro: 0.7365649055170066
Precision micro: 0.7465541490857946


##### Gradient Boosting

In [None]:
# Gradient boosting, title-only: training text, training labels (every column
# after ID, JUDUL, ISI) and test text.
X_j_gb = df_data_clean['JUDUL'] 
y_j_gb = np.asarray(df_data_clean[df_data_clean.columns[3:]])
test_j_gb = df_testing_data_clean['JUDUL'] 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features fitted on the training titles and reused for the test
# titles; both results are converted to dense arrays.
vectorizer_j_gb = TfidfVectorizer()

train_vector_j_gb = vectorizer_j_gb.fit_transform(X_j_gb).toarray()
test_vector_j_gb = vectorizer_j_gb.transform(test_j_gb).toarray()

In [None]:
# Build and fit the classification model (it is evaluated in a later cell).
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import ClassifierChain

# Gradient boosting (250 estimators, fixed seed) wrapped in a ClassifierChain,
# which is what gets fitted on the multi-column label matrix.
gbc_j_gb = GradientBoostingClassifier(n_estimators=250, random_state=2022)
chain_gbc_j_gb = ClassifierChain(gbc_j_gb)
chain_gbc_j_gb.fit(train_vector_j_gb,y_j_gb)


In [None]:
# Predict the label matrix for the test titles with the chained model.
pred_res_j_gb = chain_gbc_j_gb.predict(test_vector_j_gb)

In [None]:
# Print the metrics of the gradient boosting chain against the gold labels.
print("Berdasarkan Judul Machine Annotated")
evaluate_classifier_performance(pred_res_j_gb)

Berdasarkan Judul Machine Annotated
Accuracy: 0.44994875298940895
Hamming Loss: 0.05995900239152716
F1 macro: 0.638630009260186
F1 micro: 0.6445569620253165
Recall macro: 0.5827660422188881
Recall micro: 0.5648990459285556
Precision macro: 0.7352831740533567
Precision micro: 0.7503684055408193


##### AdaBoost

In [None]:
# AdaBoost, title-only: training text, training labels (every column after
# ID, JUDUL, ISI) and test text.
X_j_ab = df_data_clean['JUDUL'] 
y_j_ab = np.asarray(df_data_clean[df_data_clean.columns[3:]])
test_j_ab = df_testing_data_clean['JUDUL'] 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features fitted on the training titles and reused for the test
# titles; both results are converted to dense arrays.
vectorizer_j_ab = TfidfVectorizer()

train_vector_j_ab = vectorizer_j_ab.fit_transform(X_j_ab).toarray()
test_vector_j_ab = vectorizer_j_ab.transform(test_j_ab).toarray()

In [None]:
# Build and fit the classification model (it is evaluated in a later cell).
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multioutput import ClassifierChain

# AdaBoost (250 estimators, fixed seed) wrapped in a ClassifierChain, which is
# what gets fitted on the multi-column label matrix.
abc_j_ab = AdaBoostClassifier(n_estimators=250, random_state=2022)
chain_abc_j_ab = ClassifierChain(abc_j_ab)
chain_abc_j_ab.fit(train_vector_j_ab,y_j_ab)


In [None]:
# Predict the label matrix for the test titles with the chained model.
pred_res_j_ab = chain_abc_j_ab.predict(test_vector_j_ab)

In [None]:
# Print the metrics of the AdaBoost chain against the gold labels.
print("Berdasarkan Judul Machine Annotated")
evaluate_classifier_performance(pred_res_j_ab)

Berdasarkan Judul Machine Annotated
Accuracy: 0.45097369320122993
Hamming Loss: 0.06341817560642296
F1 macro: 0.6404885059170136
F1 micro: 0.6497641509433962
Recall macro: 0.6196060843859851
Recall micro: 0.6112713556689594
Precision macro: 0.6737438121952758
Precision micro: 0.6934306569343066
