In [1]:
import pandas as pd
import numpy as np
import DataLoader
import utility
from sklearn.model_selection import train_test_split
from collections import Counter
import random

{'Dermatology': 'Deri ve Zührevi Hastalıkları (Cildiye)', 'Internal Medicine': 'İç Hastalıkları (Dahiliye)', 'Neurology': 'Nöroloji', 'Obstetrics & Gynecology': 'Kadın Hastalıkları ve Doğum', 'Ophthalmology': 'Göz Hastalıkları', 'Orthopaedic Surgery': 'Ortopedi ve Travmatoloji', 'Otolaryngology': 'Kulak Burun Boğaz Hastalıkları', 'Pediatrics': 'Çocuk Sağlığı ve Hastalıkları', 'Psychiatry': 'Ruh Sağlığı ve Hastalıkları', 'Radiology-Diagnostic': 'Radyoloji', 'Surgery-General': 'Genel Cerrahi', 'Urology': 'Üroloji'}


In [2]:
# Label encoding used for the iCliniq main categories: every specialty name is
# mapped to an integer class id. Ids are assigned by position in the list below,
# starting at 0 ('Dermatology' -> 0, ..., 'Urology' -> 11).
class_dict = {
    specialty: class_id
    for class_id, specialty in enumerate([
        'Dermatology',
        'Internal Medicine',
        'Neurology',
        'Obstetrics & Gynecology',
        'Ophthalmology',
        'Orthopaedic Surgery',
        'Otolaryngology',
        'Pediatrics',
        'Psychiatry',
        'Radiology-Diagnostic',
        'Surgery-General',
        'Urology',
    ])
}

In [3]:
def preprocess(data):
    """Strip paragraph and bold HTML tags from a question string.

    This is a light cleanup applied in addition to masterPreprocessor. The
    literal substrings ``<p>``, ``</p>``, ``<strong>`` and ``</strong>`` are
    removed one after another, in that order; all other text is left as is.

    Args:
        data: The text to clean (anything exposing ``str.replace`` semantics).

    Returns:
        The text with the four tags removed.
    """
    for html_tag in ("<p>", "</p>", "<strong>", "</strong>"):
        data = data.replace(html_tag, "")
    return data

In [5]:
# Load the raw iCliniq question/category table.
data_df = pd.read_csv("data//icliniq//iCliniq_14K//data.csv", encoding="utf8")

# Remove bad rows. A row is kept only when its "category2" value
#   * is a string -- pandas stores NaN (a float) for an empty cell, and a
#     substring test on such a value would raise TypeError,
#   * is not the literal text "category2" (presumably a header line repeated
#     inside the file -- verify against the data), and
#   * does not contain the unwanted character sequence "â\x80".
keep_row = [
    isinstance(item, str) and item != "category2" and "â\x80" not in item
    for item in data_df["category2"]
]
# Build a new frame instead of dropping in place, and renumber the rows 0..n-1
# so later positional access lines up with the index.
data_df = data_df.loc[keep_row].reset_index(drop=True)

# Read the class-matching table.
# iCliniq has a lot of fine-grained classes; "iCliniq_class_match.xlsx" maps
# each of them (column "from") to a more general main category (column "to").
match = pd.read_excel("data//icliniq//iCliniq_class_match.xlsx")
match_dict = dict(zip(match["from"], match["to"]))

In [6]:
# Map every question's fine-grained category onto its main class via match_dict.
# Questions whose category has no entry in match_dict are discarded.
matched_questions, matched_labels = [], []
for question_text, raw_category in zip(data_df["question"], data_df["category2"]):
    # Lower-case the category and drop its final character before the lookup
    # (presumably a trailing separator in the raw data -- verify against the file).
    lookup_key = raw_category.lower()[:-1]
    if lookup_key not in match_dict:
        continue
    matched_questions.append(question_text)
    matched_labels.append(match_dict[lookup_key])

target = np.array(matched_labels)
# Convert to an array first, then strip the HTML tags from every question.
data = np.array([preprocess(text) for text in np.array(matched_questions)])
del matched_questions, matched_labels

# Final layout: one row per sample, column 0 = question text, column 1 = main class.
data = np.hstack((data[:, np.newaxis], target[:, np.newaxis]))

In [7]:
# Keep independent copies of the question text (column 0) and the class name
# (column 1) before `data` and `target` are reassigned by masterPreprocessor.
raw_data, raw_target = (data[:, column].copy() for column in (0, 1))

In [None]:
# Run the project's masterPreprocessor over the [question, class name] array.
# class_dict is passed as classDict (presumably used to turn class names into
# their integer ids -- confirm against DataLoader); the third return value is
# not needed here and is discarded.
# NOTE(review): this cell overwrites `data`, so re-running it alone would feed
# already-processed data back in -- re-run from the previous cells instead.
data, target, _ = DataLoader.DataHandler.masterPreprocessor(
    data,
    shuffle=False,
    classDict=class_dict,
    maxLength=128,
)

Outputs converted to numerical forms


In [None]:
# Shuffle the data so it is no longer ordered. The four sequences are zipped
# together first, so one shared permutation is applied and every processed
# sample stays aligned with its target and with its raw text / raw class.
#
# A dedicated, seeded generator is used so that the train/test split written to
# disk further down is the same on every run; change SHUFFLE_SEED to draw a
# different split. The global `random` state is left untouched.
SHUFFLE_SEED = 42
all_data = list(zip(data, target, raw_data, raw_target))
random.Random(SHUFFLE_SEED).shuffle(all_data)
# Unzip back into four parallel tuples.
data, target, raw_data, raw_target = zip(*all_data)

In [None]:
# Work out and display the split sizes: the test set gets one fifth of the
# samples (rounded down, ~0.2) and the training set gets the remainder (~0.8).
n_samples = len(data)
test_size = n_samples // 5
train_size = n_samples - test_size
print(test_size)
print(train_size)

In [None]:
# Split 0.8 / 0.2: the first train_size shuffled samples form the training set,
# everything after them forms the test set.
train_data, test_data = data[:train_size], data[train_size:]
train_target, test_target = target[:train_size], target[train_size:]

In [None]:
# Save the four splits with np.save; this data is ready for the neural network.
for out_path, split in (
    ("data//icliniq//iCliniq_14K//icliniq_14k_train_questions", train_data),
    ("data//icliniq//iCliniq_14K//icliniq_14k_train_target", train_target),
    ("data//icliniq//iCliniq_14K//icliniq_14k_test_questions", test_data),
    ("data//icliniq//iCliniq_14K//icliniq_14k_test_target", test_target),
):
    np.save(out_path, split)