In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn import cross_validation

import xgboost
import matplotlib.pyplot as plt

# Silence pandas' chained-assignment warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

# Single seed shared by the shuffle and the train/test split below.
seed = 101

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Load the challenge dataset; the first CSV column becomes the row index.
data = pd.read_csv('input/data_challenge.csv',index_col=0)

In [3]:
# Shuffle the rows with the fixed seed so the row order is reproducible,
# then preview the first few rows.
data = shuffle(data,random_state=seed)
data.head()

Unnamed: 0,DX,AGE,MONTH,SEX,Absent,Present
9697,c_0036,38.0,2,0,s_0553,"s_0542, s_0290, s_0837, s_0084, s_1266, s_0180"
31843,c_0499,55.0,8,0,,"s_1547, s_0653"
63932,c_0152,35.0,10,0,"s_0070, s_0653, s_0039, s_0078, s_2734, s_0837...","s_1611, s_0445, s_0542, s_0084, s_0315, s_0478..."
69507,c_0145,26.0,6,1,"s_0106, s_0542, s_0124, s_0084, s_0180","s_0136, s_0290, s_2734"
64367,c_0036,61.0,4,0,"s_0070, s_0180, s_0553, s_0039, s_1611, s_0136...","s_0227, s_0078, s_0786, s_0084, s_0356, s_0739"


In [4]:
# Keep only rows that list at least one present symptom, then summarise
# the remaining columns and their non-null counts.
data = data.dropna(subset=['Present'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66392 entries, 9697 to 45919
Data columns (total 6 columns):
DX         66392 non-null object
AGE        66392 non-null float64
MONTH      66392 non-null int64
SEX        66392 non-null int64
Absent     60523 non-null object
Present    66392 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 3.5+ MB


In [5]:
# Work on the raw NumPy array behind the frame from here on.
dataset = data.values

In [6]:
# Column 0 is the target; every remaining column is a feature.
Y, X = dataset[:, 0], dataset[:, 1:]

In [7]:
# Map the target labels to integer class ids; the fitted encoder is kept
# so the ids can be mapped back to labels later.
label_encoder = LabelEncoder().fit(Y)
label_encoded_y = label_encoder.transform(Y)

In [8]:
# Hold out 20% of the rows for evaluation; the split is seeded so it is
# reproducible. `model_selection` is the supported home of train_test_split
# (the old `cross_validation` module was deprecated and later removed).
test_size = 0.2
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, label_encoded_y, test_size=test_size, random_state=seed
)

In [9]:
# Standardise column 0 of the training features using statistics computed
# from the training split only; `mean` and `std` are reused for the test split.
mean = np.mean(x_train[:, 0])
std = np.std(x_train[:, 0])
x_train[:, 0] = (x_train[:, 0] - mean) / std

In [10]:
# Apply the training-split mean/std to column 0 of the test features.
x_test[:, 0] = (x_test[:, 0] - mean) / std

In [11]:
def cyclic(month):
    """Project a month number onto the unit circle.

    The month is treated as a position on a cycle of length 12, so values
    that are 12 apart map to the same point.

    Parameters
    ----------
    month : number
        Month value to encode.

    Returns
    -------
    tuple
        ``(sin(angle), cos(angle))`` where ``angle = 2 * pi * month / 12``.
    """
    month_circum = 12
    angle = 2 * np.pi * month / month_circum
    return np.sin(angle), np.cos(angle)

In [12]:
# Encode the month column (index 1) element-wise and turn each of the two
# resulting 1-D components into a column vector for concatenation.
cyclic_vec = np.vectorize(cyclic)
result = cyclic_vec(x_train[:, 1])
month_x, month_y = (component.reshape(-1, 1) for component in result)

In [13]:
# Append the two cyclic month columns to the right of the training features.
x_train = np.hstack((x_train, month_x, month_y))

In [14]:
# Drop the raw month column (index 1) now that its cyclic encoding is present,
# then display the resulting array.
x_train = np.delete(x_train, 1, axis=1)
x_train

array([[-1.4985396817813654, 1,
        's_0911, s_0553, s_0812, s_0136, s_0002, s_0542, s_0837, s_0864, s_0022, s_0085, s_1266, s_0180',
        's_0400, s_2742, s_0106, s_0084', 0.8660254037844388,
        -0.4999999999999998],
       [0.27927642334497416, 0, nan, 's_2742, s_0605, s_0553, s_0777',
        -0.4999999999999998, -0.8660254037844388],
       [-1.202236997593642, 1, 's_0824, s_0542, s_0363',
        's_0445, s_0180', -1.0, -1.8369701987210297e-16],
       ...,
       [0.03235751985520477, 0, 's_0106, s_0542',
        's_0445, s_0864, s_0084', 0.8660254037844388,
        -0.4999999999999998],
       [1.6126385021897287, 0, 's_2738, s_0605', 's_0824',
        -2.4492935982947064e-16, 1.0],
       [-1.3997721203854576, 1, 's_0553', 's_0270, s_0954, s_1759',
        0.49999999999999994, 0.8660254037844387]], dtype=object)

In [15]:
# After the month column was dropped, column 2 holds the absent-symptom
# strings and column 3 the present-symptom strings.
absent_symptoms, present_symptoms = x_train[:, 2], x_train[:, 3]

In [16]:
def tokenizer(arr):
    """Build a vocabulary of comma-separated tokens found in ``arr``.

    Parameters
    ----------
    arr : iterable
        Each element is either a comma-separated string of tokens, an empty
        string, or a missing value (anything ``pd.isnull`` reports as null).

    Returns
    -------
    dict
        Maps each distinct, whitespace-stripped token to a 1-based integer
        index assigned in order of first appearance.
    """
    token_index = {}
    for seq in arr:
        if seq == '' or pd.isnull(seq):
            continue
        for word in seq.split(','):
            word = word.strip()
            # Empty fragments (e.g. from "a,,b" or a trailing comma) are not tokens.
            if not word:
                continue
            # Dict membership is O(1); no separate list of seen words is needed.
            if word not in token_index:
                token_index[word] = len(token_index) + 1
    return token_index

In [17]:
# Build separate vocabularies for present and absent symptoms from the
# training split.
present_token, absent_token = (
    tokenizer(column) for column in (present_symptoms, absent_symptoms)
)

In [18]:
def one_hot(samples, token):
    """Multi-hot encode comma-separated token strings.

    Parameters
    ----------
    samples : sequence
        Each element is a comma-separated string of tokens, an empty string,
        or a missing value (anything ``pd.isnull`` reports as null).
    token : dict
        Maps token -> integer column index.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(samples), max(token.values()) + 1)`` with
        a 1 at ``[i, token[word]]`` for every known word in sample ``i``.
        Words missing from ``token`` are ignored. An empty ``token`` yields a
        single all-zero column instead of raising.
    """
    # `default=0` keeps an empty vocabulary from raising ValueError in max().
    n_columns = max(token.values(), default=0) + 1
    results = np.zeros(shape=(len(samples), n_columns))
    for i, sample in enumerate(samples):
        if sample == '' or pd.isnull(sample):
            continue
        for word in sample.split(','):
            index = token.get(word.strip())
            # Compare against None so a token mapped to column 0 is not dropped.
            if index is not None:
                results[i, index] = 1
    return results

In [19]:
# Multi-hot encode the present-symptom strings of the training split.
present_encoded = one_hot(samples=present_symptoms, token=present_token)

In [20]:
# Multi-hot encode the absent-symptom strings of the training split.
absent_encoded = one_hot(samples=absent_symptoms, token=absent_token)

In [21]:
# Inspect the dimensions of the absent-symptom encoding.
absent_encoded.shape

(53113, 571)

In [22]:
# Display the training features before the raw symptom columns are removed.
x_train

array([[-1.4985396817813654, 1,
        's_0911, s_0553, s_0812, s_0136, s_0002, s_0542, s_0837, s_0864, s_0022, s_0085, s_1266, s_0180',
        's_0400, s_2742, s_0106, s_0084', 0.8660254037844388,
        -0.4999999999999998],
       [0.27927642334497416, 0, nan, 's_2742, s_0605, s_0553, s_0777',
        -0.4999999999999998, -0.8660254037844388],
       [-1.202236997593642, 1, 's_0824, s_0542, s_0363',
        's_0445, s_0180', -1.0, -1.8369701987210297e-16],
       ...,
       [0.03235751985520477, 0, 's_0106, s_0542',
        's_0445, s_0864, s_0084', 0.8660254037844388,
        -0.4999999999999998],
       [1.6126385021897287, 0, 's_2738, s_0605', 's_0824',
        -2.4492935982947064e-16, 1.0],
       [-1.3997721203854576, 1, 's_0553', 's_0270, s_0954, s_1759',
        0.49999999999999994, 0.8660254037844387]], dtype=object)

In [23]:
# Remove the raw symptom string columns (indices 2 and 3); their encoded
# form is appended separately.
x_train = np.delete(x_train, [2, 3], axis=1)

In [24]:
# Append the present- and absent-symptom encodings as extra feature columns.
x_train = np.hstack((x_train, present_encoded, absent_encoded))

In [25]:
# Display the assembled training feature matrix.
x_train

array([[-1.4985396817813654, 1, 0.8660254037844388, ..., 0.0, 0.0, 0.0],
       [0.27927642334497416, 0, -0.4999999999999998, ..., 0.0, 0.0, 0.0],
       [-1.202236997593642, 1, -1.0, ..., 0.0, 0.0, 0.0],
       ...,
       [0.03235751985520477, 0, 0.8660254037844388, ..., 0.0, 0.0, 0.0],
       [1.6126385021897287, 0, -2.4492935982947064e-16, ..., 0.0, 0.0,
        0.0],
       [-1.3997721203854576, 1, 0.49999999999999994, ..., 0.0, 0.0, 0.0]],
      dtype=object)

In [28]:
# Fit an XGBoost classifier with default hyperparameters on the training data.
# NOTE(review): x_train is displayed above with dtype=object — consider casting
# it to a float array before fitting; confirm the installed xgboost accepts it.
model = xgboost.XGBClassifier()
model.fit(x_train, y_train)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [31]:
# Repeat the month handling on the test split: encode column 1 cyclically,
# append the two new columns, then drop the raw month column.
test_result = cyclic_vec(x_test[:, 1])
test_month_x, test_month_y = (
    component.reshape(-1, 1) for component in test_result
)
x_test = np.delete(
    np.concatenate((x_test, test_month_x, test_month_y), axis=1), 1, axis=1
)

In [32]:
# Display the test features after the month column has been re-encoded.
x_test

array([[0.5755791075326974, 0,
        's_0911, s_2197, s_0847, s_0553, s_0812, s_0865, s_1611, s_0136, s_0002, s_0242, s_1030, s_0542, s_0701, s_0084, s_0826, s_1547, s_0022, s_0085, s_0309, s_1266, s_0180',
        's_0106, s_0227, s_0400, s_0837, s_0356, s_0864',
        -2.4492935982947064e-16, 1.0],
       [-1.3503883396875038, 0, 's_0106, s_0837, s_0180',
        's_0553, s_0002, s_0246, s_0120, s_0542, s_0290, s_0084, s_2742, s_0022',
        -2.4492935982947064e-16, 1.0],
       [0.328660204042928, 0,
        's_0070, s_0812, s_1050, s_0506, s_0136, s_0242, s_0078, s_0837, s_0363, s_2253, s_0864, s_0180',
        's_0084', 0.8660254037844388, -0.4999999999999998],
       ...,
       [0.03235751985520477, 0, 's_0596',
        's_0553, s_2282, s_2755, s_0084, s_0022, s_0605, s_0824, s_0180',
        0.49999999999999994, 0.8660254037844387],
       [-0.856550532707965, 0,
        's_0070, s_1050, s_0400, s_0242, s_0002, s_0124, s_0542, s_0837, s_0084, s_0826, s_0309, s_1266, s_018

In [34]:
# Encode the test-split symptom strings with the vocabularies built from the
# training split, so the encoded columns line up with the training features.
test_absent_symptoms, test_present_symptoms = x_test[:, 2], x_test[:, 3]
test_present_encoded = one_hot(samples=test_present_symptoms, token=present_token)
test_absent_encoded = one_hot(samples=test_absent_symptoms, token=absent_token)

In [36]:
# Swap the raw symptom string columns (indices 2 and 3) for their encodings.
x_test = np.hstack(
    (np.delete(x_test, [2, 3], axis=1), test_present_encoded, test_absent_encoded)
)

In [39]:
# Inspect the training matrix dimensions (compared with x_test in the next cell).
x_train.shape

(53113, 1255)

In [40]:
# Inspect the test matrix dimensions; the column count should match x_train.
x_test.shape

(13279, 1255)

In [41]:
# Predict on the held-out split and round each prediction.
# NOTE(review): predict() presumably already returns integer class ids, which
# would make round() redundant — confirm before removing it.
y_pred = model.predict(x_test)
predictions = list(map(round, y_pred))

In [42]:
# Fraction of held-out rows whose predicted class matches the true class,
# reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Accuracy: 61.49%
