In [10]:
import pandas as pd
import numpy as np
import time
import datetime

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import sklearn as sk
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix
import scipy as sp
import re
import gensim
import logging
import seaborn as sns
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%matplotlib inline
#import xgboost
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import time
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords

train = pd.read_csv("data/features/features.csv",
                    header=0,delimiter=",",quotechar='"',error_bad_lines=False)

print("unfiltered data, num samples: %i with num features: %i" %(train.shape[0],train.shape[1]))

train.fillna(' ',inplace=True)
train.dropna(inplace=True)
print("filtered data with samples: %i" %(train.shape[0]))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
unfiltered data, num samples: 45885 with num features: 22
filtered data with samples: 45885


In [39]:
list(train)

['num',
 'record-number',
 'filename',
 'pdfismissing',
 'author',
 'fulltitle',
 'subtitle',
 'pages',
 'volume',
 'number',
 'keywords',
 'searchquery_terms',
 'classifications',
 'cancer_types',
 'label_top_level',
 'useful',
 'review_article',
 'not_allowed_in_germany',
 'year',
 'abstract',
 'pub-location',
 'publisher']

In [40]:
def prep_input(train):  
    #y labels
    useful = train["useful"]
    cancer_types_raw = train["cancer_types"]
    cancer_types_raw = [re.sub(r'[^a-z, ]', '', s.lower()) for s in cancer_types_raw.values]
    cancer_types_raw = np.array([s.split(",") for s in cancer_types_raw])

    #features
    abstracts = train['abstract'].values.astype(dtype=str)
    title = train['fulltitle'].values.astype(dtype=str)
    journal = train['subtitle'].values.astype(dtype=str)
    search = train['searchquery_terms'].values.astype(dtype=str)
    print(type(search))

    #only useful examples have a cancer type
    idxthere = np.nonzero(train['useful'] == 1)[0]
    title = title[idxthere]
    journal = journal[idxthere]
    abstracts = abstracts[idxthere]
    useful = useful[idxthere]
    search = search[idxthere]

    cancer_types_raw = cancer_types_raw[idxthere]

    #prepare Y
    mlb = MultiLabelBinarizer()
    cancer_types = mlb.fit_transform(cancer_types_raw)


    #preprocessing
    #one hot encoded journal
    le = sk.preprocessing.LabelEncoder()
    enc = sk.preprocessing.OneHotEncoder()
    le_journal = le.fit_transform(journal)

    le_journal = le_journal.reshape(le_journal.shape[0], 1)
    hot_journal = enc.fit_transform(le_journal) 
    #only keep journals with more than 100 occurences
    journal_count = np.sum(hot_journal, axis=0)
    idx_imp_journals = np.nonzero(journal_count > 20)[0]
    #print(idx_imp_journals.shape)
    imp_journals = hot_journal[:,idx_imp_journals]

    start = time.time()
    #bag of words
    reRemoved = [re.sub(r'[^a-z ]', '', s.lower()) for s in abstracts]
    vect = CountVectorizer(max_df=0.6,min_df=10,stop_words=stopwords.words("english"))
    bow_text = vect.fit_transform(reRemoved)

    reRemoved = [re.sub(r'[^a-z ]', '', s.lower()) for s in title]
    bow_title = vect.fit_transform(reRemoved)

    reRemoved = [re.sub(r'[^a-z ]', '', s.lower()) for s in search]
    bow_search = vect.fit_transform(reRemoved)

    features = sp.sparse.hstack((bow_title, bow_text, imp_journals, bow_search))
    return(features,cancer_types)

In [41]:
features, cancer_types = prep_input(train)
print(features.shape)
print(cancer_types.shape)

<class 'numpy.ndarray'>
(21088, 12605)
(21088, 87)


In [42]:
np.sum(cancer_types, axis = 0)

array([   4,   10,    3,   11,  145,   12,   27,    5,    3, 3384,    1,
          1,   66,   78,  584,  376,  148,  191,  156,  943,   75,  662,
          2,    2,  184,    2,   31, 1634,  848,    6, 1005,  209,    1,
       2613,   31, 1281, 1833,    1,    9,  688,   35,    1,  133,  276,
         15,   49,   70,   74,    7,    2,  116,   19,  750,    1, 1358,
       1023,    6,   14,  940, 1132,   67,   69,    6,   34,   41, 1127,
          1, 2213,    1,    1,    1,   74,   46,  585,  157,   52,    3,
          2,    1,   50,    8,    1,   68,  118,    1,  879,    3])

In [43]:
import xgboost as xgb
# read in data  objective='multi:softmax'
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

X_train, X_test, y_train, y_test = train_test_split(features, cancer_types, test_size=0.2, random_state=42)
model = OneVsRestClassifier(xgb.XGBClassifier(max_depth=4,n_estimators=200))
model.fit(X_train, y_train)


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          n_jobs=1)

In [44]:
#preds = model.predict_proba(X_test)[:,1]
predicted = model.predict(X_test)

sk.metrics.accuracy_score(y_test, predicted)
#print(preds[:10])
#print(preds.shape)
#print(y_test.shape)
#np.mean(np.power(preds - y_test,2))
#sk.metrics.roc_auc_score(y_test,preds)

0.85182550972024651

In [152]:
pred_labels = mlb.inverse_transform(predicted)
true_labels = mlb.inverse_transform(y_test)
#print(true_labels)
for i in range(0,100):
    print(pred_labels[i])
    print(true_labels[i])
    print("################")


('lunge', 'sclc')
('lunge', 'sclc')
################
('ovar',)
('ovar',)
################
('prostata',)
('prostata',)
################
()
('magen',)
################
('mesotheliom',)
('mesotheliom',)
################
('niere',)
('niere',)
################
()
('tube',)
################
('lunge', 'nsclc')
('lunge', 'nsclc')
################
('prostata',)
('prostata',)
################
('oesophagus',)
('oesophagus',)
################
('krk',)
('krk',)
################
('harn', 'prostata')
('prostata',)
################
('gist',)
('gist',)
################
('prostata',)
('prostata',)
################
('ovar',)
('ovar',)
################
('kopf', 'pharynxlarynx')
('kopf', 'pharynxlarynx')
################
('ovar',)
('ovar',)
################
('kopf', 'pharynxlarynx')
('kopf', 'pharynxlarynx')
################
('pankreas',)
('pankreas',)
################
('galle',)
('pankreas',)
################
()
('basis',)
################
('krk',)
('krk',)
################
('basis',)
('basis',)
#########