In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns

In [33]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, accuracy_score,precision_score,recall_score,f1_score
from scipy.sparse import hstack
import warnings,json,gzip



def classify_cancer(fn, penalty, alpha):
    '''
    Runs a multilabel classification experiment
    '''
    X,y,labelNames = getFeaturesAndLabelsFine(fn)
    print X.shape, y.shape, len(labelNames)
    # a train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # turn off warnings, usually there are some labels missing in the training set
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # train a classifier
        print("Training classifier")
        classif = OneVsRestClassifier(SGDClassifier(penalty=penalty, alpha=alpha), n_jobs=-1).fit(X_train, y_train)
    # predict
    y_predicted = classif.predict(X_test)
    # the scores we want to compute
    scorers = [precision_score,recall_score,f1_score]
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # compute Scores
        metrics = {s.__name__:getSortedMetrics(y_test,y_predicted,labelNames,s) for s in scorers}
    # dump results
    json.dump(metrics,gzip.open("multilabel_classification_metrics.json","wt"))
    hl = hamming_loss(y_test,y_predicted)
    ps = precision_score(y_test, y_predicted, average='samples')
    rs = recall_score(y_test, y_predicted, average='samples')
    return hl, ps, rs

# medicinal database indicators
def indicator_for(row):
    words = ['medline', 'pubmed', 'embase', 'cochrane', 'cochrane library', 'ovid', 'google scholar']
    return [1.0 if row['review_article'] == 1 and word.lower() in row.abstract.lower().split(" ") else 0.0 for word in words]

def indicator_count_for(row):
    words = ['medline', 'pubmed', 'embase', 'cochrane', 'cochrane library', 'ovid', 'google scholar']
    return np.sum([1.0 if row['review_article'] == 1 and word.lower() in row.abstract.lower().split(" ") else 0.0 for word in words])

def one_hot_database_indicators_from(df):
    return OneHotEncoder().fit_transform(df.fillna("").apply(indicator_count_for, axis=1).values.reshape(-1, 1))

def getFeatures(fn):
    '''
    Load and vectorize features
    '''
    df = pd.read_csv(fn)
    features = []
    print("Vectorizing title character ngrams")
    titleVectorizer = HashingVectorizer(analyzer="char_wb",ngram_range=(1,4),n_features=2**12)
    features.append(titleVectorizer.fit_transform(df.fulltitle.fillna("")))
    print("Vectorizing keywords")
    # searchquery_terms is already a preprocessing step done by data angels, for direct integration
    # keywords seems to be better
    #features.append(CountVectorizer().fit_transform(df.searchquery_terms.str.replace('[\[\]\'\"]',"")))
    # take original keywords as per search query
    features.append(CountVectorizer().fit_transform(df.keywords.str.replace('[\[\]\'\"]',"")))
    print("Vectorizing authors")
    features.append(HashingVectorizer(n_features=2**12).fit_transform(df.author.fillna("").str.replace('[\[\]\'\"]',"")))
    print("Vectorizing abstracts")
    features.append(HashingVectorizer(n_features=2**12).fit_transform(df.abstract.fillna("").str.replace('[\[\]\'\"]',"")))
    print("Computing medicinal database counts")
    features.append(one_hot_database_indicators_from(df))
    X = hstack(features)
    print("Extracted feature vectors with %d dimensions"%X.shape[-1])
    return X

def getFeaturesAndLabelsFineMapped(fn):
    '''
    TODO
    Load and vectorizer features and fine grained labels (vectorized using MultiLabelBinarizer)
    Before piping through MultiLabelBinarizer, apply mapping to reduce the cardinality of labels
    '''
    print("Reading data")
    df = pd.read_csv(fn)
    # tokenize and binarize cancer classification labels
    print("Vectorizing labels")
    labelVectorizer = MultiLabelBinarizer()
    y = labelVectorizer.fit_transform(df.classifications.str.replace('[\[\]\'\"]',"").apply(tokenizeCancerLabels))
    print("Vectorized %d labels"%y.shape[-1])
    X = getFeatures(fn)
    return X,y,labelVectorizer.classes_

def getFeaturesAndLabelsFine(fn):
    '''
    Load and vectorizer features and fine grained labels (vectorized using MultiLabelBinarizer)
    '''
    print("Reading data")
    df = pd.read_csv(fn)
    # tokenize and binarize cancer classification labels
    print("Vectorizing labels")
    labelVectorizer = MultiLabelBinarizer()
    y = labelVectorizer.fit_transform(df.classifications.str.replace('[\[\]\'\"]',"").apply(tokenizeCancerLabels))
    print("Vectorized %d labels"%y.shape[-1])
    X = getFeatures(fn)
    return X,y,labelVectorizer.classes_

def getFeaturesAndLabelsCoarse(fn):
    '''
    Load and vectorizer features and coarse grained top level labels (vectorized using MultiLabelBinarizer)
    '''
    print("Reading data")
    df = pd.read_csv(fn)
    # tokenize and binarize cancer classification labels
    print("Vectorizing labels")
    labelVectorizer = MultiLabelBinarizer()
    y = labelVectorizer.fit_transform(df.label_top_level.str.replace('[\[\]\'\"]',"").apply(tokenizeCancerLabels))
    print("Vectorized %d labels"%y.shape[-1])
    X = getFeatures(fn)
    return X,y,labelVectorizer.classes_

def getSortedMetrics(true, predicted, labels, scorer):
    '''
    Scores predictions
    '''
    score = scorer(true,predicted,average=None)
    return [(labels[l],score[l]) for l in score.argsort()[::-1]]


def tokenizeCancerLabels(s):
    '''
    Tokenize the label string and remove empty strings
    '''
    return [t for t in s.split(",") if len(t)>0]


In [35]:
# X, y, y_names = getFeaturesAndLabelsFine('../data/master/features/features.csv')
# X.shape, y.shape, y_names.shape
# X.shape

precisions = []
alphas = [1e-3, 1e-2, 1e-1]
for a in alphas:
    _, ps, _, = classify_cancer('../data/master/features/features.csv', penalty='l2', alpha=a)
    print "alpha: {}, p: {}".format(a, ps)
    precisions.append(ps)

Reading data
Vectorizing labels
Vectorized 722 labels
Vectorizing title character ngrams
Vectorizing keywords
Vectorizing authors
Vectorizing abstracts
Computing medicinal database counts
Extracted feature vectors with 12397 dimensions
(45885, 12397) (45885, 722) 722
Training classifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


alpha: 0.001, p: 0.310899236444
Reading data
Vectorizing labels
Vectorized 722 labels
Vectorizing title character ngrams
Vectorizing keywords
Vectorizing authors
Vectorizing abstracts


KeyboardInterrupt: 

In [None]:
pd.DataFramepd.DataFrame({'alpha': alphas, 'precision': precisions}).set_index('alpha')
perf.plot()

In [140]:
# X, y, y_names = getFeaturesAndLabelsFine('../data/master/features/features.csv')
# X.shape, y.shape, y_names.shape
# X.shape

classify_cancer('../data/master/features/features.csv')

Reading data
Vectorizing labels
Vectorized 722 labels
Vectorizing title character ngrams
Vectorizing keywords
Vectorizing authors
Vectorizing abstracts
Extracted feature vectors with 12340 dimensions
(45885, 12340) (45885, 722) 722
Training classifier


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.0015226804327012275, 0.13516393045668071, 0.052860741859842367)

In [111]:
def classifications_to_list(s):
    '''
    take original classifications column and put all
    individual classifications into a list of string
    '''
    t = s[1:-1].split("',")
    return map(lambda e: e.replace("'", ""), t)

def translate(classification):
    return "1-{}".format(classification)

def proj_classifications(classifications):
    '''
    mapping from too specialized classifications into most frequent ones
    '''
    return list(set([translate(c) for c in classifications]))

df['listed_classifications'] = df.classifications.apply(classifications_to_list)
df['mapped_classifications'] = df.listed_classifications.apply(proj_classifications)

df.listed_classifications

0                                                      []
1                                                      []
2                                                      []
3                                                      []
4                                                      []
5                                                      []
6                                                      []
7                                                      []
8                                                      []
9                                                      []
10                                                     []
11                                                     []
12                                                     []
13                                                     []
14                                                     []
15                                                     []
16                                                     []
17            

In [34]:
X, y, y_names = getFeaturesAndLabelsFine('../data/master/features/features.csv')
X.shape, y.shape, y_names.shape

Reading data
Vectorizing labels
Vectorized 722 labels
Vectorizing title character ngrams
Vectorizing keywords
Vectorizing authors
Vectorizing abstracts
Extracted feature vectors with 12323 dimensions


((6056, 12323), (45885, 722), (722,))

array(['1', '1-alk', '1-alter', '1-betel', '1-bew', '1-blutgr', '1-canna',
       '1-diab', '1-diaet', '1-distress', '1-egcg', '1-erk', '1-erk-fette',
       '1-erk-fettstoffw', '1-erk-reflux', '1-erk-reflux-barrett',
       '1-erk-transpl', '1-ern', '1-ern-karz', '1-gen', '1-gew', '1-horm',
       '1-horm-ss', '1-horm-stillen', '1-implant', '1-infekt',
       '1-infekt-HIV', '1-infekt-hep', '1-infekt-hpv',
       '1-infekt-hpv-impfung', '1-infekt-impfung', '1-infekt-tep', '1-kam',
       '1-kam-bio', '1-karz', '1-karz-pest', '1-karz-str', '1-koerper',
       '1-lebensstil', '1-lq', '1-makro', '1-makro-fett', '1-med',
       '1-mikro', '1-mikro-eisen', '1-mikro-vit-d', '1-nm', '1-nm-fleisch',
       '1-op', '1-op-lk', '1-opi', '1-po', '1-ps', '1-rauch', '1-risiko',
       '1-schichtarbeit', '1-sex', '1-str', '1-stw', '1-umwelt', '1-vit-d',
       '1-zirk', '2', '2-arten', '2-arten-hintergrund', '2-arten-urethra',
       '2-dd', '2-def', '2-entw', '2-entw-Krebsentwicklung',
       '2-en

In [None]:
ytest.shape

In [83]:
def split_classes(s):
    x = map(lambda t: t.replace("'",""), s.split("',"))
    return x

a = df[df.useful == 1.0].classifications.str.replace('\[\]',"").apply(split_classes)

In [99]:
def aa(s):
    t = s[1:-1].split("',")
    return map(lambda s: "{}'".format(s), t)
df[df.useful == 1.0].classifications.apply(aa)

1955    ['hoden,4-frueh', 'hoden,4-rez', 'hoden,4-str-...
1956    ['hoden,3-roe', 'hoden,3-mrt', 'hoden,3-pet', ...
1957    ['hoden,4-frueh', 'hoden,4-rez', 'hoden,4-str-...
1958    ['hoden,4-frueh', 'hoden,4-rez', 'hoden,2-sy',...
1959                ['hoden,3-unters', 'hoden,3-screen'']
1960    ['hoden,5-lu', 'hoden,5-kard', 'hoden,5-ns', '...
1961                                     ['hoden,1-erk'']
1962                                     ['hoden,1-gew'']
1963    ['hoden,4-rez', 'hoden,4-str-nw', 'hoden,4-str...
1964          ['hoden,4-frueh', 'hoden,4-med-adj-frueh'']
1965                                     ['hoden,3-mrt'']
1966                                    ['hoden,5-horm'']
1967                      ['hoden,4-op', 'hoden,2-entw'']
1968                                    ['hoden,1-karz'']
1969                                   ['hoden,4-frueh'']
1970                                     ['hoden,5-ohr'']
1971                                     ['hoden,4-med'']
1972          

In [152]:
pd.read_csv('../data/master/features/features.csv').keywords

0                ['Quelle,KRK,leitlinie,handsuche,201306']
1                ['Quelle,KRK,leitlinie,handsuche,201306']
2                    ['Quelle,Bewegung,,handsuche,201406']
3        ['Quelle,Leukaemien und Lymphome,,handsuche,20...
4                ['Quelle,KRK,leitlinie,handsuche,201306']
5              ['Quelle,Mamma,leitlinie,handsuche,201307']
6                                                       []
7           ['Quelle,Prostata,leitlinie,handsuche,201306']
8                ['Quelle,KRK,leitlinie,handsuche,201306']
9                ['Quelle,KRK,leitlinie,handsuche,201306']
10                      ['Quelle,Lunge,,handsuche,201312']
11                      ['Quelle,Lunge,,handsuche,201312']
12             ['Quelle,Mamma,leitlinie,handsuche,201307']
13              ['Quelle,Ovar,leitlinie,handsuche,201311']
14                   ['Quelle,Bewegung,,handsuche,201311']
15               ['Quelle,KRK,leitlinie,handsuche,201306']
16                   ['Quelle,Therapie,,handsuche,201402

In [162]:
df[['searchquery_terms', 'keywords']]

Unnamed: 0,searchquery_terms,keywords
0,['Hoden'],"['Quelle,Hoden,ovid,systematisch,201510']"
1,['Hoden'],"['Quelle,Hoden,ovid,systematisch,201510']"
2,['Hoden'],"['Quelle,Hoden,ovid,systematisch,201510']"
3,"['Hoden','Harnwege']","['Quelle,Harnwege,ovid,systematisch,201408','Q..."
4,['Hoden'],"['Quelle,Hoden,ebsco,systematisch,201510']"
5,['Hoden'],"['Quelle,Hoden,ebsco,systematisch,201510']"
6,"['Niere','Prostata','Hoden','Harnwege']","['Quelle,Harnwege,ovid,systematisch,201408','Q..."
7,['Hoden'],"['Quelle,Hoden,ovid,systematisch,201510']"
8,['Hoden'],"['Quelle,Hoden,ovid,systematisch,201510']"
9,['Hoden'],"['Quelle,Hoden,ovid,systematisch,201510']"


In [170]:
pd.set_option('display.max_colwidth', -1)
df.keywords.str.replace('[\[\]\'\"]',"").apply(lambda r: r)

0       Quelle,Hoden,ovid,systematisch,201510                                                                                                                                                                                                                                                                                                                                                                 
1       Quelle,Hoden,ovid,systematisch,201510                                                                                                                                                                                                                                                                                                                                                                 
2       Quelle,Hoden,ovid,systematisch,201510                                                                                                                                                             

In [176]:
CountVectorizer().fit_transform(df.keywords.str.replace('[\[\]\'\"]',"").apply(tokenizeCancerLabels))

AttributeError: 'list' object has no attribute 'lower'