In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, accuracy_score,precision_score,recall_score,f1_score
from scipy.sparse import hstack
import warnings,json,gzip
from sklearn.preprocessing import OneHotEncoder


def classify_cancer(fn, penalty, alpha, random_state=None):
    '''
    Runs a multilabel classification experiment.

    Loads features and fine grained labels via getFeaturesAndLabelsFine, holds
    out 30% of the rows for testing, trains one SGDClassifier per label
    (one-vs-rest) and scores the predictions on the held-out rows.

    Parameters:
        fn: path of the features CSV, handed to getFeaturesAndLabelsFine
        penalty: regularisation penalty handed to SGDClassifier
        alpha: regularisation strength handed to SGDClassifier
        random_state: optional seed forwarded to the train/test split and to
            SGDClassifier; the default (None) keeps the unseeded behaviour

    Returns:
        (hamming loss, sample-averaged precision, sample-averaged recall)

    Side effect: writes the per-label precision/recall/f1 rankings to
    "multilabel_classification_metrics.json" in the current working directory.
    The file is gzip-compressed although its name ends in .json.
    '''
    X,y,labelNames = getFeaturesAndLabelsFine(fn)
    # a single pre-formatted string prints the same under Python 2 and 3
    print("%s %s %s" % (X.shape, y.shape, len(labelNames)))
    # a train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)
    # turn off warnings, usually there are some labels missing in the training set
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # train a classifier
        print("Training classifier")
        base = SGDClassifier(penalty=penalty, alpha=alpha, random_state=random_state)
        classif = OneVsRestClassifier(base, n_jobs=-1).fit(X_train, y_train)
    # predict
    y_predicted = classif.predict(X_test)
    # the scores we want to compute
    scorers = [precision_score,recall_score,f1_score]
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # compute per-label scores, each ranked from best to worst label
        metrics = {s.__name__:getSortedMetrics(y_test,y_predicted,labelNames,s) for s in scorers}
    # dump results; the with-block closes the gzip stream so the compressed
    # data is flushed completely before the function returns
    with gzip.open("multilabel_classification_metrics.json","wt") as fh:
        json.dump(metrics,fh)
    hl = hamming_loss(y_test,y_predicted)
    ps = precision_score(y_test, y_predicted, average='samples')
    rs = recall_score(y_test, y_predicted, average='samples')
    return hl, ps, rs

# medicinal database indicators
def indicator_for(row):
    '''
    Flag which medical literature databases a review article's abstract names.

    Parameters:
        row: a record offering row['review_article'] and row.abstract
            (e.g. one row of the features DataFrame)

    Returns:
        A list with one float per search term, in the order of ``words``:
        1.0 when the row is a review article (review_article == 1) and the
        term occurs in the lower-cased abstract delimited by single spaces,
        otherwise 0.0.
    '''
    words = ['medline', 'pubmed', 'embase', 'cochrane', 'cochrane library', 'ovid', 'google scholar']
    if not row['review_article'] == 1:
        return [0.0 for _ in words]
    # Pad with spaces and search for " term ": for single words this is the
    # same test as membership in abstract.split(" "), and it also lets the
    # two-word terms match, which a per-token comparison never could.
    # 'cochrane library' deliberately also counts as a 'cochrane' hit.
    padded = " " + row.abstract.lower() + " "
    return [1.0 if (" " + word + " ") in padded else 0.0 for word in words]

def indicator_count_for(row):
    '''
    Count how many medical literature database terms a review abstract names.

    Parameters:
        row: a record offering row['review_article'] and row.abstract
            (e.g. one row of the features DataFrame)

    Returns:
        The numpy sum of one 1.0/0.0 flag per search term. Rows that are not
        review articles (review_article != 1) always yield 0.0.
    '''
    words = ['medline', 'pubmed', 'embase', 'cochrane', 'cochrane library', 'ovid', 'google scholar']
    if row['review_article'] == 1:
        # Pad with spaces and search for " term ": for single words this is the
        # same test as membership in abstract.split(" "), and it also lets the
        # two-word terms match, which a per-token comparison never could.
        # 'cochrane library' deliberately also counts as a 'cochrane' hit.
        padded = " " + row.abstract.lower() + " "
        hits = [1.0 if (" " + word + " ") in padded else 0.0 for word in words]
    else:
        hits = [0.0 for _ in words]
    return np.sum(hits)

def one_hot_database_indicators_from(df):
    '''
    One-hot encode the per-row database mention count of ``df``.

    Missing values are replaced by empty strings, indicator_count_for is
    applied to every row, and the resulting counts are fed to a OneHotEncoder
    as a single column. Returns whatever the encoder's fit_transform yields.
    '''
    counts = df.fillna("").apply(indicator_count_for, axis=1)
    # OneHotEncoder expects a 2-D input: one sample per row, one column
    countColumn = counts.values.reshape(-1, 1)
    encoder = OneHotEncoder()
    return encoder.fit_transform(countColumn)

def getFeatures(fn):
    '''
    Load the CSV at ``fn`` and vectorize its columns into one feature matrix.

    Feature groups, stacked side by side in this order:
      * fulltitle: hashed word-bounded character 1-4-grams, 2**12 columns
      * keywords: token counts
      * author: hashed tokens, 2**12 columns
      * abstract: hashed tokens, 2**12 columns
      * one-hot encoded count of medical database names (see
        one_hot_database_indicators_from)

    Missing text values are treated as empty strings.

    Returns the horizontally stacked matrix, one row per CSV row.
    '''
    df = pd.read_csv(fn)
    # character class that strips the brackets and quotes of stringified lists
    listSyntax = '[\[\]\'\"]'
    features = []
    print("Vectorizing title character ngrams")
    titleVectorizer = HashingVectorizer(analyzer="char_wb",ngram_range=(1,4),n_features=2**12)
    features.append(titleVectorizer.fit_transform(df.fulltitle.fillna("")))
    print("Vectorizing keywords")
    # Uses the original keywords of the search query. The preprocessed
    # searchquery_terms column was tried as an alternative; keywords seemed to
    # work better.
    # fillna("") as for the other text columns: a NaN keyword cell would
    # otherwise reach CountVectorizer as a float instead of a string.
    features.append(CountVectorizer().fit_transform(df.keywords.fillna("").str.replace(listSyntax,"")))
    print("Vectorizing authors")
    features.append(HashingVectorizer(n_features=2**12).fit_transform(df.author.fillna("").str.replace(listSyntax,"")))
    print("Vectorizing abstracts")
    features.append(HashingVectorizer(n_features=2**12).fit_transform(df.abstract.fillna("").str.replace(listSyntax,"")))
    print("Computing medicinal database counts")
    features.append(one_hot_database_indicators_from(df))
    X = hstack(features)
    print("Extracted feature vectors with %d dimensions"%X.shape[-1])
    return X

def getFeaturesAndLabelsFineMapped(fn):
    '''
    Load and vectorize features and fine grained labels (vectorized using MultiLabelBinarizer).

    TODO: before piping through MultiLabelBinarizer, apply a mapping to reduce
    the cardinality of labels. No such mapping is applied yet; the labels are
    binarized exactly as tokenizeCancerLabels returns them.

    Returns:
        (X, y, classes): the feature matrix from getFeatures(fn), the binary
        label indicator matrix, and the label names in column order of y.
    '''
    print("Reading data")
    df = pd.read_csv(fn)
    # tokenize and binarize cancer classification labels
    print("Vectorizing labels")
    labelVectorizer = MultiLabelBinarizer()
    # Empty CSV cells are read as NaN; without fillna("") they survive
    # str.replace and tokenizeCancerLabels fails calling .split on a float.
    labelStrings = df.classifications.fillna("").str.replace('[\[\]\'\"]',"")
    y = labelVectorizer.fit_transform(labelStrings.apply(tokenizeCancerLabels))
    print("Vectorized %d labels"%y.shape[-1])
    X = getFeatures(fn)
    return X,y,labelVectorizer.classes_

def getFeaturesAndLabelsFine(fn):
    '''
    Build the feature matrix and the fine grained label matrix for the CSV at ``fn``.

    The classifications column (missing values read as empty strings) is
    cleaned with clean_labels.clean_classification and then binarized with a
    MultiLabelBinarizer.

    Returns (X, y, classes): features from getFeatures(fn), the binary label
    indicator matrix, and the label names in column order of y.
    '''
    print("Reading data")
    frame = pd.read_csv(fn)
    print("Vectorizing labels")
    binarizer = MultiLabelBinarizer()

    # project-local cleaning module, imported at first use
    from cleaning_classification_labels import clean_labels
    translationsCsv = '../data/master/information/translations-labels.csv'
    dictionaryCsv = 'cleaning_classification_labels/classification_dictionary.csv'
    rawLabels = frame.classifications.fillna("")
    cleanedLabels = clean_labels.clean_classification(rawLabels, translationsCsv, dictionaryCsv)

    y = binarizer.fit_transform(cleanedLabels)
    print("Vectorized %d labels"%y.shape[-1])
    X = getFeatures(fn)
    return X, y, binarizer.classes_

def getFeaturesAndLabelsCoarse(fn):
    '''
    Build the feature matrix and the coarse (top level) label matrix for the CSV at ``fn``.

    The label_top_level column is stripped of brackets and quotes, tokenized
    with tokenizeCancerLabels and binarized with a MultiLabelBinarizer.

    Returns (X, y, classes): features from getFeatures(fn), the binary label
    indicator matrix, and the label names in column order of y.
    '''
    print("Reading data")
    table = pd.read_csv(fn)
    print("Vectorizing labels")
    binarizer = MultiLabelBinarizer()
    # drop list syntax characters, then split into individual labels
    strippedLabels = table.label_top_level.str.replace('[\[\]\'\"]',"")
    tokenLists = strippedLabels.apply(tokenizeCancerLabels)
    y = binarizer.fit_transform(tokenLists)
    print("Vectorized %d labels"%y.shape[-1])
    X = getFeatures(fn)
    return X, y, binarizer.classes_

def getSortedMetrics(true, predicted, labels, scorer):
    '''
    Score predictions per label and rank the labels by score.

    ``scorer`` is called as scorer(true, predicted, average=None) and must
    return an array with one score per label. The result is a list of
    (label, score) pairs ordered from highest to lowest score.
    '''
    perLabel = scorer(true, predicted, average=None)
    # ascending argsort, reversed -> best label first
    bestFirst = perLabel.argsort()[::-1]
    ranked = []
    for idx in bestFirst:
        ranked.append((labels[idx], perLabel[idx]))
    return ranked


def tokenizeCancerLabels(s):
    '''
    Split a comma separated label string into its non-empty tokens.

    NOTE: a later cell of this notebook defines another function with the
    same name and different splitting rules; once that cell has run it
    replaces this one.
    '''
    tokens = []
    for piece in s.split(","):
        if piece:
            tokens.append(piece)
    return tokens


# data cleanup

In [None]:
def correct_data(path="master/features/"):
    '''
    Load features.csv from ``path`` and return it as a DataFrame.

    Also loads features-hodenniere.csv from the same directory (de-duplicated
    on pages/fulltitle) and left-merges it onto the features on
    fulltitle/pages.

    NOTE(review): the merge result does not influence the returned frame as
    written. The hoden/niere mask is computed but never used, and ``useful``
    is copied from ``data`` into the merged frame and straight back, so the
    returned ``useful`` column appears to hold the values it was loaded with.
    Presumably the intent was to overwrite ``useful`` for hoden/niere rows
    with the corrected labels - confirm before relying on this function.
    '''

    def is_hoden_niere(x):
        # True when the string mentions "hoden" or "niere"
        return "hoden" in x or "niere" in x

    data = pd.read_csv(path+"features.csv")
    # keep one corrected row per (pages, fulltitle) so the left merge below
    # cannot multiply rows of `data`
    correct_label = pd.read_csv(path+"features-hodenniere.csv").drop_duplicates(["pages","fulltitle"])
    c_data = pd.merge(data,correct_label,on=["fulltitle","pages"],how='left')
    c_data["useful"] = data.useful.values
    # NOTE(review): rebinds the helper's name to the resulting mask, and the
    # mask is not read afterwards. "cancer_types_x" is presumably the
    # left-hand column after the merge suffixed overlapping names - confirm.
    is_hoden_niere = c_data["cancer_types_x"].apply(is_hoden_niere)
    data["useful"] = c_data.useful.values

    return data


# word stemming

In [None]:
def pre_process_word_stemmer(x,type_x='porter'):
    '''
    Stem every space separated word of ``x`` with an NLTK stemmer.

    Parameters:
        x: the text to stem
        type_x: 'porter', 'lancaster' or 'snowball' (English); any other
            value returns ``x`` unchanged

    Returns:
        The stemmed words joined by single spaces, or ``x`` itself when
        ``type_x`` is not a known stemmer name.
    '''
    if type_x not in ['porter','lancaster','snowball']:
        return x

    # nltk is not imported in any of the visible import cells, so the stemmer
    # lookups below raised NameError. Importing after the guard keeps the
    # pass-through path free of the dependency.
    import nltk

    if type_x == 'porter':
        stemmer = nltk.stem.PorterStemmer()
    elif type_x == 'lancaster':
        stemmer = nltk.stem.LancasterStemmer()
    else:  # 'snowball' is the only remaining option after the guard
        stemmer = nltk.stem.SnowballStemmer(language='english')
    return ' '.join([stemmer.stem(w) for w in x.split(" ")])

# Quick check of the stemmer on a sample phrase; as the last expression of the
# cell, the returned string is shown as the cell output.
pre_process_word_stemmer('cancers type full',type_x='porter')

# model selection

In [None]:
# Sweep the regularisation strength and record the sample-averaged precision
# (second value returned by classify_cancer) for each alpha.
precisions = []
alphas = [1e-3, 1e-2, 1e-1]
for a in alphas:
    _, ps, _ = classify_cancer('../data/master/features/features.csv', penalty='l2', alpha=a)
    # print() around one pre-formatted string behaves the same under
    # Python 2 and Python 3, unlike the bare print statement
    print("alpha: {}, p: {}".format(a, ps))
    precisions.append(ps)

In [None]:
import pandas as pd
import numpy as np

#path_for_translation_labels = 'cancer_data/information/translations-labels.csv'   #path of the .csv translation file provided by the data ambassadors
#path_for_manual_transtable = 'classification_dictionary.csv'        # path of the .csv file   that was filled manually by Marie

def clean_classification(class_series, trans_path, manual_trans_path):
    '''
    Normalise the classification labels of a whole column.

    Parameters:
        class_series: Series of label strings, tokenized with tokenizeCancerLabels
        trans_path: ';'-delimited CSV with a 'Label (1st Level)' column
        manual_trans_path: ';'-delimited CSV without header; column 0 holds a
            label and column 1 the label it is translated to

    Returns:
        A Series holding, per input row, the list produced by clean_levels.
    '''
    tokenized = class_series.apply(tokenizeCancerLabels)

    officialLabels = pd.read_csv(trans_path, delimiter=';')
    firstLevelLabels = np.unique(officialLabels['Label (1st Level)'])
    manualTable = pd.read_csv(manual_trans_path, delimiter=';', header=None)

    # hard-coded sub-categories and the first level label each belongs to
    levelMap = {
        '1-bew': '1-koerper',
        '1-gew': '1-koerper',
        '1-gen': '1-koerper',
        '1-horm': '1-koerper',
        '1-rauch': '1-ps',
        '1-alk': '1-ps',
        '1-canna': '1-ps',
        '1-diab': '1-erk',
        '1-infekt': '1-erk',
        '3-tu-marker': '3-lab',
        '3-biops': '3-lab',
        '5-fr\xc3\xbch': '5',
    }

    # every first level label from the translation file maps to itself
    for firstLevel in firstLevelLabels:
        levelMap[firstLevel] = firstLevel

    # manual entries are applied last, so they win over everything above
    for _, entry in manualTable.iterrows():
        levelMap[entry[0]] = entry[1]

    def _cleanRow(labelList):
        return clean_levels(labelList, levelMap)

    return tokenized.apply(_cleanRow)


def tokenizeCancerLabels(s):
    '''
    Split a stringified label list into its non-empty entries.

    The literal '[]' yields an empty list. Otherwise every "['" and "']" is
    removed and the remainder is split on "','", so each entry keeps its
    inner "bodypart,classification" text.
    '''
    if s == '[]':
        return []

    inner = s.replace("['","").replace("']","")
    entries = []
    for piece in inner.split("','"):
        if piece:
            entries.append(piece)
    return entries



def clean_levels(s_list,trans_dict):
    '''
    Translate the classification part of each label to its canonical form.

    Parameters:
        s_list: list of labels shaped "bodypart[,bodypart...],classification"
        trans_dict: mapping from classification codes to their replacement

    Returns:
        A new list with one "bodypart,classification" string per input label.
        The classification (text after the last comma) is looked up like this:
          * no '-' in it: kept as is, without a lookup
          * one '-': replaced by trans_dict[classification] when present
          * more '-': the full code, then its first three parts, then its
            first two parts are tried; the first one found wins
        Unknown classifications are kept unchanged. A label without any comma
        has no classification part and is passed through untouched.
    '''
    new_list = []
    for t in s_list:
        ts = t.split(',')
        if len(ts) < 2:
            # malformed label without a classification: keep it rather than
            # failing on the missing second field
            new_list.append(t)
            continue

        # the classification is always the last comma separated field
        old_class = ts[-1]

        ### correct the classification of the label of format  X-str-...
        ocs = old_class.split('-')
        if len(ocs) == 1:
            candidates = []
        elif len(ocs) == 2:
            candidates = [old_class]
        else:
            # the three part prefix is needed so that e.g. '3-tu-marker-...'
            # maps via '3-tu-marker' before the shorter '3-tu' is tried
            candidates = [old_class, '-'.join(ocs[:3]), '-'.join(ocs[:2])]

        new_class = old_class
        for candidate in candidates:
            # `in` works on Python 2 and 3; dict.has_key exists on Python 2 only
            if candidate in trans_dict:
                new_class = trans_dict[candidate]
                break

        # NOTE(review): for labels with several bodyparts only the first one is
        # emitted, as before. The old code also built a sorted, lower-cased
        # bodypart string that it never used - confirm which output is wanted.
        new_list.append(ts[0] + ',' + new_class)

    return new_list


In [None]:
# Show cell contents untruncated so the raw and cleaned labels can be compared.
pd.set_option('display.max_colwidth', -1)
from cleaning_classification_labels import clean_labels

# Load the features with missing values as empty strings and attach the
# cleaned labels next to the original column.
df = pd.read_csv('../data/master/features/features.csv').fillna("")
df['cleaned_classifications'] = clean_labels.clean_classification(
    df.classifications,
    '../data/master/information/translations-labels.csv',
    'cleaning_classification_labels/classification_dictionary.csv'
)

# Side-by-side preview of the first 150 rows (cell output).
df[['cleaned_classifications', 'classifications']].head(150)

In [None]:
# Tabulate the precision measured for each alpha in the sweep above and plot it.
# The frame has to be bound to `perf`: the original line neither assigned it
# nor spelled the constructor correctly (`pd.DataFramepd.DataFrame`).
perf = pd.DataFrame({'alpha': alphas, 'precision': precisions}).set_index('alpha')
perf.plot()