In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import spacy
from itertools import chain
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

f1 = './data/training_text'
f2 = './data/training_variants'

df2 = pd.read_csv(f2)



In [2]:
df1 = pd.read_csv(f1, sep='\|\|', names=['ID', 'text'], skiprows=1)
df1.head()

Unnamed: 0,ID,text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [3]:
df2.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [4]:
df = df1.merge(df2)
df.head()

Unnamed: 0,ID,text,Gene,Variation,Class
0,0,Cyclin-dependent kinases (CDKs) regulate a var...,FAM58A,Truncating Mutations,1
1,1,Abstract Background Non-small cell lung canc...,CBL,W802*,2
2,2,Abstract Background Non-small cell lung canc...,CBL,Q249E,2
3,3,Recent evidence has demonstrated that acquired...,CBL,N454D,3
4,4,Oncogenic mutations in the monomeric Casitas B...,CBL,L399V,4


In [12]:
##tokenize and remove stop words
from nltk.tokenize.regexp import RegexpTokenizer
from nltk_trainer.featx.phonetics import soundex

# use nltk to tokenize the data then soundex the tokens
# replace the soundexed tokens in the original data
regex_tokenizer = RegexpTokenizer(r'(?u)\d+(?:\.\d+)?|\w+')
regex_nonnum_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')


def soundex_tokenizer(data_to_tokenize):
    tokenized_data = [' '.join(map(soundex, regex_tokenizer.tokenize(product))) for product in data_to_tokenize]
    return tokenized_data


def word_tokenizer(data_to_tokenize):
    tokenized_data = [' '.join(regex_tokenizer.tokenize(product)) for product in data_to_tokenize]
    return tokenized_data

#df['tokens'] contain tokenized documents
df['tokens'] = word_tokenizer(df.text)

## look at freqcounts of training data and Y

#count of words
all_words = chain.from_iterable([words for rownum, words in df['tokens'].iteritems()])
words = pd.Series(list(all_words)).value_counts()

#count of labels

print(df['Class'].value_counts())

#####create X and Y training data 

#%% X and Y Labels
le = preprocessing.LabelEncoder()
le.fit(df2['Class'])
df2['Class2'] = le.transform(df2['Class'])

Y_train = df2['Class2']

#%%

X_train = df['tokens']
#X2 = df2['tokens']

docs_train, docs_test, labels_train, labels_test = train_test_split(
        X_train, Y_train, test_size=0.1, random_state=42, stratify=Y_train)

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: Class, dtype: int64


In [13]:
def tok(x):
    return x

def prep(x):
    return x

vectorizer = TfidfVectorizer(tokenizer=tok, preprocessor=prep,
                             ngram_range=(3,3), min_df=2)


check = vectorizer.fit_transform(X_train) 
#check = check.toarray()
feature_names = vectorizer.get_feature_names()
#the curse of dimensionality or the predictors >> no. samples
print("n_samples: %d, n_features: %d" % check.shape)


n_samples: 3321, n_features: 55260


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

cnt_vectorizer = CountVectorizer(tokenizer=tok, preprocessor=prep,ngram_range=(3,3), min_df=2)

In [19]:
# nltk for tokenizing and removing stop words
from sklearn.feature_extraction.text import *
import re

text = re.compile("[A-Za-z]+")

def tok(x):
    return text.findall(x)

def prep(x):
    return text.findall(x)


tftr = TfidfVectorizer()
# tftr.fit_transform(df.text)

<3321x155732 sparse matrix of type '<class 'numpy.float64'>'
	with 5620942 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.multiclass import OutputCodeClassifier, OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics


model = Pipeline([
    ('vectorizer', tftr),
    ('LR',LogisticRegression(multi_class='multinomial', solver='newton-cg'))
])

model.fit(docs_train, labels_train)

labels_predict = model.predict(docs_test)

# proba = model.predict_proba(docs_test)

print("Precision", metrics.precision_score(labels_test, labels_predict, average='macro'))
print("Recall", metrics.recall_score(labels_test, labels_predict, average='micro'))
print("F1-score", metrics.f1_score(labels_test, labels_predict, average='weighted'))
print("F beta score", metrics.fbeta_score(labels_test, labels_predict, beta=0.5, average='macro'))

Precision 0.538594616995
Recall 0.636636636637
F1-score 0.614119959801
F beta score 0.501044370664


In [18]:
# tokenized

model = Pipeline([
            ('vectorizer', vectorizer),
            ('LR',LogisticRegression(multi_class='multinomial', solver='newton-cg'))
            ])

model.fit(docs_train, labels_train)

labels_predict = model.predict(docs_test)

# proba = model.predict_proba(docs_test)

print("Precision", metrics.precision_score(labels_test, labels_predict, average='macro'))
print("Recall", metrics.recall_score(labels_test, labels_predict, average='micro'))
print("F1-score", metrics.f1_score(labels_test, labels_predict, average='weighted'))
print("F beta score", metrics.fbeta_score(labels_test, labels_predict, beta=0.5, average='macro'))

Precision 0.533360286296
Recall 0.630630630631
F1-score 0.605827925462
F beta score 0.49237392823


In [31]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

est = RandomForestClassifier(n_estimators=500)

model = Pipeline([
    ('vectorizer', tftr),
    ('multiclass', MultiOutputClassifier(est))
])

model.fit(docs_train, pd.get_dummies(labels_train))

labels_predict = model.predict(docs_test)

labels_test = pd.get_dummies(labels_test)

print("Precision", metrics.precision_score(labels_test, labels_predict, average='macro'))
print("Recall", metrics.recall_score(labels_test, labels_predict, average='micro'))
print("F1-score", metrics.f1_score(labels_test, labels_predict, average='weighted'))
print("F beta score", metrics.fbeta_score(labels_test, labels_predict, beta=0.5, average='macro'))

Precision 0.642690570182
Recall 0.498498498498
F1-score 0.587145139085
F beta score 0.570798570741


In [32]:
# with tfidf vectoriser

est = RandomForestClassifier(n_estimators=100)

model = Pipeline([
    ('vectorizer', vectorizer),
    ('multiclass', MultiOutputClassifier(est))
])

model.fit(docs_train, pd.get_dummies(labels_train))

labels_predict = model.predict(docs_test)

labels_test = pd.get_dummies(labels_test)

print("Precision", metrics.precision_score(labels_test, labels_predict, average='macro'))
print("Recall", metrics.recall_score(labels_test, labels_predict, average='micro'))
print("F1-score", metrics.f1_score(labels_test, labels_predict, average='weighted'))
print("F beta score", metrics.fbeta_score(labels_test, labels_predict, beta=0.5, average='macro'))

Precision 0.641551584979
Recall 0.489489489489
F1-score 0.581474504482
F beta score 0.58278843881
