In [249]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn import tree
from sklearn.metrics import log_loss, accuracy_score
from sklearn.decomposition import TruncatedSVD
import xgboost, lightgbm
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

In [2]:
# The text files separate ID and Text with a literal "||". A multi-character
# separator is treated as a regular expression, which only the python parser
# engine supports, so request that engine explicitly; the pattern is a raw
# string so the backslashes reach the regex engine untouched.
train_txt = pd.read_csv("training_text", sep=r'\|\|', engine='python', header=None, skiprows=1, names=["ID","Text"])
test_txt = pd.read_csv("test_text", sep=r'\|\|', engine='python', header=None, names=["ID","Text"])
# The variant tables are plain comma-separated files.
train_var = pd.read_csv("training_variants",sep=',')
test_var = pd.read_csv("test_variants",sep=',',header=None,names=['ID', 'Gene', 'Variation', 'Class'])

  """Entry point for launching an IPython kernel.
  


In [3]:
# Report (rows, columns) of every raw table, in load order.
for label, frame in (('train_txt', train_txt), ('test_txt', test_txt),
                     ('train_var', train_var), ('test_var', test_var)):
    print(label, frame.shape)

train_txt (3321, 2)
test_txt (368, 2)
train_var (3321, 4)
test_var (368, 4)


In [4]:
# Attach each training variant's text by ID, keep only complete rows and
# renumber them 0..n-1. ID stays in the frame as a regular column.
train_set = (
    pd.merge(train_var, train_txt, how='inner', on='ID')
    .loc[:, ['ID', 'Gene', 'Variation', 'Text', 'Class']]
    .dropna()
    .reset_index(drop=True)
)
print(train_set.shape)
train_set.head(5)

(3316, 5)


Unnamed: 0,ID,Gene,Variation,Text,Class
0,0,FAM58A,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...,1
1,1,CBL,W802*,Abstract Background Non-small cell lung canc...,2
2,2,CBL,Q249E,Abstract Background Non-small cell lung canc...,2
3,3,CBL,N454D,Recent evidence has demonstrated that acquired...,3
4,4,CBL,L399V,Oncogenic mutations in the monomeric Casitas B...,4


In [5]:
# Attach each test variant's text by ID, keep only complete rows and
# renumber them 0..n-1. ID stays in the frame as a regular column.
test_set = (
    pd.merge(test_var, test_txt, how='inner', on='ID')
    .loc[:, ['ID', 'Gene', 'Variation', 'Text', 'Class']]
    .dropna()
    .reset_index(drop=True)
)
print(test_set.shape)
test_set.head(5)

(367, 5)


Unnamed: 0,ID,Gene,Variation,Text,Class
0,0,CBL,H398Q,Oncogenic mutations in the monomeric Casitas B...,4
1,1,CBL,S80N,Abstract Background Non-small cell lung canc...,6
2,2,SHOC2,M173I,Rasopathies are phenotypically similar syndrom...,4
3,3,DICER1,D1709N,Abstract DICER1 plays a critical role in mic...,4
4,4,PTPRT,S492F,The receptor protein tyrosine phosphatase T (P...,1


In [7]:
# train_set.describe(include = 'all')
# test_set.describe(include = 'all')

In [8]:
# Distinct gene symbols present in each split.
train_gene, test_gene = train_set['Gene'].unique(), test_set['Gene'].unique()
print('count_train_gene =', len(train_gene))
print('count_test_gene =', len(test_gene))

count_train_gene = 262
count_test_gene = 139


In [9]:
# Distinct variation strings present in each split.
train_variation, test_variation = train_set['Variation'].unique(), test_set['Variation'].unique()
print('count_train_variation =', len(train_variation))
print('count_test_variation =', len(test_variation))

count_train_variation = 2993
count_test_variation = 328


In [10]:
def get_char_info(text):
    """Return the first three non-digit runs of ``text``.

    ``text`` is cut at every run of decimal digits; the pieces that remain
    (in order of appearance) are the "character runs".

    Parameters
    ----------
    text : str
        String to decompose, e.g. ``'CDK12A'``.

    Returns
    -------
    tuple of (str, str, str)
        The first, second and third character run. Missing runs are returned
        as ``''``; runs after the third are ignored.
    """
    # '+' rather than '*': the pattern must never match the empty string,
    # otherwise re.split (Python 3.7+) also cuts between every character.
    runs = [piece for piece in re.split('[0-9]+', text) if piece != '']
    # Pad to three entries so any number of runs (0, 1, 2, 3 or more) unpacks.
    first_char, second_char, third_char = (runs + ['', '', ''])[:3]
    return first_char, second_char, third_char

In [11]:
def get_num_info(text):
    """Return the first three numbers embedded in ``text``.

    ``text`` is cut at every run of upper-case ASCII letters and each
    remaining piece is converted with ``int``.

    Parameters
    ----------
    text : str
        String to decompose, e.g. ``'FAM58A'``.

    Returns
    -------
    tuple of (int, int, int)
        The first, second and third number. Missing numbers are returned as
        ``0``; numbers after the third are ignored.

    Raises
    ------
    ValueError
        If one of the first three pieces is not a valid integer literal
        (for example because it contains lower-case letters).
    """
    # '+' rather than '*': the pattern must never match the empty string,
    # otherwise re.split (Python 3.7+) also cuts between individual digits.
    pieces = [piece for piece in re.split('[A-Z]+', text) if piece != '']
    numbers = [int(piece) for piece in pieces[:3]]
    # Pad to three entries so any number of pieces unpacks cleanly.
    first_num, second_num, third_num = (numbers + [0, 0, 0])[:3]
    return first_num, second_num, third_num

In [12]:
def gene_features(gene):
    """Summarise the letter runs and numbers found in ``gene``.

    Returns a 9-tuple: for each of the three character runs reported by
    ``get_char_info`` its first character (a single space when the run is
    empty) followed by its length, then the three numbers reported by
    ``get_num_info``.
    """
    char_runs = get_char_info(gene)
    numbers = get_num_info(gene)
    summary = []
    for run in char_runs:
        # A blank placeholder keeps the "first character" slot populated.
        initial = run[0] if len(run) > 0 else ' '
        summary.extend((initial, len(run)))
    return (*summary, *numbers)

In [13]:
# extract pure numbers
def extract_num_frm_text(text):
    """Pull the first stand-alone number out of ``text``.

    A stand-alone number is a run of digits with a single space on each side.

    Parameters
    ----------
    text : str
        Variation description, e.g. ``'Exon 20 insertions'``.

    Returns
    -------
    tuple of (int, str)
        The number and the text with every occurrence of that
        space-delimited number collapsed to a single space. When no
        stand-alone number exists the result is ``(0, text)``.
    """
    # '+' rather than '*': with '*' two consecutive spaces count as a match
    # and int('  ') raises ValueError.
    found = re.search(' [0-9]+ ', text)
    if found is None:
        return 0, text
    token = found.group()
    # int() tolerates the surrounding spaces captured by the pattern.
    return int(token), ' '.join(text.split(token))

In [14]:
#only text 
def extract_pure_text_frm_text(text):
    """Separate plain words from everything else in ``text``.

    ``text`` is cut at single spaces. A token is a plain word when it is, in
    full, either one capital letter followed by lower-case letters or a
    (possibly empty) run of lower-case letters and ``/``.

    Returns ``(pure_text, non_pure_text)``: the plain words joined by single
    spaces, and all other tokens concatenated without a separator.
    """
    word_shape = re.compile('[A-Z][a-z]*|[a-z/]*')
    plain_words, other_tokens = [], []
    for token in text.split(' '):
        if word_shape.fullmatch(token) is not None:
            plain_words.append(token)
        else:
            other_tokens.append(token)
    return ' '.join(plain_words), ''.join(other_tokens)

In [15]:
def extract_category_combination_from_text(text):
    """Split ``text`` into its lower-case part and its remaining part.

    The characters ``_ ? ' * -`` are removed first. From what is left:

    * ``cat``  is the text with every upper-case letter and digit removed;
    * ``comb`` is the text with every lower-case letter removed.

    Parameters
    ----------
    text : str
        Token(s) to decompose, e.g. ``'W802*'``.

    Returns
    -------
    tuple of (str, str)
        ``(cat, comb)``.
    """
    # Raw strings avoid invalid string escapes such as '\_', and the '+'
    # quantifiers never match the empty string, so no regex warning is raised.
    rep = re.sub(r"[_?'*\-]", '', text)
    cat = re.sub('[A-Z0-9]+', '', rep)
    comb = re.sub('[a-z]+', '', rep)
    return cat, comb

In [16]:
def gene_and_variations_features(df):
    """Add engineered Gene/Variation feature columns to ``df`` and return it.

    For every row the ``Variation`` string is decomposed with
    ``extract_num_frm_text``, ``extract_pure_text_frm_text`` and
    ``extract_category_combination_from_text``. ``gene_features`` is then
    applied to the variation's combination string and to the row's own
    ``Gene`` (after removing the characters ``_ ? ' * -``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Gene`` and ``Variation`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, extended with the ``var_*`` and ``gene_*``
        columns listed in ``feature_columns`` below.
    """
    # Column order matches: (number, pure_text, cat, comb), then the 9-tuple
    # of gene_features(comb), then the 9-tuple of gene_features(gene).
    feature_columns = [
        'var_number', 'var_pure_text', 'var_category', 'var_combination',
        'var_comb_char1', 'var_comb_char1_len',
        'var_comb_char2', 'var_comb_char2_len',
        'var_comb_char3', 'var_comb_char3_len',
        'var_comb_num1', 'var_comb_num2', 'var_comb_num3',
        'gene_char1', 'gene_char1_len',
        'gene_char2', 'gene_char2_len',
        'gene_char3', 'gene_char3_len',
        'gene_num1', 'gene_num2', 'gene_num3',
    ]

    rows = []
    # Both values come from the frame being processed (never from a global
    # frame), and rows are walked positionally so the index labels of ``df``
    # do not matter.
    for variation, gene in zip(df['Variation'], df['Gene']):
        number, other = extract_num_frm_text(variation)
        pure_text, non_pure_text = extract_pure_text_frm_text(other)
        cat, comb = extract_category_combination_from_text(non_pure_text)
        gene = re.sub(r"[_?'*\-]", '', gene)
        rows.append((number, pure_text, cat, comb)
                    + tuple(gene_features(comb))
                    + tuple(gene_features(gene)))

    # Build all features once instead of growing ``df`` cell by cell.
    features = pd.DataFrame(rows, columns=feature_columns)
    # Numeric features are stored as float64.
    numeric_columns = features.select_dtypes(include='number').columns
    features[numeric_columns] = features[numeric_columns].astype('float64')
    for name in feature_columns:
        # .values assigns by position, bypassing index alignment.
        df[name] = features[name].values
    return df

In [17]:
# Remove the long free-text column from both frames (in place) and show the
# resulting shapes.
for frame in (train_set, test_set):
    frame.drop(columns='Text', inplace=True)
    print(frame.shape)

(3316, 4)
(367, 4)


In [19]:
# Engineer the Gene/Variation features for both splits; the function also
# extends the frame it is given, so *_mod and the input refer to the same data.
train_set_mod, test_set_mod = (gene_and_variations_features(frame)
                               for frame in (train_set, test_set))

  return _compile(pattern, flags).split(string, maxsplit)


In [20]:
# The raw strings are no longer needed once their features are extracted.
raw_columns = ['Gene', 'Variation', 'var_combination']
train_set_new = train_set_mod.drop(columns=raw_columns)
test_set_new = test_set_mod.drop(columns=raw_columns)

In [21]:
# Display the column labels of the engineered training frame.
train_set_new.columns

Index(['ID', 'Class', 'var_number', 'var_pure_text', 'var_category',
       'var_comb_char1', 'var_comb_char1_len', 'var_comb_char2',
       'var_comb_char2_len', 'var_comb_char3', 'var_comb_char3_len',
       'var_comb_num1', 'var_comb_num2', 'var_comb_num3', 'gene_char1',
       'gene_char1_len', 'gene_char2', 'gene_char2_len', 'gene_char3',
       'gene_char3_len', 'gene_num1', 'gene_num2', 'gene_num3'],
      dtype='object')

In [22]:
# String-valued engineered columns; these are one-hot encoded further down.
cat_columns = [
    'var_category',
    'var_comb_char1', 'var_comb_char2', 'var_comb_char3',
    'gene_char1', 'gene_char2', 'gene_char3',
]
print(train_set_new.shape)

(3316, 23)


In [23]:
def _one_hot_frame(values, categories, prefix):
    """One-hot encode ``values`` against a fixed list of ``categories``.

    Returns a float64 DataFrame that shares the index of ``values`` and has
    one column per entry of ``categories`` named ``prefix + '_' + category``.
    A value that is not in ``categories`` yields an all-zero row.
    """
    indicators = pd.get_dummies(pd.Categorical(values, categories=categories))
    indicators = indicators.astype('float64')
    indicators.columns = [prefix + '_' + category for category in categories]
    indicators.index = values.index
    return indicators


for col in cat_columns:
    # The category vocabulary is learned from the training split only and then
    # applied to the test split's own values, so both frames receive the same
    # indicator columns in the same order.
    categories = sorted(train_set_new[col].unique())
    matrix_df = _one_hot_frame(train_set_new[col], categories, col)
    test_matrix_df = _one_hot_frame(test_set_new[col], categories, col)
    train_set_new = train_set_new.merge(matrix_df, how='inner',left_index=True, right_index=True)
    test_set_new = test_set_new.merge(test_matrix_df, how='inner',left_index=True, right_index=True)
    print(col,len(categories))
    print(train_set_new.shape)
    print(test_set_new.shape)

var_category 13
(3316, 36)
(367, 36)
var_comb_char1 24
(3316, 60)
(367, 60)
var_comb_char2 25
(3316, 85)
(367, 85)
var_comb_char3 21
(3316, 106)
(367, 106)
gene_char1 23
(3316, 129)
(367, 129)
gene_char2 11
(3316, 140)
(367, 140)
gene_char3 3
(3316, 143)
(367, 143)


In [24]:
# Drop the string columns listed in cat_columns from both frames.
train_set_new, test_set_new = (frame.drop(columns=cat_columns)
                               for frame in (train_set_new, test_set_new))
print(train_set_new.shape)
print(test_set_new.shape)

(3316, 136)
(367, 136)


In [32]:
# Re-attach the free text by ID. Each split is joined to its OWN text table:
# the test rows must receive test_txt, not the training text.
train_set_new = pd.merge(train_set_new,train_txt,how='inner',on='ID')
test_set_new = pd.merge(test_set_new,test_txt,how='inner',on='ID')
# train_set_new.to_csv('mod_train.csv',index=False)
# test_set_new.to_csv('mod_test.csv',index=False)

In [34]:
# Show the shapes before and after removing the ID column from both frames.
print(train_set_new.shape)
print(test_set_new.shape)
for frame in (train_set_new, test_set_new):
    frame.drop(columns='ID', inplace=True)
print(train_set_new.shape)
print(test_set_new.shape)

(3316, 137)
(367, 137)
(3316, 136)
(367, 136)


In [35]:
import nltk
from nltk.corpus import stopwords
# The stop-word corpus is looked up by file id; NLTK names the list
# "english" (lower case), which also resolves on case-sensitive filesystems.
newstopwords=stopwords.words("english")
WNlemma = nltk.WordNetLemmatizer()

def pre_process(text):
    """Tokenize ``text``, lemmatize every token and drop stop words.

    Tokens come from ``nltk.word_tokenize``, are lemmatized with ``WNlemma``
    and are discarded when the lemma appears in ``newstopwords``. The
    surviving lemmas are returned joined by single spaces.
    """
    lemmas = (WNlemma.lemmatize(token) for token in nltk.word_tokenize(text))
    return " ".join(lemma for lemma in lemmas if lemma not in newstopwords)

In [36]:
# Normalise the article text of both splits with pre_process.
for frame in (train_set_new, test_set_new):
    frame['Text'] = frame['Text'].apply(pre_process)

In [54]:
# Apply the same normalisation to the plain-word part of the variation.
for frame in (train_set_new, test_set_new):
    frame['var_pure_text'] = frame['var_pure_text'].apply(pre_process)

In [81]:
# Every column except the target is a predictor.
cols_x = [name for name in train_set_new if name != 'Class']
len(cols_x)

135

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [203]:
# Hold out 20% of the rows for validation, stratified on the target class.
target = train_set_new.Class
X_train, X_valid, y_train, y_valid = train_test_split(
    train_set_new[cols_x], target,
    test_size=0.2, random_state=12, stratify=target,
)

In [204]:
# Predictors that are not the target or a text column.
non_tabular = ('Class', 'Text', 'var_pure_text')
cols_xx = [name for name in train_set_new if name not in non_tabular]
len(cols_xx)

133

In [215]:
# Text: bag-of-words counts -> TF-IDF weights -> 100-component truncated SVD,
# all fitted on the training rows only.
count_vect_text = CountVectorizer()
tfidf_transformer_text = TfidfTransformer()
text_svd = TruncatedSVD(n_components=100, n_iter=25, random_state=12)

X_train_counts = count_vect_text.fit_transform(X_train['Text'])
X_train_tfidf = tfidf_transformer_text.fit_transform(X_train_counts)
X_train_tfidf_trun = text_svd.fit_transform(X_train_tfidf)
for stage in (X_train_counts, X_train_tfidf, X_train_tfidf_trun):
    print(stage.shape)

(2652, 150158)
(2652, 150158)
(2652, 100)


In [216]:
# Push the validation text through the transformers fitted on the training rows.
X_valid_counts = count_vect_text.transform(X_valid['Text'])
X_valid_tfidf = tfidf_transformer_text.transform(X_valid_counts)
X_valid_tfidf_trun = text_svd.transform(X_valid_tfidf)
print(X_valid_tfidf.shape)
print(X_valid_tfidf_trun.shape)

(664, 150158)
(664, 100)


In [217]:
# var_pure_text
# var_pure_text: counts -> TF-IDF, fitted on the training rows.
count_vect_var = CountVectorizer()
tfidf_transformer_var = TfidfTransformer()

X_var_train_counts = count_vect_var.fit_transform(X_train['var_pure_text'])
X_var_train_tfidf = tfidf_transformer_var.fit_transform(X_var_train_counts)
for stage in (X_var_train_counts, X_var_train_tfidf):
    print(stage.shape)
# SVD reduction of this matrix is currently switched off:
# var_svd = TruncatedSVD(n_components=25, n_iter=25, random_state=12)
# X_var_train_tfidf_trun = var_svd.fit_transform(X_var_train_tfidf)
# print(X_var_train_tfidf_trun.shape)

(2652, 26)
(2652, 26)


In [218]:
# Validation rows reuse the var_pure_text transformers fitted on the training rows.
X_var_valid_tfidf = tfidf_transformer_var.transform(
    X_var_valid_counts := count_vect_var.transform(X_valid['var_pure_text'])
) if False else None
X_var_valid_counts = count_vect_var.transform(X_valid['var_pure_text'])
X_var_valid_tfidf = tfidf_transformer_var.transform(X_var_valid_counts)
print(X_var_valid_tfidf.shape)
# SVD reduction of this matrix is currently switched off:
# X_var_valid_tfidf_trun = var_svd.transform(X_var_valid_tfidf)
# print(X_var_valid_tfidf_trun.shape)

(664, 26)


In [256]:
# Final design matrices: tabular features side by side with the
# var_pure_text TF-IDF block. The SVD-reduced article text
# (X_train_tfidf_trun / X_valid_tfidf_trun) is currently left out.
X_train_new = hstack([X_train[cols_xx].values, X_var_train_tfidf])
X_valid_new = hstack([X_valid[cols_xx].values, X_var_valid_tfidf])

In [257]:
# Validation shape first, then training shape.
for design_matrix in (X_valid_new, X_train_new):
    print(design_matrix.shape)

(664, 159)
(2652, 159)


In [258]:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_new, y_train)
# y_pred = clf.predict(X_valid_new)
# print(metrics.confusion_matrix(y_valid, y_pred))
# print(metrics.classification_report(y_valid, y_pred))

In [259]:
# from sklearn import tree
# clf = tree.DecisionTreeClassifier().fit(X_train_new, y_train)
# y_pred = clf.predict(X_valid_new)
# print(metrics.confusion_matrix(y_valid, y_pred))
# print(metrics.classification_report(y_valid, y_pred))

In [260]:
# from sklearn import svm
# from sklearn.svm import SVC
# clf = svm.LinearSVC(C=1.0).fit(X_train_new, y_train)
# y_pred = clf.predict(X_valid_new)
# print(metrics.confusion_matrix(y_valid, y_pred))
# print(metrics.classification_report(y_valid, y_pred))

In [261]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline, scored on the validation split.
rf_params = dict(n_estimators=300, max_depth=50, max_features=100, random_state=0)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train_new, y_train)
y_pred = clf.predict(X_valid_new)
y_prob = clf.predict_proba(X_valid_new)
for score in (accuracy_score(y_valid, y_pred), log_loss(y_valid, y_prob)):
    print(score)

0.5993975903614458
1.870137088947707


In [225]:
# from sklearn.ensemble import GradientBoostingClassifier
# clf = GradientBoostingClassifier(n_estimators=300, max_depth=20, max_features=50, random_state=0)
# clf.fit(X_train_new, y_train)
# y_pred = clf.predict(X_valid_new)
# print(metrics.confusion_matrix(y_valid, y_pred))
# print(metrics.classification_report(y_valid, y_pred))

In [226]:
# Gradient-boosted trees (XGBoost), scored on the validation split.
xgb_params = dict(
    max_depth=8, min_child_weight=3, subsample=0.9, colsample_bytree=0.6,
    learning_rate=0.1, n_estimators=100, random_state=0,
)
clf = XGBClassifier(**xgb_params)
clf.fit(X_train_new, y_train)
y_pred = clf.predict(X_valid_new)
y_prob = clf.predict_proba(X_valid_new)
for score in (accuracy_score(y_valid, y_pred), log_loss(y_valid, y_prob)):
    print(score)

[[ 73   3   1  25   6   1   4   0   0]
 [  5  46   0   0   2   2  36   0   0]
 [  0   0   9   4   0   0   5   0   0]
 [ 20   1   2 101   5   1   7   0   0]
 [  6   2   2   1  25   3   9   0   0]
 [  3   1   0   6   2  34   9   0   0]
 [  2  14   2   5   0   1 167   0   0]
 [  0   0   0   1   0   0   3   0   0]
 [  0   0   0   1   0   0   3   0   3]]
             precision    recall  f1-score   support

          1       0.67      0.65      0.66       113
          2       0.69      0.51      0.58        91
          3       0.56      0.50      0.53        18
          4       0.70      0.74      0.72       137
          5       0.62      0.52      0.57        48
          6       0.81      0.62      0.70        55
          7       0.69      0.87      0.77       191
          8       0.00      0.00      0.00         4
          9       1.00      0.43      0.60         7

avg / total       0.69      0.69      0.68       664



  if diff:
  'precision', 'predicted', average, warn_for)


In [227]:
# Gradient-boosted trees (LightGBM), scored on the validation split.
lgbm_params = dict(
    max_depth=20, min_child_weight=3, subsample=0.9, colsample_bytree=0.6,
    learning_rate=0.1, n_estimators=200, random_state=0,
)
clf = LGBMClassifier(**lgbm_params)
clf.fit(X_train_new, y_train)
y_pred = clf.predict(X_valid_new)
y_prob = clf.predict_proba(X_valid_new)
for score in (accuracy_score(y_valid, y_pred), log_loss(y_valid, y_prob)):
    print(score)

[[ 72   3   1  26   6   2   3   0   0]
 [  5  47   0   0   2   2  35   0   0]
 [  1   0   9   4   0   0   4   0   0]
 [ 18   0   3 103   4   0   9   0   0]
 [  8   3   2   2  20   5   8   0   0]
 [  6   2   0   5   0  35   7   0   0]
 [  2  16   3   3   0   2 165   0   0]
 [  0   0   0   1   0   0   3   0   0]
 [  0   0   0   2   0   0   1   0   4]]
             precision    recall  f1-score   support

          1       0.64      0.64      0.64       113
          2       0.66      0.52      0.58        91
          3       0.50      0.50      0.50        18
          4       0.71      0.75      0.73       137
          5       0.62      0.42      0.50        48
          6       0.76      0.64      0.69        55
          7       0.70      0.86      0.77       191
          8       0.00      0.00      0.00         4
          9       1.00      0.57      0.73         7

avg / total       0.68      0.69      0.68       664



  if diff:
  'precision', 'predicted', average, warn_for)
