In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

import gensim

import scikitplot.plotters as skplt

import nltk

from xgboost import XGBClassifier

import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam

from IPython.display import display, HTML

Using Theano backend.


# Load & test train text and classes

In [2]:
#train data
df_train_txt = pd.read_csv('../../input/training_text', sep='\|\|', header=None, skiprows=1, names=["ID","Text"])
df_train_var = pd.read_csv('../../input/training_variants')
#test data
df_test_txt = pd.read_csv('../../input/test_text', sep='\|\|', header=None, skiprows=1, names=["ID","Text"])
df_test_var = pd.read_csv('../../input/test_variants')


  
  """


# Visualizing data sample

In [3]:
df_train_txt.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [4]:
df_test_txt.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [5]:
df_train_var.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [6]:
df_test_var.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


### Details of each field

In [7]:
print("Train Variant".ljust(15), df_train_var.shape)
print("Train Text".ljust(15), df_train_txt.shape)
print("Test Variant".ljust(15), df_test_var.shape)
print("Test Text".ljust(15), df_test_txt.shape)

Train Variant   (3321, 4)
Train Text      (3321, 2)
Test Variant    (5668, 3)
Test Text       (5668, 2)


We have 3k rows of train data and 5k rows of unlabelled test data

### Knowing more about text field

In [8]:
df_train_txt["Text"].iloc[0]

"Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kinase activity revealed. Previous work has shown that CDK10 silencing increases ETS2 (v-ets erythroblastosis virus E26 oncogene homolog 2)-driven activation of the MAPK pathway, which confers tamoxifen resistance to breast cancer cells. The precise mechanisms by which CDK10 modulates ETS2 activity, and more generally the functions of CDK10, remain elusive. Here we demonstrate that CDK10 is a cyclin-dependent kinase by identifying cyclin M as an activating cyclin. Cyclin M, an orphan cyclin, is the product of FAM58A, whose mutations cause STAR syndrome, a human developmental anomaly whose features include toe syndactyly, telecanthus, and anogenital and renal malformations. We show that STAR syndrome-associated cyclin M mutants are unable to interact with CDK10. Cyclin M silencing phenocopies CDK1

In [9]:
print("For training data, there are a total of", len(df_train_var.ID.unique()), "IDs,", end='')
print(len(df_train_var.Gene.unique()), "unique genes,", end='')
print(len(df_train_var.Variation.unique()), "unique variations and ", end='')
print(len(df_train_var.Class.unique()),  "classes")
print(" ")
print("Out of 3221 IDs there are ",len(df_train_txt.Text.unique()), " different texts")

For training data, there are a total of 3321 IDs,264 unique genes,2996 unique variations and 9 classes
 
Out of 3221 IDs there are  1921  different texts


In [10]:
print("The maximum length of text field is " , max(df_train_txt["Text"].apply(lambda x: len(x.split()))), " and the minimum length is ", min(df_train_txt["Text"].apply(lambda x: len(x.split()))))

The maximum length of text field is  76708  and the minimum length is  1


# We now join the variants and text data using the common ID key

In [11]:
train_all = df_train_var.merge(df_train_txt, how="inner", left_on="ID", right_on="ID")
train_all.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [12]:
Train_txt_counts = df_train_txt["Text"].apply(lambda x: len(x.split()))

In [13]:
Train_txt_counts.describe()

count     3321.000000
mean      9542.505872
std       7845.251814
min          1.000000
25%       4733.000000
50%       6871.000000
75%      11996.000000
max      76708.000000
Name: Text, dtype: float64

### On an average the text length is <h2>9542.

In [None]:
Train_txt_counts.head()