In [1]:


# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer



In [2]:
train_text_df = pd.read_csv('training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [3]:
train_text_df.head(2)

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...


In [4]:
train_text_df.shape

(3321, 2)

In [5]:
train_var_df = pd.read_csv('training_variants', engine='python', header=None, skiprows=1, names=["ID","Gene","Variation","Class"])

In [6]:
train_var_df.head(2)

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2


In [7]:
train_text_df.shape

(3321, 2)

In [8]:
train_df=train_text_df.merge( train_var_df, on='ID', how='inner') #taken inner as on join it includes the common rows between of because either of one is missing information will not be relevant

In [9]:
print train_df.columns

Index([u'ID', u'Text', u'Gene', u'Variation', u'Class'], dtype='object')


In [10]:
train_df.shape

(3321, 5)

In [11]:
test_text_df = pd.read_csv('test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [12]:
test_text_df.head(2)

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...


In [13]:
test_text_df.shape

(5668, 2)

In [14]:
test_var_df = pd.read_csv('test_variants', engine='python', header=None, skiprows=1, names=["ID","Gene","Variation"])

In [15]:
#clean  the training text data

In [16]:
#all_text_data = pd.concat([train_text_df, test_text_df]).reset_index(drop=True)

In [17]:
stops = set(stopwords.words("english"))

In [18]:
def removeExtraLetter(text):
    txt=str(text)
    txt=re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt=re.sub(r'\n',r' ',txt)
    return txt

In [None]:
# function to clean data


def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [19]:
# clean description
train_df['Text'] = train_df['Text'].map(lambda x: removeExtraLetter(x))

In [20]:
train_df['Text'].head(2)

0    Cyclindependent kinases CDKs regulate a variet...
1     Abstract Background  Nonsmall cell lung cance...
Name: Text, dtype: object

In [21]:
#change the data into lower case

In [22]:
def changeLowerCase(text):
    txt = " ".join([w.lower() for w in text.split()])
    return txt

In [23]:
# convert into lower case 
train_df['Text'] = train_df['Text'].map(lambda x: changeLowerCase(x))

In [25]:
train_df.head(2)

Unnamed: 0,ID,Text,Gene,Variation,Class
0,0,cyclindependent kinases cdks regulate a variet...,FAM58A,Truncating Mutations,1
1,1,abstract background nonsmall cell lung cancer ...,CBL,W802*,2


In [26]:
#remove stop words
def removeStopWords(text):
    txt = " ".join([w for w in text.split() if w not in stops])
    return txt
    

In [27]:
# remove stop words
train_df['Text'] = train_df['Text'].map(lambda x: removeStopWords(x))

In [28]:
train_df['Text'].head(2)

0    cyclindependent kinases cdks regulate variety ...
1    abstract background nonsmall cell lung cancer ...
Name: Text, dtype: object

In [29]:
def stemmingWords(text):
    st = PorterStemmer()
    txt = " ".join([st.stem(w) for w in text.split()])
    return txt

In [30]:
#stem the words
train_df['Text'] = train_df['Text'].map(lambda x: stemmingWords(x))


In [31]:
train_df['Text'].head(2)

0    cyclindepend kinas cdk regul varieti fundament...
1    abstract background nonsmal cell lung cancer n...
Name: Text, dtype: object

In [33]:
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,1), min_df = 150, max_features=500)

In [34]:
tfidfdata = tfidfvec.fit_transform(train_df['Text'])

In [35]:
# label encode categorical features in data given
cols = ['Gene','Variation']

for x in cols:
    lbl = LabelEncoder()
    train_df[x] = lbl.fit_transform(train_df[x])

In [36]:
tfidf_df = pd.DataFrame(tfidfdata.todense())

In [37]:
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

In [38]:
tfidf_df.head(2)

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col490,col491,col492,col493,col494,col495,col496,col497,col498,col499
0,0.048953,0.007516,0.01047,0.0033,0.013955,0.00692,0.02309,0.014054,0.003665,0.00358,...,0.00687,0.01709,0.005241,0.006589,0.00992,0.011413,0.010773,0.046509,0.0,0.021451
1,0.064533,0.03264,0.01732,0.02047,0.012986,0.008586,0.016371,0.013078,0.004547,0.008883,...,0.004262,0.008482,0.0,0.028614,0.004103,0.01416,0.017822,0.034624,0.012184,0.0


In [43]:
#merge the tfif_df with training data
col_name=['ID','Gene','Variation']
train_feat_tfidf = pd.concat([train_df[col_name], tfidf_df], axis=1)

In [44]:
train_feat_tfidf.head(2)

Unnamed: 0,ID,Gene,Variation,col0,col1,col2,col3,col4,col5,col6,...,col490,col491,col492,col493,col494,col495,col496,col497,col498,col499
0,0,85,2629,0.048953,0.007516,0.01047,0.0033,0.013955,0.00692,0.02309,...,0.00687,0.01709,0.005241,0.006589,0.00992,0.011413,0.010773,0.046509,0.0,0.021451
1,1,39,2856,0.064533,0.03264,0.01732,0.02047,0.012986,0.008586,0.016371,...,0.004262,0.008482,0.0,0.028614,0.004103,0.01416,0.017822,0.034624,0.012184,0.0


In [45]:
mod1 = GaussianNB()
target = train_df['Class']

In [55]:
print(cross_val_score(mod1, train_feat_tfidf, target, cv=20, scoring=make_scorer(accuracy_score)))



[ 0.25882353  0.31176471  0.23076923  0.31952663  0.27218935  0.18343195
  0.38690476  0.26190476  0.25149701  0.31927711  0.27108434  0.22289157
  0.33333333  0.44512195  0.5304878   0.39877301  0.41104294  0.35802469
  0.24691358  0.18012422]


In [48]:
import lightgbm as lgb

In [49]:
d_train = lgb.Dataset(train_feat_tfidf, label = target)

In [52]:
params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'learning_rate': 0.05, 
    'max_depth': 7, 
    'num_leaves': 21, 
    'feature_fraction': 0.3, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 5}

In [53]:
lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)

[20]	cv_agg's binary_error: 0 + 0
[40]	cv_agg's binary_error: 0 + 0
