# Capstone Project: Classifying clinically actionable genetic mutations

***

# Step 3: Modelling

## Importing of Libraries

In [1]:
# One-time environment setup (left commented out on purpose): run this once if the
# `imblearn` import in the next cell fails.
# pip install imblearn

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from sklearn.utils import resample

from imblearn.over_sampling import SMOTE

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

import nltk
from nltk.tokenize import RegexpTokenizer
import regex as re
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from collections import Counter
from wordcloud import WordCloud

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

  import pandas.util.testing as tm


## Data Import

In [3]:
# Load the pre-processed training and test sets. keep_default_na=False stops pandas
# from converting strings such as "NA" or empty fields into NaN.
train, test = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in ("../datasets/train_prep.csv", "../datasets/test_prep.csv")
)

In [4]:
# Sanity check: (rows, columns) of each loaded frame
train.shape, test.shape

((3321, 5), (5668, 4))

In [5]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,id,gene,variation,class,text
0,0,FAM58A,Truncating Mutations,1,cyclin dependent kinase cdks regulate variety ...
1,1,CBL,W802*,2,abstract background non small lung cancer nscl...
2,2,CBL,Q249E,2,abstract background non small lung cancer nscl...
3,3,CBL,N454D,3,recent evidence ha demonstrated acquired unipa...
4,4,CBL,L399V,4,oncogenic monomeric casitas b lineage lymphoma...


In [6]:
# Preview the first rows of the test frame
test.head()

Unnamed: 0,id,gene,variation,text
0,0,ACSL4,R570S,resulted myeloproliferative phenotype includin...
1,1,NAGLU,P521L,abstract large suppressor lat serine threonine...
2,2,PAH,L333F,vascular endothelial growth factor receptor ve...
3,3,ING1,A148D,inflammatory myofibroblastic imt neoplasm typi...
4,4,TMEM216,G77A,abstract retinoblastoma pediatric retinal init...


## Splitting of data into Predictor (X) and Target (y) Dataframes

In [7]:
# Target vector: the mutation class label.
y = train['class']
# Predictor frame: every remaining column, in its original order.
X = train.drop(columns=['class'])

In [8]:
# Confirm predictors and target have the same number of rows
X.shape, y.shape

((3321, 4), (3321,))

## Creation of (Inner) Training and Validation Datasets

From our single training data set (X and y) we will create two separate datasets:
- (Inner) Training Dataset: this will be used to train our models (this will take 75% of the original training dataset)
- Validation Dataset: this will be used to validate our trained models (e.g. to check for overfitting); it will take the remaining 25% of the original training dataset

To create our datasets, we use train_test_split with the stratify option to ensure a consistent mix of values for the target feature within the created datasets.

In [9]:
# Stratified 75/25 hold-out split; test_size=0.25 is train_test_split's default,
# spelled out here so it visibly matches the description above.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [10]:
# Shapes of the inner-training and validation splits
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((2490, 4), (2490,), (831, 4), (831,))

In [11]:
# Give both splits a clean 0..n-1 RangeIndex so they align row-for-row with the
# vectoriser output when the pieces are concatenated later.
# Rebinding (instead of inplace=True) keeps the cell idempotent on re-run and avoids
# mutating frames derived from slices; drop=True discards the old row labels rather
# than inserting them as a stray 'index' column.
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)

## Tokenisation using CountVectorizer

The CountVectorizer provides a simple way to tokenise a collection of text documents, build a vocabulary of known words and encode new documents using that vocabulary. CountVectorizer will transform the text of each document into features that we can pass into a model. It will create columns (also known as vectors), where each column counts how many times a given word is observed in each descriptive text string.

In [12]:
# Instantiate a CountVectorizer (bag-of-words token counts) that filters out
# scikit-learn's built-in English stop-word list
cvec = CountVectorizer(stop_words='english')

In [13]:
%%time
# Learn the vocabulary from the training text only and encode each document as a
# row of token counts (returned as a sparse matrix)
X_train_cvec = cvec.fit_transform(X_train['text'])

Wall time: 8.14 s


In [14]:
# (documents, vocabulary size) of the count matrix
X_train_cvec.shape

(2490, 74865)

In [15]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; prefer get_feature_names_out() and fall back only on older releases.
feature_names = (
    cvec.get_feature_names_out()
    if hasattr(cvec, "get_feature_names_out")
    else cvec.get_feature_names()
)

# Densify the sparse count matrix into a DataFrame with one column per vocabulary token.
# NOTE(review): .toarray() materialises the full documents x vocabulary matrix in memory.
X_train_cvec_df = pd.DataFrame(X_train_cvec.toarray(), columns=feature_names)

In [16]:
# The dense frame should have the same shape as the sparse count matrix
X_train_cvec_df.shape

(2490, 74865)

In [17]:
# One-hot encode the two categorical predictors, keeping every level (drop_first=False).
# NOTE(review): the dummy columns are derived from the training split only; the
# validation/test frames would need the same columns before scoring -- confirm this
# is handled where those sets are transformed.
X_train_dummy = pd.get_dummies(X_train[['gene', 'variation']], drop_first=False)

In [18]:
# (rows, number of one-hot columns) for gene + variation
X_train_dummy.shape

(2490, 2494)

In [19]:
# Assemble the final training design matrix: one-hot gene/variation columns first,
# followed by the token-count columns, joined side by side on the shared row index.
X_train = pd.concat(
    objs=[X_train_dummy, X_train_cvec_df],
    axis='columns',
)

In [20]:
# Column count should equal one-hot columns + vocabulary columns
X_train.shape

(2490, 77359)

## Handling of imbalanced classes

In [21]:
# Relative frequency of each class in the training target (proportions sum to 1)
y_train.value_counts(normalize=True)

7    0.287149
4    0.206426
1    0.171084
2    0.136145
6    0.082731
5    0.072691
3    0.026908
9    0.011245
8    0.005622
Name: class, dtype: float64

We note above that the **training set is highly imbalanced** -- i.e. classes 4 and 7 alone account for almost 50% of all samples in the training set.

To deal with this, we will need to oversample the minority classes rather than undersample the majority classes as the latter will remove valuable data for our modelling.

We oversample by creating synthetic samples using imblearn’s SMOTE or Synthetic Minority Oversampling Technique. SMOTE uses a nearest neighbors algorithm to generate new and synthetic data we can use for training our model. We generate new samples **only in the training set** to ensure our model generalises well to unseen data.

In [22]:
# Instantiate a SMOTE object to oversample minority classes;
# random_state is fixed so the synthetic samples are reproducible
sm = SMOTE(random_state=42) 

In [23]:
%%time
# Oversample the minority classes of the training split only.
# fit_resample() is the supported API; fit_sample() was a deprecated alias that has
# been removed from recent imbalanced-learn releases.
X_train, y_train = sm.fit_resample(X_train, y_train)

Wall time: 1min 57s


In [24]:
# Shapes after oversampling: the row count grows, the column count is unchanged
X_train.shape, y_train.shape

((6435, 77359), (6435,))

In [25]:
# Absolute class counts after oversampling
y_train.value_counts()

7    715
3    715
6    715
2    715
9    715
5    715
1    715
8    715
4    715
Name: class, dtype: int64

As shown above, the oversampling has been successful - there are now an equal number of data points for each 'class'.

## Creation of Baseline Model

### Logistic Regression

In [26]:
# Logistic regression estimator with default settings.
# NOTE(review): no later cell shown in this notebook references `logreg`; the
# randomised search constructs its own estimator instance.
logreg = LogisticRegression()

In [None]:
# Search space for a multinomial, L2-regularised logistic regression.
logreg_parameters = {
    'penalty': ['l2'],
    # Each solver must be its own list element: the expression `'saga' and 'lbfgs'`
    # evaluates to just 'lbfgs', which would silently drop 'saga' from the search.
    'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
    'multi_class': ['multinomial'],
    # Five inverse-regularisation strengths, log-spaced from 1e-5 to 1
    'C': np.logspace(-5, 0, 5)
}

# random_state fixes which candidates are sampled (and the shuffling done by the
# 'sag'/'saga' solvers) so that a Restart & Run All reproduces the same result.
# NOTE(review): X_train was oversampled before this search, so synthetic samples can
# fall on both sides of a CV split and best_score_ is likely optimistic -- consider
# moving SMOTE inside an imblearn Pipeline.
logreg_rs = RandomizedSearchCV(
    LogisticRegression(random_state=42),
    logreg_parameters,
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
logreg_rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
# Mean cross-validated score of the best candidate found by the randomised search
print(logreg_rs.best_score_)
# Hyper-parameter combination that achieved it (shown as the cell's output)
logreg_rs.best_params_

In [None]:
# Score the refitted best estimator on the training design matrix.
# The model was fitted on every column of X_train, so the whole frame must be passed;
# selecting X_train['text'] would hand over a single column (or raise a KeyError).
print("Score on based on training set: {}".format(logreg_rs.score(X_train, y_train)))

### Selection of Baseline Model

### Further exploration of Baseline Model

## Evaluation of Baseline Model based on Testing Dataset

## Creation of Alternative Model based on xxx

### Selection of Alternative Model

### Further exploration of Alternative Model

## Evaluation of Alternative Model based on Testing Dataset

## Conclusion