In [13]:
from tpot import TPOTClassifier
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import copy
from tpot.config import classifier_config_dict_light
import fasttext.util
import string
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics import balanced_accuracy_score


In [4]:
df = pd.read_csv('NIC GP data.csv').iloc[:, 4:]
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [5]:
df

Unnamed: 0,question1,question2,bot?,number_likes,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,False,30634,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,False,64192,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,False,1396,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,False,24049,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,False,13076,0
...,...,...,...,...,...
1995,"I am visiting Sri Lanka soonfor 9 days, how ca...",Do Indians hate Sri Lankans?,False,24454,0
1996,What are some good examples of 4 stanza poems?,What are some good Ilocano poems?,False,2611,0
1997,Which CPU is better I3 4th Gen or 6th Gen?,Which is better intel i5 (6th gen) or i7 (5th ...,True,18483,0
1998,What are some of the best tourist places to vi...,Where are the foremost tourist places in Chhat...,True,28018,1


# Create several encoders
Since we have text attributes in our dataset, we need to transform them into floating point vectors.
Here I create three encoders:
* TF-IDF
* FastText
* BERT

In [6]:
class QuestionsToFloats(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.model = TfidfVectorizer()

    def fit(self, X, y=None):
        corpus = X.question1.tolist() + X.question2.tolist()
        corpus = list(map(lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)), corpus))
        self.model.fit(corpus)
        
        return self
    
    def transform(self, X):
        q1, q2 = X.question1, X.question2
        q1_vec, q2_vec = self.model.transform(q1), self.model.transform(q2)
        q1_vec, q2_vec = q1_vec.todense(), q2_vec.todense()

        return np.concatenate((q1_vec, q2_vec, 
                               X['bot?'].to_numpy()[..., None], 
                               X['number_likes'].to_numpy()[..., None]), 
                              axis=1)

In [7]:
fasttext.util.download_model('en')  # English
fasttext_model = fasttext.load_model('cc.en.300.bin')

File exists. Use --overwrite to download anyway.




In [8]:
class FastTextEncoding(TransformerMixin, BaseEstimator):      
    def fit(self, X, y=None):       
        return self
    
    def _helper(self, text):
        words = text.split()
        words = list(map(lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)), words))
        words = list(map(fasttext_model.get_word_vector, words))
        
        return np.mean(words, axis=0)
    
    def transform(self, X):
        q1, q2 = X.question1, X.question2
        q1_vec = q1.apply(self._helper)
        q1_vec = pd.DataFrame.from_dict(dict(zip(q1_vec.index, q1_vec.values)))
        q2_vec = q2.apply(self._helper)
        q2_vec = pd.DataFrame.from_dict(dict(zip(q2_vec.index, q2_vec.values)))

        return np.concatenate((q1_vec.T.to_numpy(), q2_vec.T.to_numpy(), 
                               X['bot?'].to_numpy()[..., None], 
                               X['number_likes'].to_numpy()[..., None]), 
                              axis=1)

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
model = model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# parts of code are taken from here: http://mccormickml.com/2019/07/22/BERT-fine-tuning/#32-required-formatting
# and here: https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#1-loading-pre-trained-bert

class BertEncoding(TransformerMixin, BaseEstimator):      
    def fit(self, X, y=None):       
        return self
    
    def _helper(self, text):
        sentences = text.split('.')
        ids = list(map(lambda sent: tokenizer.convert_tokens_to_ids(tokenizer.tokenize('[CLS]' + sent + '[SEP]')), 
                       sentences))
        
        ids = list(map(torch.tensor, ids))
        tokens_tensor = torch.concat(ids).unsqueeze(0).cuda()
        segments_tensors = torch.ones(len(tokens_tensor)).unsqueeze(0).cuda()
        
        with torch.no_grad():

            outputs = model(tokens_tensor, segments_tensors)

            hidden_states = outputs[2]
            
            token_vecs = hidden_states[-2][0]

            sentence_embedding = torch.mean(token_vecs, dim=0)
            
            return sentence_embedding.cpu()
    
    def transform(self, X):
        q1, q2 = X.question1, X.question2
        q1_vec = q1.apply(self._helper)
        q1_vec = pd.DataFrame.from_dict(dict(zip(q1_vec.index, q1_vec.values)))
        q2_vec = q2.apply(self._helper)
        q2_vec = pd.DataFrame.from_dict(dict(zip(q2_vec.index, q2_vec.values)))

        return np.concatenate((q1_vec.T.to_numpy(), q2_vec.T.to_numpy(), 
                               X['bot?'].to_numpy()[..., None], 
                               X['number_likes'].to_numpy()[..., None]), 
                              axis=1)

It is the easiest way to put all encoders into tpot. I have a single Encoder class with parameter 'encoder_type'. This parameter controls which specific encoder will be used for transformation of the dataset. The task of Tpot is to choose the best performing option for encoding.

In [11]:
class Encoder(TransformerMixin, BaseEstimator):
    def __init__(self, encoder_type):
        self.encoder_type = encoder_type
        self.model = None
        
    def fit(self, X, y=None):
        if self.encoder_type == 'qtf':
            self.model = QuestionsToFloats()
        elif self.encoder_type == 'fte':
            self.model = FastTextEncoding()
        elif self.encoder_type == 'be':
            self.model = BertEncoding()
            
        return self.model.fit(X, y)
    
    def transform(self, X):
        return self.model.transform(X)

Add encoders to the tpot config. Tpot is required to choose the best out of three.

In [15]:
# using TPOT config
config = copy.deepcopy(classifier_config_dict_light)
config["__main__.Encoder"] = {'encoder_type': ['qtf', 'fte', 'be']}

In [16]:
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2,
                                    scoring='balanced_accuracy',
                                    config_dict=config,
                                    periodic_checkpoint_folder='ckpt',
                                    template='Encoder-Selector-Transformer-Classifier')

In [18]:
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.6339092382386959

Generation 2 - Current best internal CV score: 0.6339092382386959

Generation 3 - Current best internal CV score: 0.6339092382386959

Generation 4 - Current best internal CV score: 0.6339092382386959

Generation 5 - Current best internal CV score: 0.6339092382386959

Best pipeline: GaussianNB(StandardScaler(VarianceThreshold(Encoder(input_matrix, encoder_type=be), threshold=0.0001)))


In [19]:
pipeline_optimizer.export('tpot_exported_pipeline.py')

In [14]:
# %load tpot_exported_pipeline.py
import numpy as np
import pandas as pd
from __main__ import Encoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tpot.export_utils import set_param_recursive

# # NOTE: Make sure that the outcome column is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# features = tpot_data.drop('target', axis=1)
# training_features, testing_features, training_target, testing_target = \
#             train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.6339092382386959
exported_pipeline = make_pipeline(
    Encoder(encoder_type="be"),
    VarianceThreshold(threshold=0.0001),
    StandardScaler(),
    GaussianNB()
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)
balanced_accuracy_score(y_test, results)

0.6373134328358209