https://www.kaggle.com/imoore/60k-stack-overflow-questions-with-quality-rate

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn import metrics

In [2]:
# Read the training and validation splits from the working directory.
train, valid = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'valid.csv'))

In [3]:
train

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ
...,...,...,...,...,...,...
44995,60461435,Convert List<String> to string C# - asp.net - ...,<p>I am new to this and I am asking for help t...,<c#><asp.net><sql-server>,2020-02-29 02:22:18,LQ_CLOSE
44996,60461754,Does Python execute code from the top or botto...,<p>I am working on learning Python and was won...,<python>,2020-02-29 03:33:59,LQ_CLOSE
44997,60462001,how to change payment date in Azure?,<p>It looks like it costs 8 days per month in ...,<azure><billing>,2020-02-29 04:34:16,LQ_CLOSE
44998,60465318,how to implement fill in the blank in Swift,"<p>""I _____ any questions.""</p>\n\n<p>I want t...",<ios><swift>,2020-02-29 12:50:43,LQ_CLOSE


In [4]:
# Parse the CreationDate column into pandas datetimes and store it back in place.
creation_timestamps = pd.to_datetime(train['CreationDate'])
train['CreationDate'] = creation_timestamps

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Id            45000 non-null  int64         
 1   Title         45000 non-null  object        
 2   Body          45000 non-null  object        
 3   Tags          45000 non-null  object        
 4   CreationDate  45000 non-null  datetime64[ns]
 5   Y             45000 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 2.1+ MB


# Dataset creation for training

In [6]:
# Join title and body into a single free-text column and keep the label beside it.
train_concat = pd.DataFrame({
    "text": train['Title'] + ' ' + train['Body'],
    "Y": train['Y'],
})

In [9]:
train_concat

Unnamed: 0,text,Y
0,Java: Repeat Task Every Random Seconds <p>I'm ...,LQ_CLOSE
1,Why are Java Optionals immutable? <p>I'd like ...,HQ
2,Text Overlay Image with Darkened Opacity React...,HQ
3,Why ternary operator in swift is so picky? <p>...,HQ
4,hide/show fab with scale animation <p>I'm usin...,HQ
...,...,...
44995,Convert List<String> to string C# - asp.net - ...,LQ_CLOSE
44996,Does Python execute code from the top or botto...,LQ_CLOSE
44997,how to change payment date in Azure? <p>It loo...,LQ_CLOSE
44998,how to implement fill in the blank in Swift <p...,LQ_CLOSE


In [8]:
# Same layout as the training frame: title + body as one text column, plus the label.
valid_concat = pd.DataFrame({
    "text": valid['Title'] + ' ' + valid['Body'],
    "Y": valid['Y'],
})

# Preprocessing

In [10]:
import string
import nltk
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def removeURL(inputText):
    """Return ``inputText`` with every run of non-whitespace characters starting at ``http`` deleted."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", inputText)

In [12]:
def removeHTML(inputText):
    """Parse ``inputText`` with BeautifulSoup's "lxml" parser and return only its text content."""
    parsed = BeautifulSoup(inputText, "lxml")
    return parsed.get_text()

In [13]:
def removePunctuation(inputText):
    """Return ``inputText`` with every character listed in ``string.punctuation`` removed."""
    # Translation table mapping each punctuation code point to None (i.e. delete it).
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return inputText.translate(drop_table)

In [14]:
# Build the stop-word lookup once, under its own name. Rebinding the imported
# ``stopwords`` corpus reader to the word list would make this cell raise an
# AttributeError on any re-run. A frozenset also gives constant-time lookups.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def removeStopwords(inputTokens):
    """Return the tokens from ``inputTokens`` that are not English stop words.

    Args:
        inputTokens: iterable of token strings. Matching is exact, so tokens
            are compared as given (no case folding is done here).

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [token for token in inputTokens if token not in ENGLISH_STOPWORDS]

In [15]:
def porterStemming(inputTokens):
    """Stem each token with a ``PorterStemmer`` and return the stems as a list, in input order."""
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in inputTokens]

In [16]:
def lemmatisation(inputTokens):
    """Lemmatise each token with a ``WordNetLemmatizer`` and return the results as a list, in input order."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in inputTokens]

In [32]:
def preprocess(text, remove_punctuation=False, remove_stopwords=False,
               lemmatise=False, stem=False):
    """Normalise one raw question (title + HTML body) into a space-joined token string.

    The default pipeline is: lowercase -> strip HTML -> strip URLs -> tokenise.
    The remaining steps are opt-in and all default to off.

    Args:
        text: raw text, possibly containing HTML markup and URLs.
        remove_punctuation: if True, delete ``string.punctuation`` characters
            before tokenising.
        remove_stopwords: if True, drop English stop words after tokenising.
        lemmatise: if True, lemmatise each token.
        stem: if True, Porter-stem each token (applied after lemmatisation).

    Returns:
        The processed tokens joined with single spaces.
    """
    text = text.lower()
    # Strip markup BEFORE URLs. The URL pattern matches up to the next
    # whitespace, so on raw HTML such as <a href="http://x">label</a> it would
    # also swallow the closing quote, the tag and the link text, leaving broken
    # markup for the HTML parser.
    text = removeHTML(text)
    text = removeURL(text)
    if remove_punctuation:
        text = removePunctuation(text)
    tokens = word_tokenize(text)
    if remove_stopwords:
        tokens = removeStopwords(tokens)
    if lemmatise:
        tokens = lemmatisation(tokens)
    if stem:
        tokens = porterStemming(tokens)
    return " ".join(tokens)

In [36]:
# Quick check of preprocess() on a short string that mixes HTML tags, a URL and an HTML entity.
sample_text = "<b>Hey there!</b> log in to https://www.google.com/ .&amp;  Here it is, are you seeing this? Test corpora"

preprocess(sample_text)

'hey there ! log in to . & here it is , are you seeing this ? test corpora'

In [39]:
# Preprocess Train and validation data

In [40]:
# Apply preprocess() element-wise with Series.map. np.vectorize would first
# collect the results into a fixed-width unicode array whose item size is set by
# the longest document, which costs far more memory than a column of Python strings.
train_concat['text'] = train_concat['text'].map(preprocess)

In [41]:
train_concat

Unnamed: 0,text,Y
0,java : repeat task every random seconds i 'm a...,LQ_CLOSE
1,why are java optionals immutable ? i 'd like t...,HQ
2,text overlay image with darkened opacity react...,HQ
3,why ternary operator in swift is so picky ? th...,HQ
4,hide/show fab with scale animation i 'm using ...,HQ
...,...,...
44995,convert list to string c # - asp.net - sql ser...,LQ_CLOSE
44996,does python execute code from the top or botto...,LQ_CLOSE
44997,how to change payment date in azure ? it looks...,LQ_CLOSE
44998,how to implement fill in the blank in swift ``...,LQ_CLOSE


In [42]:
# Apply preprocess() element-wise with Series.map. np.vectorize would first
# collect the results into a fixed-width unicode array whose item size is set by
# the longest document, which costs far more memory than a column of Python strings.
valid_concat['text'] = valid_concat['text'].map(preprocess)

In [43]:
valid_concat

Unnamed: 0,text,Y
0,how to get all the child records from differen...,LQ_EDIT
1,retrieve all except some data of the another t...,LQ_EDIT
2,pandas : read_html i 'm trying to extract us s...,HQ
3,"reader always gim me null i 'm so new to c # ,...",LQ_EDIT
4,php rearrange array elements based on conditio...,LQ_EDIT
...,...,...
14995,how can i align two flex boxes to follow each ...,LQ_CLOSE
14996,c++ the correct way to multiply an integer and...,LQ_CLOSE
14997,why django is showing me this error when i try...,LQ_EDIT
14998,php - getting the content of php page i have a...,LQ_CLOSE


# Features and Labels

In [130]:
# Features are the preprocessed text; labels are the quality class strings.
X_train, y_train = train_concat['text'], train_concat['Y']

In [131]:
X_train

0        java : repeat task every random seconds i 'm a...
1        why are java optionals immutable ? i 'd like t...
2        text overlay image with darkened opacity react...
3        why ternary operator in swift is so picky ? th...
4        hide/show fab with scale animation i 'm using ...
                               ...                        
44995    convert list to string c # - asp.net - sql ser...
44996    does python execute code from the top or botto...
44997    how to change payment date in azure ? it looks...
44998    how to implement fill in the blank in swift ``...
44999    how can i make a c # application outside of vi...
Name: text, Length: 45000, dtype: object

In [132]:
y_train

0        LQ_CLOSE
1              HQ
2              HQ
3              HQ
4              HQ
           ...   
44995    LQ_CLOSE
44996    LQ_CLOSE
44997    LQ_CLOSE
44998    LQ_CLOSE
44999    LQ_CLOSE
Name: Y, Length: 45000, dtype: object

In [133]:
# Features are the preprocessed text; labels are the quality class strings.
X_valid, y_valid = valid_concat['text'], valid_concat['Y']

In [134]:
X_valid

0        how to get all the child records from differen...
1        retrieve all except some data of the another t...
2        pandas : read_html i 'm trying to extract us s...
3        reader always gim me null i 'm so new to c # ,...
4        php rearrange array elements based on conditio...
                               ...                        
14995    how can i align two flex boxes to follow each ...
14996    c++ the correct way to multiply an integer and...
14997    why django is showing me this error when i try...
14998    php - getting the content of php page i have a...
14999    why ca n't overloaded functions vary only by r...
Name: text, Length: 15000, dtype: object

# BERT

In [147]:
import tensorflow as tf
import transformers
from sklearn.metrics import roc_auc_score, confusion_matrix, plot_confusion_matrix, plot_precision_recall_curve,classification_report

In [None]:
# Pick a tf.distribute strategy to match whatever hardware is attached.

try:
    # Called without arguments, the resolver relies on the TPU_NAME environment
    # variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: use TensorFlow's default strategy, which works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS available: ", strategy.num_replicas_in_sync)

In [136]:
# Integer class ids for the three quality labels.
targets = {'HQ': 0, 'LQ_EDIT': 1, 'LQ_CLOSE': 2}

# Map from the untouched string columns rather than from y_train / y_valid
# themselves: mapping already-encoded integers through `targets` would turn
# every label into NaN, so the cell must be safe to re-run.
y_train = train_concat['Y'].map(targets)
y_valid = valid_concat['Y'].map(targets)

# Series.map yields NaN for any label missing from `targets`; fail fast instead
# of passing NaN on to the one-hot encoding step.
assert y_train.notna().all() and y_valid.notna().all(), "found a label that is not in `targets`"
y_train

0        2
1        0
2        0
3        0
4        0
        ..
44995    2
44996    2
44997    2
44998    2
44999    2
Name: Y, Length: 45000, dtype: int64

In [137]:
y_valid

0        1
1        1
2        0
3        1
4        1
        ..
14995    2
14996    2
14997    1
14998    2
14999    2
Name: Y, Length: 15000, dtype: int64

In [138]:
# Maximum sequence size for BERT is 512

def regular_encode(texts, tokenizer, maxlen = 512):
    """Tokenise a batch of texts into a fixed-width matrix of token ids.

    Args:
        texts: iterable of strings (list, pandas Series, generator, ...).
        tokenizer: object exposing ``batch_encode_plus`` with the Hugging Face
            tokenizer keyword arguments used below.
        maxlen: every sequence is truncated or padded to exactly this length.

    Returns:
        ``np.ndarray`` of dtype int32 holding the ``input_ids``, one row per text.
    """
    # Materialise to a plain list: some tokenizer implementations accept only a
    # list/tuple batch and reject other iterables such as a pandas Series.
    batch = list(texts)
    enc_di = tokenizer.batch_encode_plus(
        batch,
        truncation=True,
        return_token_type_ids=False,
        padding='max_length',
        max_length=maxlen,
    )
    # Pin the dtype: NumPy's default integer width is platform-dependent.
    return np.array(enc_di['input_ids'], dtype=np.int32)

In [124]:
# Load the pretrained tokenizer that belongs to the uncased BERT-large checkpoint.

BERT_CHECKPOINT = 'bert-large-uncased'
tokenizer = transformers.BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [125]:
X_train.sample(10)

6350     professional/ better way of header/body/footer...
15097    angular 2 - does subscribing to formcontrol 's...
38736    pass swift variables to javascript my problem ...
44585    awk to match pattern in html and use regex usi...
43989    intend not work in recycleview.onclicklistener...
918      how to proxy to backend server on certain path...
38829    difference between tsconfig.json and tsconfig....
19484    android app uml classes diagram it is useful ?...
7771     i am getting a fatal exception : main i am ver...
9070     one line to check if string or list , then con...
Name: text, dtype: object

In [139]:
# Tokenise the question texts into fixed-length id sequences, and turn the integer
# class ids into one-hot vectors with tf.keras.utils.to_categorical.

SEQ_LEN = 128
N_CLASSES = 3

Xtrain_encoded = regular_encode(X_train.astype('str'), tokenizer, maxlen = SEQ_LEN)
Xtest_encoded = regular_encode(X_valid.astype('str'), tokenizer, maxlen = SEQ_LEN)

ytrain_encoded = tf.keras.utils.to_categorical(y_train, num_classes = N_CLASSES, dtype = 'int32')
ytest_encoded = tf.keras.utils.to_categorical(y_valid, num_classes = N_CLASSES, dtype = 'int32')

In [140]:
def build_model(transformer, loss = 'categorical_crossentropy', max_len = 512, num_classes = 3):
    """Wrap a pretrained transformer in a single-label classification head and compile it.

    The head takes the hidden state at sequence position 0, applies dropout and
    projects it to ``num_classes`` softmax probabilities.

    Args:
        transformer: callable Keras-compatible transformer whose first output is
            indexed as (batch, position, hidden) below.
        loss: loss passed to ``model.compile``; the default expects one-hot targets.
        max_len: length of the token-id sequences fed to the model.
        num_classes: number of mutually exclusive target classes.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    input_word_ids = tf.keras.layers.Input(shape = (max_len,), dtype = tf.int32, name = "input_word_ids")
    # NOTE(review): only token ids are passed, no attention mask, so padding
    # positions are presumably attended to like real tokens. Confirm whether
    # that is intended.
    sequence_output = transformer(input_word_ids)[0]
    # Hidden state of the first token of every sequence.
    cls_token = sequence_output[:, 0, :]

    # Dropout for regularisation before the classification layer.
    x = tf.keras.layers.Dropout(0.40)(cls_token)

    # The classes are mutually exclusive and the loss is categorical
    # cross-entropy, so the outputs must form one probability distribution:
    # softmax, not independent per-class sigmoids.
    out = tf.keras.layers.Dense(num_classes, activation = 'softmax')(x)

    model = tf.keras.Model(inputs = input_word_ids, outputs = out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(tf.keras.optimizers.Adam(learning_rate = 3e-5), loss = loss, metrics = ['accuracy'])
    return model

In [142]:
# Build the model inside the distribution strategy's scope so that its variables
# are created under that strategy (required for the model to be placed on a TPU;
# with the default strategy the scope changes nothing).

with strategy.scope():
    transformer_layer = transformers.TFAutoModel.from_pretrained('bert-large-uncased')
    # Take the input length from the encoded data so the two cannot drift apart.
    model = build_model(transformer_layer, max_len = Xtrain_encoded.shape[1])
model.summary()

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'g

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 128)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 335141888 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1024)]            0         
_________________________________________________________________
dropout_73 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 3)                 3075      
Total params: 335,144,963
Trainable params: 335,144,963
Non-trainable params: 0
_________________________________________________________________


In [144]:
# Create the training and evaluation input pipelines.

BATCH_SIZE = 32
AUTO = tf.data.experimental.AUTOTUNE

# Shuffle BEFORE repeat and use a buffer covering the whole training set.
# Repeating first lets examples from different epochs mix, and a 2048-element
# buffer only shuffles locally within the file order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((Xtrain_encoded, ytrain_encoded))
    .shuffle(len(Xtrain_encoded))
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# The evaluation data is left unshuffled so predictions keep the row order of Xtest_encoded.
test_dataset = tf.data.Dataset.from_tensor_slices(Xtest_encoded).batch(BATCH_SIZE)

In [146]:
# Train for EPOCHS epochs. The training dataset repeats indefinitely, so
# steps_per_epoch defines an epoch as one pass over the training rows.

EPOCHS = 10
n_steps = Xtrain_encoded.shape[0] // BATCH_SIZE
train_history = model.fit(train_dataset, steps_per_epoch = n_steps, epochs = EPOCHS)

Epoch 1/10
   6/1406 [..............................] - ETA: 18:58:20 - loss: 1.1057 - accuracy: 0.4635

KeyboardInterrupt: 

In [None]:
# Run the model over the evaluation batches.

preds = model.predict(test_dataset, verbose = 1)

# Each row of preds holds one score per class; keep the index of the largest score.
pred_classes = preds.argmax(axis = 1)

In [None]:
# y_valid holds the integer-encoded validation labels, in the same row order as
# the unshuffled test_dataset. Arguments follow the (y_true, y_pred) convention.
print('Prediction Accuracy on Validation dataset: ', np.round(100*metrics.accuracy_score(y_valid, pred_classes), 2), '%')