In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
file=r'/content/drive/My Drive/Project/cleaned_data.csv'

In [3]:
import pandas as pd
import numpy as np
import ast
from keras.preprocessing.text import Tokenizer
import random
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from keras.layers import GRU, Embedding, Dropout, BatchNormalization, Dense, Input, concatenate, Reshape
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint, LambdaCallback, ReduceLROnPlateau
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report,f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
data=pd.read_csv(file)
data

Unnamed: 0,Title,Body,Tag
0,adding scripting functionality to .net applica...,i have a little game written in c#. it uses a ...,['c#']
1,should i use nested classes in this case?,i am working on a collection of classes used f...,['c++']
2,automatically update version number,i would like the version property of my applic...,['c#']
3,how do i connect to a database and loop over a...,what's the simplest way to connect and query a...,['c#']
4,"how to get the value of built, encoded viewstate?",i need to grab the base64-encoded representati...,['c#']
...,...,...,...
706331,"youtube iframe api: no ready call, no error call","i'm trying to detect the ""flash out of date"" e...",['javascript']
706332,how to execute multiline python code from a ba...,i need to extend a shell script (bash). as i a...,['python']
706333,url routing in php (mvc),i am building a custom mvc project and i have ...,['php']
706334,obfuscating code in android studio,under minifyenabled i changed from false to tr...,['android']


In [5]:
#Converting string value into list

data['Tag']=data['Tag'].apply(ast.literal_eval)
data

Unnamed: 0,Title,Body,Tag
0,adding scripting functionality to .net applica...,i have a little game written in c#. it uses a ...,[c#]
1,should i use nested classes in this case?,i am working on a collection of classes used f...,[c++]
2,automatically update version number,i would like the version property of my applic...,[c#]
3,how do i connect to a database and loop over a...,what's the simplest way to connect and query a...,[c#]
4,"how to get the value of built, encoded viewstate?",i need to grab the base64-encoded representati...,[c#]
...,...,...,...
706331,"youtube iframe api: no ready call, no error call","i'm trying to detect the ""flash out of date"" e...",[javascript]
706332,how to execute multiline python code from a ba...,i need to extend a shell script (bash). as i a...,[python]
706333,url routing in php (mvc),i am building a custom mvc project and i have ...,[php]
706334,obfuscating code in android studio,under minifyenabled i changed from false to tr...,[android]


In [6]:
# Perform multi-label binarization

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(data['Tag'])
labels = mlb.classes_
X = data.drop(['Tag'], axis=1)

In [7]:
# Split data into train and test data with stratification as far as possible

try:
  x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=21, stratify=Y)
except:
  x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=21)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((529752, 2), (529752, 10), (176584, 2), (176584, 10))

In [8]:
# Data augemntation function
def augment_data(text):
  return text + ' '.join(random.sample(text.split(), int(len(text.split()) * 0.2)))

# Apply data augmentation to 'Title' and 'Body' columns
x_train['Title'] = x_train['Title'].apply(augment_data)
x_train['Body'] = x_train['Body'].apply(augment_data)

In [9]:
# Defining a custom tokenizer class
class TextTokenizer:
  def __init__(self, input_texts, min_freq= 5):
    tokenizer= Tokenizer(char_level=False, split=' ')

    input_texts= input_texts.apply(lambda x: x.lower())
    tokenizer.fit_on_texts(input_texts)

    filtered_word_index= {word: index for word, index in tokenizer.word_index.items() if tokenizer.word_counts[word]>= min_freq}
    filtered_word_index['<UNK>']= len(filtered_word_index)+1

    self.word_index= filtered_word_index
    self.vocab_size= len(self.word_index) + 1
    # self.max_len= max(len(word.split()) for word in input_texts)
    self.max_len = max(input_texts.apply(lambda x: len(x.split())))

  def text_to_padded_sequence(self, sentence, max_len= 0):
    seq= []
    sentence= sentence.lower()
    for word in sentence.split():
      index= self.word_index.get(word, self.word_index['<UNK>'])
      seq.append(index)
    if max_len<= 0:
      max_len= self.max_len

    # Ensuring maximum length is maintained
    seq = seq[:max_len]
    seq.extend([0] * (max_len - len(seq)))

    return seq

In [10]:
title_tokenizer= TextTokenizer(x_train['Title'])
body_tokenizer= TextTokenizer(x_train['Body'])

In [11]:
title_tokenizer.vocab_size, title_tokenizer.max_len, body_tokenizer.vocab_size, body_tokenizer.max_len

(25060, 39, 406813, 12870)

In [12]:
# Define maximum sequence lengths for 'Title' and 'Body'

title_max_len = int(np.quantile([len(word_tokenize(sentence)) for sentence in x_train['Title']], 0.98))
body_max_len = int(np.quantile([len(word_tokenize(sentence)) for sentence in x_train['Body']], 0.92))

In [13]:
max_len = {'Title': title_max_len, 'Body': body_max_len}
max_len

{'Title': 23, 'Body': 816}

In [14]:
def data_generator(texts, batch_size, max_len, labels=None):
    num_samples = len(texts)
    while True:
      for offset in range(0, num_samples, batch_size):
        current_batch_size = min(batch_size, num_samples - offset)
        batch_texts = texts[offset:offset+current_batch_size]
        title, body= [], []
        for index, row in batch_texts.iterrows():
          padded_title= title_tokenizer.text_to_padded_sequence(row['Title'], max_len=max_len['Title'])
          padded_body= body_tokenizer.text_to_padded_sequence(row['Body'], max_len=max_len['Body'])
          title.append(padded_title)
          body.append(padded_body)
        title= np.array(title)
        body= np.array(body)
        if labels is None:
            yield {'title_input': title, 'body_input': body}
        else:
            batch_labels = labels[offset:offset+current_batch_size]
            yield {'title_input': title, 'body_input': body}, {'main_output': batch_labels}

In [15]:
# Define RNN model architecture
def RNN():
    title_input = Input(name='title_input', shape=[None])
    title_embedding = Embedding(title_tokenizer.vocab_size, 100, mask_zero=True)(title_input)
    title_out = GRU(200)(title_embedding)

    body_input = Input(name='body_input', shape=[None])
    body_embedding = Embedding(body_tokenizer.vocab_size, 200, mask_zero=True)(body_input)
    body_out = GRU(300)(body_embedding)

    combined = concatenate([title_out, body_out])
    combined_reshaped = Reshape((1, -1))(combined)
    com_out = GRU(300)(combined_reshaped)

    dense1 = Dense(200, activation='relu')(com_out)
    dp1 = Dropout(0.2)(dense1)
    bn = BatchNormalization()(dp1)
    dense2 = Dense(100, activation='relu')(bn)

    final_output = Dense(len(labels), activation='sigmoid', name='main_output')(dense2)

    model = Model(inputs=[title_input, body_input], outputs=final_output)
    return model

In [16]:
# Create model instance
model = RNN()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 title_input (InputLayer)    [(None, None)]               0         []                            
                                                                                                  
 body_input (InputLayer)     [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 100)            2506000   ['title_input[0][0]']         
                                                                                                  
 embedding_1 (Embedding)     (None, None, 200)            8136260   ['body_input[0][0]']          
                                                          0                                   

In [17]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# Defining callbacks

model_name = "classifier"
checkpoint = ModelCheckpoint(model_name,
                            monitor="val_loss",
                            mode="min",
                            save_best_only = True,
                            verbose=1,
                            save_format='keras')

earlystopping = EarlyStopping(monitor='val_loss',min_delta = 0.0001, patience = 3, verbose = 1, restore_best_weights=True)

learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',
                                            patience=3,
                                            verbose=1,
                                            factor=0.2,
                                            min_lr=0.00000001)

callbacks= [checkpoint, earlystopping, learning_rate_reduction]

In [19]:
#  Creating a validation set from the train data

try:
  x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=21, stratify=y_train)
except:
  x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=21)

In [20]:
# Define batch size
batch_size = 300

# Define generator instances for train and validation sets
train_generator = data_generator(x_train, batch_size, max_len, y_train)
validation_generator = data_generator(x_val, batch_size, max_len, y_val)

# Calculate steps per epoch and validation steps

steps_per_epoch = np.ceil(len(x_train) / batch_size)
validation_steps = np.ceil(len(x_val) / batch_size)

In [21]:
# Train the model using generator functions
history = model.fit(train_generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=5,
                    validation_data=validation_generator,
                    validation_steps=validation_steps,
                    callbacks=callbacks,
                    verbose=1)

Epoch 1/5
Epoch 1: val_loss improved from inf to 0.11156, saving model to classifier
Epoch 2/5
Epoch 2: val_loss improved from 0.11156 to 0.10779, saving model to classifier
Epoch 3/5
Epoch 3: val_loss did not improve from 0.10779
Epoch 4/5
Epoch 4: val_loss did not improve from 0.10779
Epoch 5/5
Epoch 5: val_loss did not improve from 0.10779
Restoring model weights from the end of the best epoch: 2.

Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 5: early stopping


In [22]:
test_generator = data_generator(x_test, batch_size, max_len)
test_steps = np.ceil(len(x_test) / batch_size)
# Predict on test data
y_pred = model.predict(test_generator, steps=test_steps)

# Convert predictions to binary labels
threshold = 0.5
y_pred_binary = (y_pred > threshold).astype(int)

len(y_pred_binary)



176584

In [23]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy: {accuracy:.4f}")

# Calculate F1 score
f1 = f1_score(y_test, y_pred_binary, average='macro')
print(f"F1 score: {f1:.4f}")

# Get classification report
report = classification_report(y_test, y_pred_binary, target_names=labels)
print("Classification Report:")
print(report)

Accuracy: 0.7236
F1 score: 0.8226
Classification Report:
              precision    recall  f1-score   support

     android       0.94      0.91      0.92     22624
          c#       0.81      0.85      0.83     25319
         c++       0.89      0.77      0.83     12007
        html       0.69      0.55      0.61     14688
         ios       0.95      0.88      0.91     11721
        java       0.92      0.76      0.83     28910
  javascript       0.87      0.64      0.74     31122
      jquery       0.69      0.81      0.75     19434
         php       0.92      0.84      0.88     24652
      python       0.93      0.91      0.92     16124

   micro avg       0.86      0.79      0.82    206601
   macro avg       0.86      0.79      0.82    206601
weighted avg       0.86      0.79      0.82    206601
 samples avg       0.85      0.83      0.82    206601



  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
actual_tags = mlb.inverse_transform(y_test)

actual_pred_tags = mlb.inverse_transform(y_pred_binary)

In [26]:
x= random.randint(0, len(y_test)-1)
print(f'Predicting for {x} row: actual tag= {actual_tags[x]} and precited tag= {actual_pred_tags[x]}')

Predicting for 16346 row: actual tag= ('java',) and precited tag= ('java',)
