In [2]:
import pandas as pd

# Languages whose per-language JSON-lines dumps are loaded and class-balanced.
langs= ['c','java','python','php','javascript']

# Fixed seed so the down-sampling below selects the same rows on every run.
SEED = 42

# Balanced per-language frames; they are concatenated once after the loop
# instead of re-copying an ever-growing frame on every iteration.
balanced_frames = []

for lang in langs:
    df = pd.read_json(f'../datasets/stackoverflow_all_data/{lang}.json', orient='records', lines=True)
    df_n = df[df['label'] == 0]
    df_p = df[df['label'] == 1]

    print(len(df_n))
    print(len(df_p))

    # Down-sample the larger class to the size of the smaller one.
    if df_n.shape[0] > df_p.shape[0]:
        df_n_downsampled = df_n.sample(df_p.shape[0], random_state=SEED)
        df_b = pd.concat([df_n_downsampled, df_p])
    else:
        df_p_downsampled = df_p.sample(df_n.shape[0], random_state=SEED)
        df_b = pd.concat([df_p_downsampled, df_n])
    print(len(df_b))
    balanced_frames.append(df_b)

# Single concatenation, last-loaded language first; an empty `langs` list
# yields an empty frame rather than an error from pd.concat.
df_all = pd.concat(balanced_frames[::-1], axis=0) if balanced_frames else pd.DataFrame()
print(len(df_all))

200000
115535
231070
200000
200000
400000
200000
119619
239238
200000
53925
107850
200000
48475
96950
200000
110446
220892
1296000


In [3]:
# Turn the pipe-delimited tag string into a space-separated one.
df_all['tags'] = df_all['tags'].apply(lambda tag_string: tag_string.replace('|', ' '))

In [4]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Clean data function to keep only valid ASCII
def clean_data(data):
    """Return a copy of ``data`` keeping only rows whose 'body' is pure ASCII.

    Args:
        data: DataFrame with a 'body' column of strings.

    Returns:
        A new DataFrame (independent copy) containing the rows of ``data``
        whose 'body' value can be encoded as ASCII.
    """
    def _encodes_as_ascii(s):
        # Encoding is attempted only for its side effect: failure means the
        # string contains at least one non-ASCII character.
        try:
            s.encode('ascii')
        except UnicodeEncodeError:
            return False
        else:
            return True

    ascii_rows = data['body'].apply(_encodes_as_ascii)
    return data[ascii_rows].copy()

# Function to preprocess text
def preprocess_text(text):
    """Remove escaped control sequences and normalise whitespace.

    Args:
        text: Input string.

    Returns:
        ``text`` with every two-character sequence backslash+n, backslash+r
        or backslash+t deleted, every run of whitespace collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    # Matches a literal backslash followed by n, r or t (the escaped form,
    # not the real control character). The character class replaces an
    # alternation whose trailing '|' added an empty branch that matched at
    # every position in the string.
    text = re.sub(r'\\[nrt]', '', text)
    # Real newlines/tabs are whitespace and are folded into single spaces here.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to extract code blocks
def extract_code_blocks(text):
    """Collect the contents of every ``<pre><code>...</code></pre>`` section.

    Args:
        text: HTML string to search.

    Returns:
        List with the inner text of each match, in order of appearance;
        empty when there is no match. Matching is non-greedy and spans
        newlines.
    """
    code_pattern = re.compile(r'<pre><code>(.*?)</code></pre>', re.DOTALL)
    return code_pattern.findall(text)

# Function to replace code blocks
def replace_code_blocks(text):
    """Remove every ``<pre><code>...</code></pre>`` section from ``text``.

    Args:
        text: HTML string.

    Returns:
        ``text`` with all such sections (including ones spanning several
        lines) deleted; the surrounding content is left untouched.
    """
    # `flags` must be given by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional re.DOTALL would cap the number of
    # replacements instead of enabling dot-matches-newline.
    desc = re.sub(r'<pre><code>(.*?)</code></pre>', '', text, flags=re.DOTALL)
    return desc

# Function to parse HTML
def html_parser(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    The markup is parsed with BeautifulSoup's 'html.parser' backend and only
    the text content is kept. Afterwards every occurrence of a single quote,
    followed by one or more backslashes, followed by another single quote is
    replaced with one single quote.
    """
    soup = BeautifulSoup(text, 'html.parser')
    plain_text = soup.get_text()
    return re.sub(r"'\\+\'", "'", plain_text)

# Function to remove rows with empty code blocks
def remove_rows_with_empty_code(row):
    """Tell whether ``row`` should be kept, based on its 'code' entries.

    Args:
        row: Mapping-like object whose 'code' value is an iterable of strings.

    Returns:
        False when every entry of ``row['code']`` is the empty string (which
        includes the case of no entries at all), True otherwise.
    """
    only_blank_blocks = all(item == '' for item in row['code'])
    return not only_blank_blocks

# Keep only rows whose body is pure ASCII (clean_data returns a fresh copy)
df_all = clean_data(df_all)

# Normalise the raw body, then derive the two text views from it:
#   'code' - list of the <pre><code> block contents, with HTML stripped
#   'desc' - the body with code blocks removed, with HTML stripped
df_all['body'] = df_all['body'].apply(preprocess_text)
df_all['code'] = df_all['body'].apply(extract_code_blocks)
df_all['desc'] = df_all['body'].apply(replace_code_blocks).apply(html_parser)
df_all['code'] = df_all['code'].apply(lambda blocks: [html_parser(block) for block in blocks])

# Drop rows that have no non-empty code block. The explicit copy makes the
# filtered frame independent of the original, so the column assignment below
# is not a write into a slice.
has_code = df_all.apply(remove_rows_with_empty_code, axis=1)
df_all = df_all[has_code].copy()

# Join code blocks into one string and drop the now-redundant 'body' column
df_all['code'] = df_all['code'].apply(' '.join)
df_all = df_all.drop('body', axis=1)

# Function to remove invalid unicode characters
def remove_invalid_unicode(text):
    """Drop characters of ``text`` that cannot be encoded as UTF-8.

    The string is encoded to UTF-8 with unencodable characters ignored and
    the resulting bytes are decoded back to a string.
    """
    utf8_bytes = text.encode('utf-8', errors='ignore')
    return utf8_bytes.decode('utf-8')

# Run the unicode clean-up over both derived text columns
for text_column in ('desc', 'code'):
    df_all[text_column] = df_all[text_column].apply(remove_invalid_unicode)

  text_html = BeautifulSoup(text, 'html.parser').get_text()
  text_html = BeautifulSoup(text, 'html.parser').get_text()


In [None]:
import os.path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report


import torch
from tqdm import tqdm
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import accuracy_score

train_batch_size = 16
valid_batch_size = 16

# One seed for every random step in this cell (down-sampling, shuffling and
# splitting) so the train/valid/test partitions are the same on every run.
SEED = 42

# Split by class so the two classes can be equalised
df_negative = df_all[df_all['label'] == 0]
df_positive = df_all[df_all['label'] == 1]

print(len(df_negative))
print(len(df_positive))

# Down-sample the larger class to the size of the smaller one
if df_negative.shape[0] > df_positive.shape[0]:
    df_negative_downsampled = df_negative.sample(df_positive.shape[0], random_state=SEED)
    df_balanced = pd.concat([df_negative_downsampled, df_positive])
else :
    df_positive_downsampled = df_positive.sample(df_negative.shape[0], random_state=SEED)
    df_balanced = pd.concat([df_positive_downsampled, df_negative])

print(len(df_balanced))

# Build one lower-cased model input from tags, title, description and code
df_balanced['text'] = df_balanced['tags'] +": "+  df_balanced["title"] + ' description: ' + df_balanced['desc'] + ' code snippet: ' + df_balanced['code']
df_balanced['text'] = df_balanced['text'].apply(lambda x: x.lower())

df_balanced = df_balanced.drop(['title','tags','code','desc'],axis =1)

# Shuffle the rows and renumber the index
df_balanced = df_balanced.sample(frac=1, random_state=SEED).reset_index(drop=True)

print(len(df_balanced))



# Split into 70% train and 30% held-out, then halve the held-out part into
# validation and test sets
train_df, valid_test_df = train_test_split(df_balanced, test_size=0.3, random_state=SEED)
valid_df, test_df = train_test_split(valid_test_df,test_size=0.5, random_state=SEED)

print(len(train_df))

# Initialize the tokenizer that ships with the CodeBERT checkpoint
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', do_lower_case=True)


# Tokenize the text data
def encode_data(texts, labels, max_length=512):
    """Tokenize ``texts`` and bundle them with ``labels`` into a TensorDataset.

    Every text is encoded with the module-level ``tokenizer``, explicitly
    truncated to at most ``max_length`` tokens and padded up to exactly
    ``max_length`` so that all rows can be stacked.

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of labels, one per text, convertible by
            ``torch.tensor``.
        max_length: Fixed token length of every encoded row.

    Returns:
        ``TensorDataset(input_ids, attention_masks, labels)`` where the first
        two tensors have one row per text.
    """
    input_ids = []
    attention_masks = []

    for text in texts:
        # `padding`/`truncation` replace the deprecated `pad_to_max_length`
        # flag and make the truncation explicit instead of relying on the
        # tokenizer's fallback when only `max_length` is given.
        encoded_dict = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Each entry is a (1, max_length) tensor; they are stacked below.
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    return TensorDataset(input_ids, attention_masks, labels)


train_dataset = encode_data(train_df['text'].tolist(), train_df['label'].tolist())
valid_dataset = encode_data(valid_df['text'].tolist(), valid_df['label'].tolist())

# Training batches are drawn in random order; validation keeps dataset order
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=train_batch_size)
valid_dataloader = DataLoader(valid_dataset, sampler=SequentialSampler(valid_dataset), batch_size=valid_batch_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Load the CodeBERT checkpoint with a two-class classification head
model = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Place the model on the same device the batches are moved to, so the cell
# also runs on hosts without a GPU.
model.to(device)

# Set up the optimizer and learning rate scheduler
# NOTE(review): AdamW here is the one imported from transformers; upstream
# recommends torch.optim.AdamW instead - consider switching.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 3
# One scheduler step is taken per training batch
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Function to calculate accuracy
def flat_accuracy(preds, labels):
    """Fraction of rows in ``preds`` whose arg-max equals the matching label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of class indices; it is flattened before comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    expected_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

# Training loop: each epoch runs one optimisation pass over the training set
# followed by one evaluation pass over the validation set.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    print('-' * 10)

    model.train()
    total_train_loss = 0
    total_train_accuracy = 0
    total_train_examples = 0


    for step, batch in enumerate(train_dataloader):
        # Batch layout follows the TensorDataset: ids, attention mask, labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits

        total_train_loss += loss.item()

        loss.backward()

        optimizer.step()
        scheduler.step()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch's accuracy by its size so a smaller final batch
        # does not count as much as a full one.
        batch_examples = len(label_ids)
        total_train_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        total_train_examples += batch_examples

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_accuracy / total_train_examples

    print(f"Training loss: {avg_train_loss}")
    print(f"Training accuracy: {avg_train_accuracy}")

    model.eval()
    total_valid_accuracy = 0
    total_valid_loss = 0
    total_valid_examples = 0

    for step, batch in enumerate(valid_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        total_valid_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Same size-weighted accumulation as in the training pass
        batch_examples = len(label_ids)
        total_valid_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        total_valid_examples += batch_examples

    avg_valid_loss = total_valid_loss / len(valid_dataloader)
    avg_valid_accuracy = total_valid_accuracy / total_valid_examples

    print(f"Validation loss: {avg_valid_loss}")
    print(f"Validation accuracy: {avg_valid_accuracy}")

# Save the trained model
model.save_pretrained('./model_saved')

# Reload from disk so the evaluation runs on exactly what was saved, and
# place it on the same device the batches are moved to.
model = RobertaForSequenceClassification.from_pretrained('./model_saved')
model.to(device)

# Evaluate on the test set
test_dataset = encode_data(test_df['text'].tolist(), test_df['label'].tolist())
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=valid_batch_size)

model.eval()
predictions, true_labels = [], []

for step, batch in enumerate(test_dataloader):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs.logits

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Collect per-example predictions; all metrics are computed afterwards
    # over the whole test set.
    predictions.extend(np.argmax(logits, axis=1))
    true_labels.extend(label_ids)

# Calculate precision, recall, and F1-score
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# For detailed classification report
print(classification_report(true_labels, predictions, target_names=['class_0', 'class_1']))

# Accuracy over all test examples (not a mean of per-batch accuracies, which
# would over-weight a smaller final batch).
avg_test_accuracy = accuracy_score(true_labels, predictions)

print(f"Test accuracy: {avg_test_accuracy}")

431421
428011
856022
856022
599215


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/1
----------
