In [None]:
! pip install accelerate==0.20.3 --quiet
! pip install transformers --quiet
! pip install datasets --quiet
! pip install torch --quiet
! pip install huggingface_hub --quiet

In [None]:
# -o overwrites already-extracted files without asking, so re-running this cell
# cannot stall on unzip's interactive "replace ...?" prompt; -q suppresses the
# per-file listing.
! unzip -o -q StackExchange_csv-20231203T042337Z-001

Archive:  StackExchange_csv-20231203T042337Z-001.zip
replace StackExchange_csv/ai.stackexchange.com/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# -o overwrites already-extracted files without asking, so re-running this cell
# cannot stall on unzip's interactive "replace ...?" prompt; -q suppresses the
# per-file listing.
! unzip -o -q StackExchange_csv-20231203T042337Z-002

Archive:  StackExchange_csv-20231203T042337Z-002.zip
replace StackExchange_csv/softwareengineering.stackexchange.com/Comments.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import transformers
import torch
import regex
from bs4 import BeautifulSoup

Data Processing

In [None]:
def extractPostContents(postHtml: str):
    """Return the text of every ``<p>`` element of a post body as one string.

    Parameters
    ----------
    postHtml : str
        Raw HTML of a post. The value is passed through ``str()`` first, so
        non-string cell values are parsed as their string representation.

    Returns
    -------
    str
        The paragraph texts joined by a single space, or ``""`` when the body
        has no ``<p>`` element or when parsing raises.

    Notes
    -----
    Paragraphs are joined with a space rather than an empty string: with an
    empty separator the last word of one paragraph is glued to the first word
    of the next ("...object.Based on..."), which corrupts tokenization.
    """
    try:
        postHtml = str(postHtml)
        soup = BeautifulSoup(postHtml, 'html.parser')
        paragraphs = [p.get_text() for p in soup.find_all('p')]
        # Joining an empty list already yields "", so no separate empty check.
        return " ".join(paragraphs)
    except Exception as e:
        # Best effort: report the offending body and keep processing the rest.
        print(f"error on term {postHtml}")
        print(e)
        return ""

In [None]:
def readData(dataPath:str):
    """Load ``Posts.csv`` from every site folder under ``dataPath``.

    A site folder is any sub-directory of ``dataPath`` whose name does not
    contain ``"meta"``. For each one, the posts are read, a ``content`` column
    is derived from ``Body`` with ``extractPostContents``, and a ``label``
    column is set to the part of the folder name before the first ``"."``.

    Parameters
    ----------
    dataPath : str
        Directory containing one sub-directory per site.

    Returns
    -------
    pandas.DataFrame
        All posts stacked row-wise with a fresh 0..n-1 index.

    Notes
    -----
    Folders are visited in sorted order. ``os.listdir`` returns entries in
    arbitrary order, and the order of first appearance of each label later
    decides its class id, so an unsorted listing makes the label mapping
    differ between machines.
    """
    siteFolders = sorted(
        folder
        for folder in os.listdir(dataPath)
        if os.path.isdir(os.path.join(dataPath, folder)) and "meta" not in folder
    )

    frames = []
    for folder in siteFolders:
        tag = folder.split(".")[0]
        posts = pd.read_csv(os.path.join(dataPath, folder, "Posts.csv"))
        posts["content"] = posts["Body"].map(extractPostContents)
        posts["label"] = tag
        print(f"read {posts.shape[0]} posts from {tag}")
        frames.append(posts)

    # ignore_index gives every row a unique label; without it each site's rows
    # restart at 0 and label-based lookups become ambiguous.
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
df_posts = readData("StackExchange_csv")

  soup = BeautifulSoup(postHtml, 'html.parser')


read 241652 posts from softwareengineering


  soup = BeautifulSoup(postHtml, 'html.parser')


read 43294 posts from history


  soup = BeautifulSoup(postHtml, 'html.parser')


read 64054 posts from movies


  soup = BeautifulSoup(postHtml, 'html.parser')


read 271077 posts from gaming


  soup = BeautifulSoup(postHtml, 'html.parser')


read 25296 posts from ai


  soup = BeautifulSoup(postHtml, 'html.parser')


read 91119 posts from music


In [None]:
df_posts.head()

Unnamed: 0,Id,OwnerUserId,LastEditorUserId,PostTypeId,AcceptedAnswerId,Score,ParentId,ViewCount,AnswerCount,CommentCount,...,ContentLicense,Body,FavoriteCount,CreationDate,CommunityOwnedDate,ClosedDate,LastEditDate,LastActivityDate,content,label
0,1,6.0,226.0,1,13.0,99,,64367.0,34.0,10,...,CC BY-SA 2.5,<p>A coworker of mine believes that <em>any</e...,,2010-09-01T19:34:48.000,2011-01-31T09:04:54.130,2012-11-27T20:11:51.580,2011-11-25T22:32:41.300,2012-11-27T19:29:27.740,A coworker of mine believes that any use of in...,softwareengineering
1,3,11.0,11.0,2,,29,1.0,,,17,...,CC BY-SA 2.5,"<p>Ideally, code should be so well coded that ...",,2010-09-01T19:36:50.053,2011-01-31T09:04:54.130,,2010-09-01T20:41:14.273,2010-09-01T20:41:14.273,"Ideally, code should be so well coded that it ...",softwareengineering
2,4,,,1,26.0,66,,9946.0,12.0,2,...,CC BY-SA 2.5,<p>When starting a project for a company that'...,,2010-09-01T19:37:39.957,2022-11-03T13:52:50.803,,2010-09-01T19:45:26.117,2013-03-20T19:59:57.770,When starting a project for a company that's n...,softwareengineering
3,7,21.0,,2,,10,1.0,,,0,...,CC BY-SA 2.5,"<p>I think the answer is the usual ""It depends...",,2010-09-01T19:42:16.797,2011-01-31T09:04:54.130,,,2010-09-01T19:42:16.797,"I think the answer is the usual ""It depends"" o...",softwareengineering
4,9,17.0,666.0,1,,39,,13902.0,24.0,1,...,CC BY-SA 2.5,"<p>Sometimes, the things I have to do for my j...",,2010-09-01T19:43:04.957,2011-01-04T21:22:43.957,2012-11-13T19:09:35.853,2010-09-11T09:35:35.957,2011-01-04T21:22:43.957,"Sometimes, the things I have to do for my job ...",softwareengineering


In [None]:
df_posts.shape

(736492, 24)

In [None]:
import datasets
from sklearn.preprocessing import OneHotEncoder


# Number of distinct site labels, and the labels themselves in order of first
# appearance; a label's position in all_labels is used as its class id.
num_labels = df_posts["label"].nunique()
all_labels = df_posts["label"].unique().tolist()

In [None]:
all_labels

['softwareengineering', 'history', 'movies', 'gaming', 'ai', 'music']

In [None]:
# Two-way lookup between class ids (positions in all_labels) and label names.
id2label = dict(enumerate(all_labels))
label2id = {name: position for position, name in enumerate(all_labels)}

Owing to limited GPU RAM, only a small random subset of the original data (about 10% of the posts) is used.

In [None]:
def reduceData(df:pd.DataFrame, dropRatio:float = 0.9, randomState = 42):
    """Return a random subset of ``df`` with roughly ``dropRatio`` of rows removed.

    Each row is dropped independently when a uniform draw in [0, 1) is below
    ``dropRatio``; the surviving rows keep their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to subsample; it is not modified.
    dropRatio : float
        Per-row probability of being dropped.
    randomState : int or None
        Seed for the private generator used for the draws.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows that were kept.

    Notes
    -----
    The draws come from a private ``RandomState`` rather than from
    ``np.random.seed``, so calling this function no longer resets the global
    NumPy generator. For a given integer seed the draws match those of the
    previously seeded global generator. Boolean-mask selection already returns
    a new frame, so the input is not deep-copied beforehand.
    """
    rng = np.random.RandomState(randomState)
    dropMask = rng.rand(df.shape[0]) < dropRatio
    return df[~dropMask]


def train_test_split(df:pd.DataFrame, test_size=0.2, random_state=42):
    """Randomly split ``df`` into train and test ``datasets.Dataset`` objects.

    Each row goes to the test split when a uniform draw in [0, 1) is below
    ``test_size``, otherwise to the train split. Only the ``content`` and
    ``label`` columns are kept. ``label`` is typed as a ``ClassLabel`` whose
    names come from the module-level ``all_labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``content`` and ``label`` columns.
    test_size : float
        Per-row probability of landing in the test split.
    random_state : int or None
        Seed for the private generator used for the draws.

    Returns
    -------
    tuple
        ``(ds_train, ds_test)``.

    Notes
    -----
    The draws come from a private ``RandomState`` so the global NumPy
    generator is not reseeded as a side effect. For a given integer seed the
    split matches the one produced by seeding the global generator.
    """
    rng = np.random.RandomState(random_state)
    testMask = rng.rand(len(df)) < test_size
    selectedCols = ["content", "label"]

    def _toDataset(frame, splitName):
        # Build one split; a fresh Features object is created per dataset.
        return datasets.Dataset.from_pandas(
            frame,
            split=splitName,
            preserve_index=False,
            features=datasets.Features(
                {
                    "content": datasets.Value("string"),
                    "label": datasets.ClassLabel(names=all_labels),
                }
            ),
        )

    ds_train = _toDataset(df[~testMask][selectedCols], "train")
    ds_test = _toDataset(df[testMask][selectedCols], "test")
    return ds_train, ds_test


df_posts_reduced = reduceData(df_posts)

In [None]:
ds_train, ds_test = train_test_split(df_posts_reduced)

In [None]:
ds_train

Dataset({
    features: ['content', 'label'],
    num_rows: 58881
})

In [None]:
np.unique(ds_test['label'])

array([0, 1, 2, 3, 4, 5])

In [None]:
ds_train.features

{'content': Value(dtype='string', id=None),
 'label': ClassLabel(names=['softwareengineering', 'history', 'movies', 'gaming', 'ai', 'music'], id=None)}

Tokenization

In [None]:
from transformers import AutoTokenizer

# Maximum number of tokens kept per post; longer posts are truncated.
tokenMaxLen = 256
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenizeBatch(batch):
    """Tokenize the ``content`` column of a batch, truncating to ``tokenMaxLen``.

    No padding is applied here. With ``padding=True`` inside a batched ``map``
    every example is padded to the longest post of its map batch and that
    padding is stored in the dataset. The Trainer is given the tokenizer and
    pads each training batch itself, so storing padding only wastes memory and
    compute.
    """
    return tokenizer(batch["content"], truncation=True, max_length=tokenMaxLen)


# The same tokenization is applied to both splits.
ds_train = ds_train.map(tokenizeBatch, batched=True)
ds_train.set_format(columns=["input_ids", "attention_mask", "label"])

ds_test = ds_test.map(tokenizeBatch, batched=True)
ds_test.set_format(columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/58881 [00:00<?, ? examples/s]

Map:   0%|          | 0/14623 [00:00<?, ? examples/s]

In [None]:
ds_test["label"][0]

0

In [None]:
def oneHotEncoding(dataset, labelNum):
    """Replace the integer class ids under ``dataset["label"]`` by one-hot rows.

    Each id becomes a list of ``labelNum`` floats that is 1.0 at the position
    equal to the id and 0.0 elsewhere; an id matching no position yields an
    all-zero row. ``dataset`` is updated in place and also returned.
    """
    encodedRows = []
    for classId in dataset["label"]:
        row = []
        for position in range(labelNum):
            row.append(1.0 if classId == position else 0.0)
        encodedRows.append(row)
    dataset["label"] = encodedRows
    return dataset


# One-hot encode the label column of both splits, batch by batch.
ds_train, ds_test = [
    split.map(lambda batch: oneHotEncoding(batch, num_labels), batched=True)
    for split in (ds_train, ds_test)
]

Map:   0%|          | 0/58881 [00:00<?, ? examples/s]

Map:   0%|          | 0/14623 [00:00<?, ? examples/s]

In [None]:
ds_train["label"][0]

[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]

Fine-tuning the BERT model for multiclass classification

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Learning rate handed to TrainingArguments in the training cell.
learningRate = 2e-5

# bert-base-cased with a classification head of num_labels outputs; the
# id <-> name maps are stored in the model config.
# NOTE(review): per the transformers docs "multi_label_classification" selects
# a per-class BCE-with-logits loss over float targets, which is why the labels
# are one-hot encoded. Each post has exactly one label, so
# "single_label_classification" with integer labels may fit better - confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments


# Per-device batch size for both training and evaluation.
batch_size = 16

# SECURITY: the Hub write token must never be committed with the notebook.
# It is read from the HF_TOKEN environment variable instead. The literal token
# that used to be embedded here is exposed and should be revoked.
# NOTE(review): when HF_TOKEN is unset, None is passed and the hub client is
# expected to fall back to credentials cached by `huggingface-cli login` -
# confirm for the installed version.
hubToken = os.environ.get("HF_TOKEN")

args = TrainingArguments(
    "BDAI",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=learningRate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=True,
    hub_model_id="Chaconne/BDAI",
    # hub_token is the current name of the deprecated push_to_hub_token.
    hub_token=hubToken,
)

trainer = Trainer(
    model,
    args,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    tokenizer=tokenizer,
)
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0487,0.044623
2,0.0235,0.039681


TrainOutput(global_step=7362, training_loss=0.04896970832843361, metrics={'train_runtime': 5832.0775, 'train_samples_per_second': 20.192, 'train_steps_per_second': 1.262, 'total_flos': 1.5492798445012992e+16, 'train_loss': 0.04896970832843361, 'epoch': 2.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.03968135267496109,
 'eval_runtime': 229.8726,
 'eval_samples_per_second': 63.614,
 'eval_steps_per_second': 3.976,
 'epoch': 2.0}

In [None]:
def predict(text, maxLength=None):
    """Predict the site label of a single post with the fine-tuned model.

    Uses the module-level ``tokenizer``, ``trainer`` and ``id2label``.

    Parameters
    ----------
    text : str
        Plain text of one post.
    maxLength : int, optional
        Maximum number of tokens fed to the model; defaults to the
        module-level ``tokenMaxLen`` used when tokenizing the training data.

    Returns
    -------
    list of str
        A one-element list holding the name of the highest-scoring label.

    Notes
    -----
    The input is truncated. Without truncation a long post produces more
    tokens than the model has position embeddings (512 in the published
    bert-base-cased config) and the forward pass fails. The argmax is taken
    directly on the logits: the sigmoid is monotonic, so it cannot change
    which class scores highest.
    """
    if maxLength is None:
        maxLength = tokenMaxLen

    encoding = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=maxLength
    )
    encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

    trainer.model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = trainer.model(**encoding).logits

    bestIdx = int(torch.argmax(logits.squeeze()).item())
    return [id2label[bestIdx]]


def oneHotDecode(encoded, labelNames):
    """Return the entry of ``labelNames`` at the argmax position of ``encoded``."""
    hotPosition = np.argmax(encoded)
    return labelNames[hotPosition]


def getSamplePredictions(dataset, index):
    """Collect text, true label and predicted labels for selected rows.

    Parameters
    ----------
    dataset
        Object supporting ``dataset["content"]`` and ``dataset["label"]``
        column access, with one-hot encoded labels.
    index : iterable of int
        Row positions to report.

    Returns
    -------
    list
        One ``[content, true_label_name, predicted_label_names]`` entry per
        requested position, in the order given.

    Notes
    -----
    The two columns are fetched once, up front. Indexing
    ``dataset["content"][i]`` inside the loop re-reads the whole column on
    every access, which makes a handful of lookups scale with the dataset
    size. Row access (``dataset[i]``) is deliberately not used: the dataset's
    output format is restricted to the model input columns, so ``content``
    may be absent from formatted rows - confirm before changing.
    """
    contents = dataset["content"]
    labels = dataset["label"]

    res = []
    for i in index:
        text = contents[i]
        res.append([text, oneHotDecode(labels[i], all_labels), predict(text)])
    return res

Example predictions

In [None]:
# Draw five random row positions from the test split and show, for each, the
# post text, its true label and the model's prediction.
sampleRows = np.random.choice(len(ds_test), 5)
sample = getSamplePredictions(ds_test, sampleRows)

sample

[['This reddit thread should tell you what you need to know:\nhttps://www.reddit.com/r/dragonquest/comments/65o3zo/how_to_unlock_the_dlc_for_dragon_quest_9_in_2017/What you need. A copy of Dragon Quest 9 https://www.amazon.com/Dragon-Quest-IX-Sentinels-Nintendo-DS/dp/B002I0EH6I A computer with USB ports and access to the Internet. The Dragon Quest 9 save editor http://www.woodus.com/den/games/dq9ds/save_editor.php A DS save dongle https://www.amazon.com/NDS-Adapter-Plus-Nintendo-Windows-Pc/dp/B00XVO0I36 http://www.nds-card.com/ProShow.asp?ProID=440Step 1 Download the Dragon Quest 9 save editor and buy a DS save dongle. Step 2 Plug your Dragon Quest 9 cart into your PC using your DS save dongle. Step 3 open the Dragon Quest 9 save editor and follow this guide https://www.gamefaqs.com/boards/937281-dragon-quest-ix-sentinels-of-the-starry-skies/71126194 Step 4 play the DLC :)',
  'gaming',
  ['gaming']],
 ["I am developing an API that has one call that accepts a big JSON object.Based on t