In [None]:
!pip install transformers datasets evaluate accelerate

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.26.0-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.7/270.7 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8

In [None]:
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
import transformers
import datasets
import evaluate
import accelerate
import gdown

In [None]:
# Notebook-wide settings.
comment_len_thresh = 150   # minimum comment length kept by the length filter
sample_size = 150_000      # rows of the shuffled frame reserved for building the classifier dataset
seed = 42                  # seed used when shuffling random_df

#### Load and sample random comments

##### To use the trained classifier later, shuffle `random_df` with the same seed and drop the first `sample_size` rows (those rows are used to build the training data)

In [None]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the random comment sample from Drive into a polars DataFrame.
random_df = pl.read_csv('/content/drive/MyDrive/Datasets/css/random_sample.csv')

In [None]:
# Keep only comments of at least `comment_len_thresh` length, then shuffle every remaining
# row with a fixed seed so the row order can be reproduced later.
# NOTE(review): `str.lengths()` is deprecated in newer polars releases (replaced by
# `str.len_bytes()` / `str.len_chars()`) — confirm against the installed version.
random_df = (
    random_df
    .filter(pl.col("comments").str.lengths() >= comment_len_thresh)
    .sample(fraction=1.0, shuffle=True, seed=seed)
)

In [None]:
# Preview the first rows of the comment frame.
random_df.head()

id,comments,subreddit,sub_id
str,str,str,str
"""f1vbggn""","""eu t ligado qu…","""desabafos""","""t5_395lw"""
"""dnblwg9""","""oh awesome tha…","""Multicopter""","""t5_2u9hy"""
"""dyo9ww9""","""turns out you …","""todayilearned""","""t5_2qqjc"""
"""ch6em32""",""" gt id say lin…","""windows""","""t5_2qh3k"""
"""dvfqxtm""","""if we are to p…","""CryptoCurrency…","""t5_2wlj3"""


In [None]:
# The first `sample_size` rows stay in random_df (used to build the classifier dataset);
# the next `sample_size` rows, if that many exist, are held back in remaining_df.
remaining_df = random_df.slice(sample_size, sample_size)
random_df = random_df.head(sample_size)

In [None]:
# Keep only the first 10,000 held-back rows.
remaining_df = remaining_df.head(10_000)

#### Load partial sports and gaming subs

In [None]:
# Read the sports comment sample and the list of gaming subreddits from Drive.
sports_df = pl.read_csv('/content/drive/MyDrive/Datasets/css/sports_sample.csv')
gaming_df = pl.read_csv('/content/drive/MyDrive/Datasets/css/gaming_subs.csv') #, truncate_ragged_lines=True)

# Start the positive-class list with every distinct subreddit in the sports sample.
sports_subs = sports_df['subreddit'].unique().to_list()

# Gaming subreddit name = text after the last '/r/' in each 'Name;Link' value (the whole
# value when '/r/' is absent). The column is read once instead of slicing a one-row
# DataFrame per entry; the set comprehension removes duplicate names.
gaming_subs = list({entry.split('/r/')[-1] for entry in gaming_df['Name;Link'].to_list()})
sports_subs.extend(gaming_subs)

#### Get a better list of sports and gaming subs manually

##### First remove the already-known subs, then inspect the most frequent remaining ones

In [None]:
# Hand-curated subreddit names to treat as sports/gaming, in addition to the CSV-derived ones.
# Each name appears once ('deadbydaylight' and 'rugbyunion' were previously listed twice).
# NOTE(review): 'Sexsells' does not look like a sports or gaming community — confirm it
# belongs in the positive class before retraining.
manual_list = [
    'leagueoflegends', 'nba', 'soccer', 'nfl', 'DestinyTheGame', 'gaming',
    'DotA2', 'SquaredCircle', 'Overwatch', 'CFB', 'MMA', 'fantasyfootball',
    'NintendoSwitch', 'formula1', '2007scape', 'FortNiteBR', 'FireEmblemHeroes',
    'Competitiveoverwatch', 'Rainbow6', 'pokemontrades', 'reddevils',
    'GlobalOffensiveTrade', 'u_RedditNintendoSwitch', 'dndnext', 'darksouls3',
    'classicwow', 'PUBATTLEGROUNDS', 'Cricket', 'CollegeBasketball',
    'deadbydaylight', 'RocketLeague', 'Gunners', 'running',
    'ClashRoyale', 'LiverpoolFC', 'fantasybaseball', 'DBZDokkanBattle',
    'bravefrontier', 'pokemongo', 'bloodborne', 'forhonor', 'bicycling',
    'MaddenUltimateTeam', 'feedthebeast', 'gtaonline', 'golf', 'WorldOfWarships',
    'NASCAR', 'grandorder', 'bjj', 'sports', 'tennis', '10s', 'TennisClash',
    'apexlegends', 'Boxing', 'FantasyPL', 'CoDCompetitive', 'chess', 'motorsports',
    'Warhammer40k', 'OverwatchUniversity', 'NoMansSkyTheGame', 'chelseafc',
    'poker', 'SWGalaxyOfHeroes', 'Seaofthieves', 'RocketLeagueExchange',
    'rugbyunion', 'nrl', 'modernwarfare', 'BattlefieldV', '40kLore',
    'MonsterHunterWorld', 'h1z1', 'airsoft', 'csgobetting', 'FakeCollegeFootball',
    'ModernMagic', 'DynastyFF', 'Sexsells', 'AFL', 'FortniteCompetitive',
    'GamerGhazi', 'sportsbetting', 'sportsbook', 'baseball', 'SportsFR', 'broodwar',
    'G2eSports', 'hockey', 'sportsarefun', 'AllCombatSports', 'starcraft', 'aoe2',
    'indiansports', 'EASportsFC', 'NintendoSwitchSports', 'coys',
    'GlobalOffensive', 'esports', 'MirrorSports', 'EA_NHL', 'discgolf', 'EASPORTSWRC',
]

In [None]:
# Add the manual entries to sports_subs, skipping names that are already present.
# This keeps the cell idempotent: re-running it does not grow the list with duplicates.
known_subs = set(sports_subs)
for sub in manual_list:
    if sub not in known_subs:
        known_subs.add(sub)
        sports_subs.append(sub)

In [None]:
#dummy_df = random_df.filter(~pl.col("subreddit").is_in(sports_subs))
#dummy_df.to_pandas().groupby('subreddit').agg({"id":"count"}).sort_values("id",ascending=False)[500:520]

#### Build dataset

In [None]:
# Ideas noted by the author:
# break names of subs into ngram
# separate sports, games, other
# one vs all
# bert

# Build a class-balanced text/label dataset from the sampled comments:
# label 1 = comment from a subreddit listed in sports_subs, label 0 = any other subreddit.
data_dict = {'text':[], 'label':[]}

class_max = 30000                 # cap on the number of examples kept per class
sports_lookup = set(sports_subs)  # set membership instead of scanning the list for every comment
kept = {0: 0, 1: 0}               # examples kept so far, keyed by label

# Walk the two columns in parallel rather than slicing a one-row DataFrame per comment.
for comment, subreddit in zip(random_df['comments'].to_list(), random_df['subreddit'].to_list()):
    label = 1 if subreddit in sports_lookup else 0
    if kept[label] >= class_max:
        # A comment whose class is already full is skipped outright. In particular, a sports
        # comment must never fall through and be stored with label 0.
        if kept[0] >= class_max and kept[1] >= class_max:
            break  # both classes are full; nothing more can be added
        continue
    kept[label] += 1
    data_dict['text'].append(comment)
    data_dict['label'].append(label)

non_sports = kept[0]
sports = kept[1]

print(non_sports)
print(sports)

30000
30000


In [None]:
# Wrap the text/label lists in a Hugging Face Dataset (columns: 'text', 'label').
from datasets import Dataset
dataset = Dataset.from_dict(data_dict)

In [None]:
# Hold out 20% of the examples for evaluation. Passing the notebook-wide seed makes the
# split, and therefore the reported metrics, repeatable across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=seed)

In [None]:
# Load the tokenizer for the distilbert-base-uncased checkpoint (the same checkpoint name
# is used when the classification model is loaded).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch using the module-level ``tokenizer``.

    Args:
        examples: Mapping with a "text" entry holding the text(s) to tokenize.

    Returns:
        The tokenizer's output for those texts, with truncation enabled.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [None]:
# Apply preprocess_function to the dataset in batches to produce the tokenized version.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [None]:
# Padding is left to the collator at batch time; preprocess_function only truncates.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the three metric objects that compute_metrics calls.
import evaluate

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy, precision and recall.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class is taken as the
            argmax of ``predictions`` along axis 1 (assumes one row of per-class scores
            per example — confirm against the caller).

    Returns:
        Dict with keys 'accuracy', 'precision' and 'recall', each computed by the
        corresponding module-level metric object with its default settings.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)

    metrics_by_name = {'accuracy': accuracy, 'precision': precision, 'recall': recall}
    return {
        name: metric.compute(predictions=predicted_classes, references=labels)[name]
        for name, metric in metrics_by_name.items()
    }

In [None]:
# Class index -> readable name, and the inverse mapping derived from it.
id2label = {0: "non_sports", 1: "sports"}
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Load distilbert-base-uncased with a 2-label sequence-classification head; the label maps
# are passed to the model so class indices can be reported by name.
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: a fixed number of optimizer steps, with evaluation and
# checkpointing on the same 1000-step schedule.
training_args = TrainingArguments(
    output_dir="bert_classifier",  # checkpoints are written under this directory
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=5000,  # stop after 5000 steps rather than after a number of epochs
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps = 1000,
    # No metric_for_best_model is given, so the library default decides which checkpoint is "best".
    load_best_model_at_end=True,
    push_to_hub=False,  # keep everything local
)

# Trainer wiring: train on the "train" split, evaluate on the "test" split, and report
# the values returned by compute_metrics at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning; the cell output shows the metrics for each evaluation and the final TrainOutput.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall
1000,0.4245,0.39982,0.835,0.847049,0.816159
2000,0.3979,0.382718,0.839417,0.851344,0.82101
3000,0.3893,0.377776,0.845333,0.853759,0.832051
4000,0.3127,0.437291,0.844167,0.861875,0.818334
5000,0.3046,0.400548,0.84475,0.834716,0.858314


TrainOutput(global_step=5000, training_loss=0.36893296813964843, metrics={'train_runtime': 2613.1162, 'train_samples_per_second': 30.615, 'train_steps_per_second': 1.913, 'total_flos': 5661971228629248.0, 'train_loss': 0.36893296813964843, 'epoch': 1.67})

In [None]:
# Recursively archive the training output directory into bert_classifier.zip.
!zip -r bert_classifier.zip /content/bert_classifier

  adding: content/bert_classifier/ (stored 0%)
  adding: content/bert_classifier/checkpoint-1000/ (stored 0%)
  adding: content/bert_classifier/checkpoint-1000/rng_state.pth (deflated 25%)
  adding: content/bert_classifier/checkpoint-1000/config.json (deflated 49%)
  adding: content/bert_classifier/checkpoint-1000/model.safetensors (deflated 8%)
  adding: content/bert_classifier/checkpoint-1000/trainer_state.json (deflated 60%)
  adding: content/bert_classifier/checkpoint-1000/tokenizer.json (deflated 71%)
  adding: content/bert_classifier/checkpoint-1000/optimizer.pt (deflated 17%)
  adding: content/bert_classifier/checkpoint-1000/tokenizer_config.json (deflated 76%)
  adding: content/bert_classifier/checkpoint-1000/training_args.bin (deflated 50%)
  adding: content/bert_classifier/checkpoint-1000/special_tokens_map.json (deflated 42%)
  adding: content/bert_classifier/checkpoint-1000/scheduler.pt (deflated 55%)
  adding: content/bert_classifier/checkpoint-1000/vocab.txt (deflated 53%

In [None]:
# Copy the training output directory to the mounted Google Drive.
!cp -r /content/bert_classifier /content/drive/MyDrive/models/css/