In [None]:
!pip install transformers datasets evaluate accelerate

In [6]:
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
import transformers
import datasets
import evaluate
import accelerate
import gdown
from tqdm.auto import tqdm

In [7]:
comment_len_thresh = 150
sample_size = 150000
seed = 42

#### Load and sample random comments

##### To use classifier, shuffle random_df with same seed and remove first sample_size samples

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
random_df = pl.read_csv('/content/drive/MyDrive/Datasets/css/random_sample.csv')

In [9]:
random_df = random_df.filter(pl.col("comments").str.lengths() >= comment_len_thresh)
random_df = random_df.sample(fraction=1.0, shuffle=True, seed=seed)

In [10]:
random_df.head()

id,comments,subreddit,sub_id
str,str,str,str
"""f1vbggn""","""eu t ligado qu…","""desabafos""","""t5_395lw"""
"""dnblwg9""","""oh awesome tha…","""Multicopter""","""t5_2u9hy"""
"""dyo9ww9""","""turns out you …","""todayilearned""","""t5_2qqjc"""
"""ch6em32""",""" gt id say lin…","""windows""","""t5_2qh3k"""
"""dvfqxtm""","""if we are to p…","""CryptoCurrency…","""t5_2wlj3"""


In [11]:
remaining_df = random_df[sample_size:]
random_df = random_df[:sample_size]

In [12]:
remaining_df = remaining_df[:sample_size]

In [13]:
remaining_df.head()

id,comments,subreddit,sub_id
str,str,str,str
"""ex6poay""","""i had it disab…","""techsupport""","""t5_2qioo"""
"""dvd3op6""","""yeah i suppose…","""OnePiece""","""t5_2rfz5"""
"""dfcpqc6""","""hay varios col…","""podemos""","""t5_31h78"""
"""djmm5je""","""the answer to …","""Ultralight""","""t5_2s7p2"""
"""d3p04e9""","""sorry man didn…","""oculusnsfw""","""t5_2y44y"""


#### Load partial sports and gaming subs

In [14]:
sports_df = pl.read_csv('/content/drive/MyDrive/Datasets/css/sports_sample.csv')
gaming_df = pl.read_csv('/content/drive/MyDrive/Datasets/css/gaming_subs.csv') #, truncate_ragged_lines=True)

sports_subs = sports_df['subreddit'].unique().to_list()
gaming_subs = []
for i in range(len(gaming_df)):
    gaming_subs.append(gaming_df[i]['Name;Link'].to_list()[0].split('/r/')[-1])
gaming_subs = list(set(gaming_subs))
sports_subs.extend(gaming_subs)

#### Get a better list of sports and gaming subs manually

##### first remove already known subs

In [15]:
manual_list = [
    'leagueoflegends', 'nba', 'soccer', 'nfl', 'DestinyTheGame', 'gaming',
    'DotA2', 'SquaredCircle', 'Overwatch', 'CFB', 'MMA', 'fantasyfootball',
    'NintendoSwitch', 'formula1', '2007scape', 'FortNiteBR', 'FireEmblemHeroes',
    'Competitiveoverwatch', 'Rainbow6', 'pokemontrades', 'reddevils',
    'GlobalOffensiveTrade', 'u_RedditNintendoSwitch', 'dndnext', 'darksouls3',
    'classicwow', 'PUBATTLEGROUNDS', 'Cricket', 'CollegeBasketball',
    'deadbydaylight', 'RocketLeague', 'deadbydaylight', 'Gunners', 'running',
    'ClashRoyale', 'LiverpoolFC', 'fantasybaseball', 'DBZDokkanBattle',
    'bravefrontier', 'pokemongo', 'bloodborne', 'forhonor', 'bicycling',
    'MaddenUltimateTeam', 'feedthebeast', 'gtaonline', 'golf', 'WorldOfWarships',
    'NASCAR', 'grandorder', 'bjj', 'sports', 'tennis', '10s', 'TennisClash',
    'apexlegends', 'Boxing', 'FantasyPL', 'CoDCompetitive', 'chess', 'motorsports',
    'Warhammer40k', 'OverwatchUniversity', 'NoMansSkyTheGame', 'chelseafc',
    'poker', 'SWGalaxyOfHeroes', 'Seaofthieves', 'RocketLeagueExchange',
    'rugbyunion', 'nrl', 'modernwarfare', 'BattlefieldV', '40kLore',
    'MonsterHunterWorld', 'h1z1', 'airsoft', 'csgobetting', 'FakeCollegeFootball',
    'ModernMagic', 'DynastyFF', 'Sexsells', 'AFL', 'FortniteCompetitive',
    'GamerGhazi', 'sportsbetting', 'sportsbook', 'baseball', 'SportsFR', 'broodwar',
    'G2eSports', 'hockey', 'sportsarefun', 'AllCombatSports', 'starcraft', 'aoe2',
    'indiansports', 'EASportsFC', 'NintendoSwitchSports', 'rugbyunion', 'coys',
    'GlobalOffensive', 'esports', 'MirrorSports', 'EA_NHL','discgolf', 'EASPORTSWRC',
]

In [16]:
sports_subs.extend(manual_list)

In [None]:
#dummy_df = random_df.filter(~pl.col("subreddit").is_in(sports_subs))
#dummy_df.to_pandas().groupby('subreddit').agg({"id":"count"}).sort_values("id",ascending=False)[500:520]

#### Build dataset

In [None]:
# break names of subs into ngram
# separate sports, games, other
# one vs all
# bert

data_dict = {'text':[], 'label':[]}

sports = 0
non_sports = 0
class_max = 30000

for i in range(len(random_df)):
    comment = random_df[i]['comments'][0]
    subreddit = random_df[i]['subreddit'][0]
    sample = comment
    if subreddit in sports_subs and sports < class_max:
        label = 1
        sports += 1
        data_dict['text'].append(sample)
        data_dict['label'].append(label)
    #else:
    elif non_sports < class_max:
        label = 0
        non_sports += 1
        data_dict['text'].append(sample)
        data_dict['label'].append(label)

print(non_sports)
print(sports)

30000
30000


In [None]:
from datasets import Dataset
dataset = Dataset.from_dict(data_dict)

In [None]:
dataset = dataset.train_test_split(test_size=0.2)

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [None]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    prec = precision.compute(predictions=predictions, references=labels)["precision"]
    rec = recall.compute(predictions=predictions, references=labels)["recall"]

    return {'accuracy': acc, 'precision': prec, 'recall': rec}

In [None]:
id2label = {0: "non_sports", 1: "sports"}
label2id = {"non_sports": 0, "sports": 1}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="bert_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=5000,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps = 1000,
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall
1000,0.4245,0.39982,0.835,0.847049,0.816159
2000,0.3979,0.382718,0.839417,0.851344,0.82101
3000,0.3893,0.377776,0.845333,0.853759,0.832051
4000,0.3127,0.437291,0.844167,0.861875,0.818334
5000,0.3046,0.400548,0.84475,0.834716,0.858314


TrainOutput(global_step=5000, training_loss=0.36893296813964843, metrics={'train_runtime': 2613.1162, 'train_samples_per_second': 30.615, 'train_steps_per_second': 1.913, 'total_flos': 5661971228629248.0, 'train_loss': 0.36893296813964843, 'epoch': 1.67})

In [None]:
!zip -r bert_classifier.zip /content/bert_classifier
!cp -r /content/bert_classifier /content/drive/MyDrive/models/css/

### Classify remaining data with classifier

In [17]:
len(remaining_df)

150000

#### first filter out subs we already know to be sports

In [18]:
random_df_init = remaining_df.filter(~pl.col("subreddit").is_in(sports_subs))

In [19]:
len(random_df_init)

116267

In [20]:
random_df_init.head()

id,comments,subreddit,sub_id
str,str,str,str
"""ex6poay""","""i had it disab…","""techsupport""","""t5_2qioo"""
"""dvd3op6""","""yeah i suppose…","""OnePiece""","""t5_2rfz5"""
"""dfcpqc6""","""hay varios col…","""podemos""","""t5_31h78"""
"""djmm5je""","""the answer to …","""Ultralight""","""t5_2s7p2"""
"""d3p04e9""","""sorry man didn…","""oculusnsfw""","""t5_2y44y"""


#### build dataset from remaining df and classifiy

In [29]:
samples = random_df_init['comments'].to_list()
ids = random_df_init['id'].to_list()
subs = random_df_init['subreddit'].to_list()
subs[0]

'techsupport'

In [30]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/models/css/bert_classifier/checkpoint-3000")

In [31]:
from transformers import AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/models/css/bert_classifier/checkpoint-3000")

In [32]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [34]:
bar = tqdm(range(len(samples)))

sub_dict = {}

model.to(device)

for s in range(len(samples)):
    inputs = tokenizer(samples[s], return_tensors="pt", truncation=True).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    cls = model.config.id2label[predicted_class_id]

    sub = subs[s]
    if sub not in sub_dict:
        sub_dict[sub] = {'sport_count':0, 'non_sport_count':0}
    if cls == 'sports':
        sub_dict[sub]['sport_count'] += 1
    else:
        sub_dict[sub]['non_sport_count'] += 1

    bar.update(1)

  0%|          | 0/116267 [00:00<?, ?it/s]

In [37]:
len(sports_subs)

1271

In [41]:
new_sports_subs = []
for sub, counts in sub_dict.items():
    if counts['sport_count'] > counts['non_sport_count']:
        new_sports_subs.append(sub)

In [42]:
len(new_sports_subs)

1972

In [43]:
new_sports_subs

['volleyball',
 'slashdiablo',
 'ultimateskyrim',
 'FFRecordKeeper',
 'Markiplier',
 'Wizard101',
 'Shadowverse',
 'peloton',
 'summonerschool',
 'playark',
 'GrandTheftAutoV_PC',
 'CompetitiveForHonor',
 'martialarts',
 'SVExchange',
 'Sekiro',
 'TheWitness',
 'Barca',
 'stunfisk',
 'NHLHUT',
 'DuelLinks',
 'Imperator',
 'Pauper',
 'Battleborn',
 'OnePieceTC',
 'Zombidle',
 'hogwartswerewolvesB',
 'monsterhunterrage',
 'EverWing',
 'MobiusFF',
 'Redskins',
 'TheGaslightAnthem',
 'Rivenmains',
 'borussiadortmund',
 'skyrimmods',
 'DMAcademy',
 'TwoBestFriendsPlay',
 'CallOfDuty',
 'gwent',
 'ufc',
 'thelongdark',
 'CastleClash',
 'realmadrid',
 'PlayJustSurvive',
 'hockeyplayers',
 'Fireteams',
 'twitchplayspokemon',
 'vainglorygame',
 'NFL_Draft',
 'Vive',
 'GameUpscale',
 'PS4Deals',
 'cynicalbritofficial',
 'Yogscast',
 'destiny2',
 'ManyATrueNerd',
 'skyrimrequiem',
 'horseracing',
 'OrderOfHeroes',
 'horizon',
 'TapTitans2',
 'Amigurumi',
 'spikes',
 'OutreachHPG',
 'Stellaris',
 

In [46]:
final_sports_subs = new_sports_subs + sports_subs

In [47]:
len(final_sports_subs)

3243

In [48]:
len(remaining_df)

150000

In [49]:
random_df_final = remaining_df.filter(~pl.col("subreddit").is_in(final_sports_subs))

In [50]:
len(random_df_final)

106105

In [51]:
random_df_final.head()

id,comments,subreddit,sub_id
str,str,str,str
"""ex6poay""","""i had it disab…","""techsupport""","""t5_2qioo"""
"""dvd3op6""","""yeah i suppose…","""OnePiece""","""t5_2rfz5"""
"""dfcpqc6""","""hay varios col…","""podemos""","""t5_31h78"""
"""djmm5je""","""the answer to …","""Ultralight""","""t5_2s7p2"""
"""d3p04e9""","""sorry man didn…","""oculusnsfw""","""t5_2y44y"""


In [52]:
random_df_final.write_csv('random_sample_no_sports_v1.csv', separator=",")

In [53]:
!cp random_sample_no_sports_v1.csv /content/drive/MyDrive/Datasets/css/