In [13]:
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pickle

In [62]:
# Minimum comment length; the filter cell compares it against `str.lengths()` of each comment.
comment_len_thresh = 150
# Number of shuffled rows kept as the labelling pool; the following rows become `remaining_df`.
sample_size = 150000
# Seed handed to the polars shuffle so the row order can be reproduced.
seed = 42

#### Load and sample random comments

##### To reuse the classifier on unseen comments: apply the same length filter, shuffle `random_df` with the same seed, and drop the first `sample_size` rows (those rows form the training pool)

In [63]:
# Load the random comment sample; the preview below shows the columns id, comments, subreddit, sub_id.
random_df = pl.read_csv('~/sports-language-in-politics/data/processed/random_sample.csv')

In [64]:
# Keep only comments whose length is at least `comment_len_thresh`.
# NOTE(review): `str.lengths()` is presumably a byte count (not characters) and has been
# renamed `str.len_bytes()` in newer polars releases - confirm against the installed version.
random_df = random_df.filter(pl.col("comments").str.lengths() >= comment_len_thresh)
# Shuffle every row with a fixed seed. This cell overwrites `random_df`, so running it a
# second time shuffles the already-shuffled frame again and yields a different row order;
# re-run the load cell first when reproducing the split.
random_df = random_df.sample(fraction=1.0, shuffle=True, seed=seed)

In [65]:
# Preview the first rows of the filtered, shuffled frame.
random_df.head()

id,comments,subreddit,sub_id
str,str,str,str
"""f1vbggn""","""eu t ligado qu…","""desabafos""","""t5_395lw"""
"""dnblwg9""","""oh awesome tha…","""Multicopter""","""t5_2u9hy"""
"""dyo9ww9""","""turns out you …","""todayilearned""","""t5_2qqjc"""
"""ch6em32""",""" gt id say lin…","""windows""","""t5_2qh3k"""
"""dvfqxtm""","""if we are to p…","""CryptoCurrency…","""t5_2wlj3"""


In [66]:
# Split the shuffled frame in one step: the first `sample_size` rows stay in
# `random_df` (labelling pool), the next `sample_size` rows become `remaining_df`.
# Both slices are taken from the same, not-yet-reassigned `random_df`.
remaining_df, random_df = (
    random_df[sample_size : 2 * sample_size],
    random_df[:sample_size],
)

In [67]:
# Keep only the first 10000 held-out rows for the classification step at the end of the notebook.
remaining_df = remaining_df[:10000]

#### Load partial sports and gaming subs

In [68]:
# Source tables that seed the sports/gaming subreddit list.
sports_df = pl.read_csv('~/sports-language-in-politics/data/processed/sports_sample.csv')
gaming_df = pl.read_csv('~/sports-language-in-politics/data/processed/gaming_subs.csv', truncate_ragged_lines=True)

# Gaming subreddit names: the text after the last '/r/' of every 'Name;Link' value,
# de-duplicated through a set (so the order is arbitrary).
gaming_subs = list({
    name_link.split('/r/')[-1]
    for name_link in gaming_df['Name;Link'].to_list()
})

# Combined list: unique subreddits of the sports sample followed by the gaming ones.
sports_subs = sports_df['subreddit'].unique().to_list() + gaming_subs

#### Get a better list of sports and gaming subs manually

##### First filter out the subs that are already known, then add the remaining sports/gaming subs by hand

In [69]:
# Hand-curated sports/gaming subreddit names that are added to `sports_subs`.
# Each name appears exactly once ('deadbydaylight' and 'rugbyunion' used to be listed twice).
# NOTE(review): 'Sexsells' and 'GamerGhazi' do not look like sports/gaming communities -
# confirm they are meant to be labelled as such.
manual_list = [
    'leagueoflegends', 'nba', 'soccer', 'nfl', 'DestinyTheGame', 'gaming',
    'DotA2', 'SquaredCircle', 'Overwatch', 'CFB', 'MMA', 'fantasyfootball',
    'NintendoSwitch', 'formula1', '2007scape', 'FortNiteBR', 'FireEmblemHeroes',
    'Competitiveoverwatch', 'Rainbow6', 'pokemontrades', 'reddevils',
    'GlobalOffensiveTrade', 'u_RedditNintendoSwitch', 'dndnext', 'darksouls3',
    'classicwow', 'PUBATTLEGROUNDS', 'Cricket', 'CollegeBasketball',
    'deadbydaylight', 'RocketLeague', 'Gunners', 'running',
    'ClashRoyale', 'LiverpoolFC', 'fantasybaseball', 'DBZDokkanBattle',
    'bravefrontier', 'pokemongo', 'bloodborne', 'forhonor', 'bicycling',
    'MaddenUltimateTeam', 'feedthebeast', 'gtaonline', 'golf', 'WorldOfWarships',
    'NASCAR', 'grandorder', 'bjj', 'sports', 'tennis', '10s', 'TennisClash',
    'apexlegends', 'Boxing', 'FantasyPL', 'CoDCompetitive', 'chess', 'motorsports',
    'Warhammer40k', 'OverwatchUniversity', 'NoMansSkyTheGame', 'chelseafc',
    'poker', 'SWGalaxyOfHeroes', 'Seaofthieves', 'RocketLeagueExchange',
    'rugbyunion', 'nrl', 'modernwarfare', 'BattlefieldV', '40kLore',
    'MonsterHunterWorld', 'h1z1', 'airsoft', 'csgobetting', 'FakeCollegeFootball',
    'ModernMagic', 'DynastyFF', 'Sexsells', 'AFL', 'FortniteCompetitive',
    'GamerGhazi', 'sportsbetting', 'sportsbook', 'baseball', 'SportsFR', 'broodwar',
    'G2eSports', 'hockey', 'sportsarefun', 'AllCombatSports', 'starcraft', 'aoe2',
    'indiansports', 'EASportsFC', 'NintendoSwitchSports', 'coys',
    'GlobalOffensive', 'esports', 'MirrorSports', 'EA_NHL', 'discgolf', 'EASPORTSWRC',
]

In [70]:
# Merge the hand-curated names into the combined list. Names that are already
# present are skipped, so re-running this cell no longer keeps growing
# `sports_subs` with repeated entries (membership tests are unaffected).
_known_subs = set(sports_subs)
for _sub in manual_list:
    if _sub not in _known_subs:
        _known_subs.add(_sub)
        sports_subs.append(_sub)

In [71]:
#dummy_df = random_df.filter(~pl.col("subreddit").is_in(sports_subs))
#dummy_df.to_pandas().groupby('subreddit').agg({"id":"count"}).sort_values("id",ascending=False)[500:520]

#### Build dataset

In [72]:
# Ideas for later:
# break names of subs into ngram
# separate sports, games, other
# one vs all
# bert

# Build a class-balanced dataset from the labelling pool:
#   label 1 = comment from a sports/gaming subreddit, label 0 = any other subreddit,
#   at most `class_max` comments per class, taken in the shuffled row order.
samples = []
labels = []

sports = 0
non_sports = 0
class_max = 30000

# Set membership is O(1); `sports_subs` is a list with many entries.
sports_sub_set = set(sports_subs)

# Read both columns once instead of slicing a one-row DataFrame for every row.
comments = random_df['comments'].to_list()
subreddits = random_df['subreddit'].to_list()

for comment, subreddit in zip(comments, subreddits):
    if subreddit in sports_sub_set:
        # A sports/gaming comment may only ever receive label 1. Once the sports
        # quota is full it is skipped; it must not fall through to the
        # non-sports branch, which would store it with label 0.
        if sports < class_max:
            sports += 1
            samples.append(comment)
            labels.append(1)
    elif non_sports < class_max:
        non_sports += 1
        samples.append(comment)
        labels.append(0)

    # Nothing more can be added once both quotas are full.
    if sports >= class_max and non_sports >= class_max:
        break

print(non_sports)
print(sports)

30000
30000


In [73]:
# Spot-check the text of one entry of the dataset.
samples[647]

'what exactly did they nerf stealth has always been a clusterfuck to me and ive never really used it to level because of that i saw a guy who used ivara to stealth kill everything with sleep arrow and wanted to try that but for some reason de made enemies that are asleep still able to trigger alerts if people die near them so she doesnt work anymore figures that happens just as i get all the nitain to build her i dont understand why theyre even touching stealth at all after they said they dont want akkad to be the loot cave of this game theyre just driving people back towards it really goes to show how out of touch de is with their own game '

In [74]:
# Label stored for the same entry (1 = sports/gaming subreddit, 0 = other).
labels[647]

1

In [76]:
# Hold out 20% of the labelled comments for evaluation.
# train_test_split shuffles by default; random_state pins that shuffle.
y_all = np.array(labels)
corpus_train, corpus_test, y_train, y_test = train_test_split(
    samples,
    y_all,
    random_state=7,
    test_size=0.2,
)

n_train, n_test = len(corpus_train), len(corpus_test)
print('train data size : {}'.format(n_train))
print('test data size : {}'.format(n_test))

train data size : 48000
test data size : 12000


In [77]:
# TF-IDF features: the vocabulary is learned on the training split only
# (min_df=10) and then applied unchanged to the test split.
vectorizer = TfidfVectorizer(min_df=10)
train_tfidf = vectorizer.fit_transform(corpus_train)
test_tfidf = vectorizer.transform(corpus_test)

# Densify both matrices: sparse also works but explanation slicing is not yet supported.
X_train = train_tfidf.toarray()
X_test = test_tfidf.toarray()

In [78]:
# (number of training comments, vocabulary size after the min_df cut)
X_train.shape

(48000, 13605)

#### Logistic regression

In [79]:
# L2-penalised logistic regression (C=0.1) fitted on the TF-IDF features,
# then scored on the held-out split.
model = LogisticRegression(C=0.1, penalty="l2")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.82      0.79      6001
           1       0.80      0.74      0.77      5999

    accuracy                           0.78     12000
   macro avg       0.78      0.78      0.78     12000
weighted avg       0.78      0.78      0.78     12000



In [None]:
# Earlier result, kept for reference (commented out so the cell is valid Python):
#               precision    recall  f1-score   support   # imbalanced data (50k)
#
#            0       0.80      0.99      0.88      7731
#            1       0.76      0.14      0.24      2269
#
#     accuracy                           0.80     10000
#    macro avg       0.78      0.56      0.56     10000
# weighted avg       0.79      0.80      0.74     10000

In [None]:
# Earlier result, kept for reference (commented out so the cell is valid Python):
#               precision    recall  f1-score   support   # balanced data (10k each)
#
#            0       0.72      0.80      0.76      1980
#            1       0.78      0.69      0.74      2020
#
#     accuracy                           0.75      4000
#    macro avg       0.75      0.75      0.75      4000
# weighted avg       0.75      0.75      0.75      4000

In [None]:
# Earlier result, kept for reference (commented out so the cell is valid Python):
#               precision    recall  f1-score   support   # balanced data (30k each)
#
#            0       0.76      0.82      0.79      6001
#            1       0.80      0.74      0.77      5999
#
#     accuracy                           0.78     12000
#    macro avg       0.78      0.78      0.78     12000
# weighted avg       0.78      0.78      0.78     12000

##### Save model

In [53]:
# Save the fitted logistic-regression model to disk.
# NOTE(review): only `model` is pickled here; the fitted `vectorizer` is not saved in any
# cell shown, so a fresh session has to rebuild the identical vectorizer before it can use
# this file. The path is absolute, unlike the '~/...' paths used for the CSV files.
with open('/users/ujan/sports-language-in-politics/models/random_classifier_model.pkl','wb') as f:
    pickle.dump(model, f)

In [54]:
# load
#with open('/users/ujan/sports-language-in-politics/models/random_classifier_model.pkl', 'rb') as f:
    #model = pickle.load(f)

#### NN

In [82]:
# Neural-network baseline on the same TF-IDF features; verbose=True prints the
# loss after every iteration. fit() returns the fitted estimator itself.
mlp = MLPClassifier(max_iter=100, random_state=7, verbose=True)
clf = mlp.fit(X_train, y_train)

Iteration 1, loss = 0.52595053
Iteration 2, loss = 0.37987010
Iteration 3, loss = 0.33656465
Iteration 4, loss = 0.30864739
Iteration 5, loss = 0.28590840
Iteration 6, loss = 0.26488467
Iteration 7, loss = 0.24502997
Iteration 8, loss = 0.22499003
Iteration 9, loss = 0.20436969
Iteration 10, loss = 0.18420047
Iteration 11, loss = 0.16405107
Iteration 12, loss = 0.14400380
Iteration 13, loss = 0.12495449
Iteration 14, loss = 0.10667348
Iteration 15, loss = 0.08990553
Iteration 16, loss = 0.07543925
Iteration 17, loss = 0.06216989
Iteration 18, loss = 0.05191968
Iteration 19, loss = 0.04335906
Iteration 20, loss = 0.03645985
Iteration 21, loss = 0.03141258
Iteration 22, loss = 0.02683554
Iteration 23, loss = 0.02336977
Iteration 24, loss = 0.02073880
Iteration 25, loss = 0.01924239
Iteration 26, loss = 0.01737673
Iteration 27, loss = 0.01613909
Iteration 28, loss = 0.01507886
Iteration 29, loss = 0.01432628
Iteration 30, loss = 0.01357006
Iteration 31, loss = 0.01290278
Iteration 32, los

In [23]:
# save
#with open('/users/ujan/sports-language-in-politics/models/random_mlpclassifier_model.pkl','wb') as f:
    #pickle.dump(clf, f)

In [18]:
# Load a previously pickled MLP classifier into `clf`.
# NOTE(review): when the notebook is run top to bottom this replaces the `clf` trained in
# the cell above, while the matching save cell is commented out - the file on disk may come
# from an older run with a different vocabulary; confirm which model is meant to be evaluated.
# Unpickling executes arbitrary code, so only load files from a trusted source.
with open('/users/ujan/sports-language-in-politics/models/random_mlpclassifier_model.pkl', 'rb') as f:
    clf = pickle.load(f)

In [83]:
# Score the MLP on the held-out split.
mlp_pred = clf.predict(X_test)
print(classification_report(y_test, mlp_pred))

              precision    recall  f1-score   support

           0       0.77      0.78      0.77      6001
           1       0.77      0.77      0.77      5999

    accuracy                           0.77     12000
   macro avg       0.77      0.77      0.77     12000
weighted avg       0.77      0.77      0.77     12000



In [None]:
# Earlier result, kept for reference (commented out so the cell is valid Python):
#               precision    recall  f1-score   support   # balanced data (30k each)
#
#            0       0.77      0.78      0.77      6001
#            1       0.77      0.77      0.77      5999
#
#     accuracy                           0.77     12000
#    macro avg       0.77      0.77      0.77     12000
# weighted avg       0.77      0.77      0.77     12000


#### Classify remaining sample

In [20]:
# Texts to classify: the comments of the held-out rows.
# The vectorizer and both models were fitted on comment text (`sample = comment`
# in the dataset cell), so the comment - not the subreddit name - has to be fed
# to them here; subreddit names are not what the TF-IDF vocabulary was built from.
# The column is read in one call instead of slicing a one-row DataFrame per row.
# Note: this reuses the name `samples` and therefore replaces the training corpus list.
samples = remaining_df['comments'].to_list()

In [21]:
# Vectorize the held-out texts with the vocabulary fitted on the training split.
# Note: this overwrites the `X_test` produced by the train/test split, so the
# classification-report cells above must not be re-run after this cell.
X_test = vectorizer.transform(samples).toarray()

In [22]:
# Predicted class per held-out row (1 = sports/gaming, 0 = other), using the MLP in `clf`.
clf.predict(X_test)

array([0, 0, 0, ..., 0, 1, 0])