In [22]:
import os
import pickle

import numpy as np
import polars as pl
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [2]:
# Tunable settings shared by the cells below.
comment_len_thresh = 150  # comments whose length is below this are filtered out
sample_size = 500_000     # number of shuffled rows kept to build the classifier dataset
seed = 42                 # seed passed to the shuffle of the random comments

#### Load and sample random comments

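##### To use the classifier later: apply the same comment-length filter, shuffle `random_df` with the same seed, and drop the first `sample_size` rows (those are the rows sampled here to build the classifier dataset)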

In [3]:
# Read the pre-extracted random comment sample into a polars DataFrame.
random_sample_path = '~/sports-language-in-politics/data/processed/random_sample.csv'
random_df = pl.read_csv(random_sample_path)

In [6]:
# Keep only comments of at least `comment_len_thresh` length, then shuffle every
# remaining row with a fixed seed so the ordering can be reproduced.
# NOTE(review): `str.lengths()` is believed to count bytes (not characters) and to be
# deprecated in newer polars releases in favour of `len_bytes` / `len_chars` —
# confirm which length is intended before upgrading.
# NOTE: this cell overwrites `random_df`; re-running it shuffles the already-shuffled
# frame again and yields a different order, so run it once after loading.
is_long_enough = pl.col("comments").str.lengths() >= comment_len_thresh
random_df = random_df.filter(is_long_enough).sample(
    fraction=1.0, shuffle=True, seed=seed
)

In [7]:
# Preview the first five rows of the filtered, shuffled frame.
random_df.head(5)

id,comments,subreddit,sub_id
str,str,str,str
"""f1vbggn""","""eu t ligado qu…","""desabafos""","""t5_395lw"""
"""dnblwg9""","""oh awesome tha…","""Multicopter""","""t5_2u9hy"""
"""dyo9ww9""","""turns out you …","""todayilearned""","""t5_2qqjc"""
"""ch6em32""",""" gt id say lin…","""windows""","""t5_2qh3k"""
"""dvfqxtm""","""if we are to p…","""CryptoCurrency…","""t5_2wlj3"""


In [8]:
# Keep only the first `sample_size` shuffled rows; these form the classifier dataset.
random_df = random_df.head(sample_size)

#### Load partial sports and gaming subs

In [9]:
# Read the sports comment sample and the gaming subreddit listing.
sports_df = pl.read_csv('~/sports-language-in-politics/data/processed/sports_sample.csv')
gaming_df = pl.read_csv(
    '~/sports-language-in-politics/data/processed/gaming_subs.csv',
    truncate_ragged_lines=True,
)

# Start from the distinct subreddit names present in the sports sample.
sports_subs = sports_df['subreddit'].unique().to_list()

# From every 'Name;Link' value keep the text after the last '/r/'
# (a value without '/r/' is kept whole), then drop repeated names.
gaming_subs = [
    name_link.split('/r/')[-1] for name_link in gaming_df['Name;Link'].to_list()
]
gaming_subs = list(set(gaming_subs))

# `sports_subs` now holds both the sports and the gaming subreddit names.
sports_subs.extend(gaming_subs)

#### Get a better list of sports and gaming subs manually

##### first remove already known subs

In [10]:
# Hand-picked subreddit names that are added to `sports_subs` on top of the names
# loaded from the CSV files. Every name appears exactly once (the repeated
# 'deadbydaylight' and 'rugbyunion' entries were removed; membership is unchanged).
manual_list = [
    'leagueoflegends', 'nba', 'soccer', 'nfl', 'DestinyTheGame', 'gaming',
    'DotA2', 'SquaredCircle', 'Overwatch', 'CFB', 'MMA', 'fantasyfootball',
    'NintendoSwitch', 'formula1', '2007scape', 'FortNiteBR', 'FireEmblemHeroes',
    'Competitiveoverwatch', 'Rainbow6', 'pokemontrades', 'reddevils',
    'GlobalOffensiveTrade', 'u_RedditNintendoSwitch', 'dndnext', 'darksouls3',
    'classicwow', 'PUBATTLEGROUNDS', 'Cricket', 'CollegeBasketball',
    'deadbydaylight', 'RocketLeague', 'Gunners', 'running',
    'ClashRoyale', 'LiverpoolFC', 'fantasybaseball', 'DBZDokkanBattle',
    'bravefrontier', 'pokemongo', 'bloodborne', 'forhonor', 'bicycling',
    'MaddenUltimateTeam', 'feedthebeast', 'gtaonline', 'golf', 'WorldOfWarships',
    'NASCAR', 'grandorder', 'bjj', 'sports', 'tennis', '10s', 'TennisClash',
    'apexlegends', 'Boxing', 'FantasyPL', 'CoDCompetitive', 'chess', 'motorsports',
    'Warhammer40k', 'OverwatchUniversity', 'NoMansSkyTheGame', 'chelseafc',
    'poker', 'SWGalaxyOfHeroes', 'Seaofthieves', 'RocketLeagueExchange',
    'rugbyunion', 'nrl', 'modernwarfare', 'BattlefieldV', '40kLore',
    'MonsterHunterWorld', 'h1z1', 'airsoft', 'csgobetting', 'FakeCollegeFootball',
    # NOTE(review): 'Sexsells' does not look like a sports or gaming subreddit —
    # confirm it belongs here; it is kept so the labels stay unchanged.
    'ModernMagic', 'DynastyFF', 'Sexsells', 'AFL', 'FortniteCompetitive',
    'GamerGhazi', 'sportsbetting', 'sportsbook', 'baseball', 'SportsFR', 'broodwar',
    'G2eSports', 'hockey', 'sportsarefun', 'AllCombatSports', 'starcraft', 'aoe2',
    'indiansports', 'EASportsFC', 'NintendoSwitchSports', 'coys',
    'GlobalOffensive', 'esports', 'MirrorSports', 'EA_NHL', 'discgolf', 'EASPORTSWRC',
]

In [11]:
# Append the hand-picked names in place.
# NOTE: re-running this cell appends the same names again; that only adds
# duplicates and does not change membership checks.
sports_subs += manual_list

In [12]:
#dummy_df = random_df.filter(~pl.col("subreddit").is_in(sports_subs))
#dummy_df.to_pandas().groupby('subreddit').agg({"id":"count"}).sort_values("id",ascending=False)[500:520]

#### Build dataset

In [13]:
# Build the classifier inputs (`samples`) and binary targets (`labels`).
#
# The input text is currently the subreddit name alone; the comment text is not
# used (an earlier variant was: sample = subreddit + ' ' + comment).
# A row is labelled 1 when its subreddit is in `sports_subs`, otherwise 0.
#
# NOTE(review): the label is derived from the same subreddit name that is used as
# the only feature, so the reported scores mostly measure how well the name list
# is memorised — confirm this is the intended setup.

# A set gives constant-time membership instead of scanning the list for every row.
sports_subs_lookup = set(sports_subs)

# Pull the whole column out once rather than slicing a one-row frame per row.
samples = random_df['subreddit'].to_list()
labels = [1 if subreddit in sports_subs_lookup else 0 for subreddit in samples]

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split
# reproducible.
split_parts = train_test_split(
    samples,
    np.array(labels),
    test_size=0.2,
    random_state=7,
)
corpus_train, corpus_test, y_train, y_test = split_parts

n_train, n_test = len(corpus_train), len(corpus_test)
print('train data size : {}'.format(n_train))
print('test data size : {}'.format(n_test))

train data size : 400000
test data size : 100000


In [17]:
# Fit the TF-IDF vocabulary on the training texts only (terms seen in fewer than
# 10 training documents are dropped), then encode both splits with it.
# The matrices are densified on purpose: sparse input also works, but explanation
# slicing is not yet supported for it.
vectorizer = TfidfVectorizer(min_df=10)
train_tfidf = vectorizer.fit_transform(corpus_train)
X_train = train_tfidf.toarray()
test_tfidf = vectorizer.transform(corpus_test)
X_test = test_tfidf.toarray()

#### Logistic regression

In [50]:
# L2-regularised logistic regression (C=0.1), evaluated on the held-out split.
model = LogisticRegression(C=0.1, penalty="l2").fit(X_train, y_train)
lr_test_pred = model.predict(X_test)
print(classification_report(y_test, lr_test_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     77556
           1       1.00      0.93      0.97     22444

    accuracy                           0.98    100000
   macro avg       0.99      0.97      0.98    100000
weighted avg       0.99      0.98      0.98    100000



##### Save model

In [53]:
# Save the fitted logistic-regression model.
# The path is resolved against the home directory, like the data paths used when
# loading, instead of being tied to one user's absolute directory.
# NOTE(review): the fitted `vectorizer` is not saved in the visible cells; the
# reloaded model needs the same TF-IDF vocabulary — confirm it is persisted or
# rebuilt elsewhere.
lr_model_path = os.path.expanduser(
    '~/sports-language-in-politics/models/random_classifier_model.pkl'
)
os.makedirs(os.path.dirname(lr_model_path), exist_ok=True)  # create models/ if missing
with open(lr_model_path, 'wb') as f:
    pickle.dump(model, f)

In [54]:
# load
#with open('/users/ujan/sports-language-in-politics/models/random_classifier_model.pkl', 'rb') as f:
    #model = pickle.load(f)

#### Neural network (MLPClassifier)

In [20]:
# Multi-layer perceptron with default architecture, capped at 10 training
# iterations; verbose=True prints the loss after each iteration.
clf = MLPClassifier(random_state=7, max_iter=10, verbose=True)
clf.fit(X_train, y_train)

Iteration 1, loss = 0.05689525
Iteration 2, loss = 0.01030785
Iteration 3, loss = 0.01012086
Iteration 4, loss = 0.01005476
Iteration 5, loss = 0.01003331
Iteration 6, loss = 0.00997878
Iteration 7, loss = 0.00995140
Iteration 8, loss = 0.00993362
Iteration 9, loss = 0.00992322
Iteration 10, loss = 0.00992337




In [21]:
# Evaluate the MLP on the held-out split.
mlp_test_pred = clf.predict(X_test)
print(classification_report(y_test, mlp_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     77556
           1       1.00      0.99      0.99     22444

    accuracy                           1.00    100000
   macro avg       1.00      0.99      1.00    100000
weighted avg       1.00      1.00      1.00    100000



In [23]:
# Save the fitted MLP classifier.
# The path is resolved against the home directory, like the data paths used when
# loading, instead of being tied to one user's absolute directory.
# NOTE(review): the fitted `vectorizer` is not saved in the visible cells; the
# reloaded classifier needs the same TF-IDF vocabulary — confirm it is persisted
# or rebuilt elsewhere.
mlp_model_path = os.path.expanduser(
    '~/sports-language-in-politics/models/random_mlpclassifier_model.pkl'
)
os.makedirs(os.path.dirname(mlp_model_path), exist_ok=True)  # create models/ if missing
with open(mlp_model_path, 'wb') as f:
    pickle.dump(clf, f)