## Import Libraries

In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, recall_score, precision_score, f1_score, classification_report, RocCurveDisplay, make_scorer

## Load Data

In [42]:
# Read the four Reddit post JSON dumps, in the same order as the target names
(
    df_steam_deck,
    df_steam_deck_2,
    df_steam_deck_flair,
    df_steam_deck_flair_2,
) = map(
    pd.read_json,
    (
        '../data/steam_deck_reddit_posts_1.json',
        '../data/steam_deck_reddit_posts_2.json',
        '../data/steam_deck_reddit_posts_flair_1.json',
        '../data/steam_deck_reddit_posts_flair_2.json',
    ),
)

In [43]:
# Function to extract label to the new column 'tag'
def extract_flair(x):
    """Return the flair text stored in one ``link_flair_richtext`` value.

    Parameters
    ----------
    x : list of dict (or a missing value)
        One cell of the ``link_flair_richtext`` column. The text is read from
        the ``'t'`` key of the only element (length 1) or of the second
        element (length 2).
        NOTE(review): presumably the two-element form is "emoji part, then
        text part" -- confirm against the scraped data.

    Returns
    -------
    str
        The flair text, or ``'N/A'`` when it cannot be determined: empty list,
        more than two parts, a missing value (None/NaN), or a selected part
        that has no ``'t'`` key.
    """
    # Missing values (None / float NaN) have no len(); treat them as "no flair"
    # instead of raising TypeError in the middle of a .map() call.
    if not isinstance(x, (list, tuple)):
        return 'N/A'

    if len(x) == 1:
        part = x[0]
    elif len(x) == 2:
        part = x[1]
    else:
        return 'N/A'

    # A part without a 't' key carries no text; fall back to the sentinel
    # rather than raising KeyError.
    if isinstance(part, dict):
        return part.get('t', 'N/A')
    return 'N/A'

In [44]:
# Extract flair to tag
flair_values = df_steam_deck['link_flair_richtext']
df_steam_deck['tag'] = flair_values.apply(extract_flair)
df_steam_deck['tag'].head()

0       Picture
1       Picture
2       Picture
3    Discussion
4         Video
Name: tag, dtype: object

In [45]:
# Same flair extraction for the second dump
flair_values_2 = df_steam_deck_2['link_flair_richtext']
df_steam_deck_2['tag'] = flair_values_2.apply(extract_flair)
df_steam_deck_2['tag'].head()

0    Meme / Shitpost
1         Hot Wasabi
2         Discussion
3            Picture
4            Picture
Name: tag, dtype: object

In [46]:
# Combine into 1 df
# NOTE(review): the two *_flair frames are expected to already carry a 'tag'
# column; no cell shown here creates it for them -- confirm.
kept_columns = ['title', 'selftext', 'tag']
source_frames = [
    df_steam_deck,
    df_steam_deck_2,
    df_steam_deck_flair,
    df_steam_deck_flair_2,
]

# Stack row-wise and renumber the rows 0..n-1
combined_df = pd.concat(
    [frame[kept_columns] for frame in source_frames],
    axis=0,
    ignore_index=True,
)

combined_df

Unnamed: 0,title,selftext,tag
0,Size comparison,My legion go just came in today and dang this ...,Picture
1,Traveling for work and the game wasn't on TV.,"I mean, it's a PC right.",Picture
2,"I know they're not the highest in performance,...",,Picture
3,Just a warning for winter: The Steam’s Deck ba...,It was probably 35° outside(so figure around 5...,Discussion
4,"If you're not playing COCOON on the Deck, you'...",,Video
...,...,...,...
12699,"Sure buddy, it's all ok",,Meta
12700,Good soldiers follow orders,,Meta
12701,Barely any of them are unpopular,,Meta
12702,OG Fantasy Writer,,Meta


## EDA & Preprocessing

In [47]:
# (rows, columns) of the combined frame
combined_df.shape

(12704, 3)

In [48]:
# Preview the first rows of the combined frame
combined_df.head()

Unnamed: 0,title,selftext,tag
0,Size comparison,My legion go just came in today and dang this ...,Picture
1,Traveling for work and the game wasn't on TV.,"I mean, it's a PC right.",Picture
2,"I know they're not the highest in performance,...",,Picture
3,Just a warning for winter: The Steam’s Deck ba...,It was probably 35° outside(so figure around 5...,Discussion
4,"If you're not playing COCOON on the Deck, you'...",,Video


In [49]:
# Count null values per column (empty strings are not nulls and are not counted here)
combined_df.isna().sum()

title       0
selftext    0
tag         0
dtype: int64

In [50]:
# Check remaining data (row/column count after the null check)
combined_df.shape

(12704, 3)

In [51]:
# Drop duplicate posts (rows identical in every column) and renumber the index
combined_df = (
    combined_df
    .drop_duplicates()
    .reset_index(drop=True)
)
combined_df.shape

(3657, 3)

In [52]:
# Preview the first rows again
combined_df.head()

Unnamed: 0,title,selftext,tag
0,Size comparison,My legion go just came in today and dang this ...,Picture
1,Traveling for work and the game wasn't on TV.,"I mean, it's a PC right.",Picture
2,"I know they're not the highest in performance,...",,Picture
3,Just a warning for winter: The Steam’s Deck ba...,It was probably 35° outside(so figure around 5...,Discussion
4,"If you're not playing COCOON on the Deck, you'...",,Video


In [53]:
# Check label distribution (number of rows per tag value)
combined_df['tag'].value_counts()

tag
Question           635
Tech Support       401
Picture            304
Video              290
Meme / Shitpost    260
News               259
Guide              253
MEGATHREAD         251
Configuration      249
Meta               243
Feature Request    233
Discussion         175
Hot Wasabi         104
Name: count, dtype: int64

In [54]:
# Check empty text: number of rows whose value is the empty string, per column
for column in ('tag', 'title', 'selftext'):
    is_empty = combined_df[column] == ''
    print(is_empty.sum())

0
0
1649


In [67]:
# Combined text from title, selftext (unused)
# Title first, then the body, joined by a single space
combined_df['all_text'] = combined_df['title'].str.cat(combined_df['selftext'], sep=' ')

In [68]:
# Preview the frame including the all_text column
combined_df.head()

Unnamed: 0,title,selftext,tag,all_text
0,Size comparison,My legion go just came in today and dang this ...,Picture,Size comparison My legion go just came in toda...
1,Traveling for work and the game wasn't on TV.,"I mean, it's a PC right.",Picture,Traveling for work and the game wasn't on TV. ...
2,"I know they're not the highest in performance,...",,Picture,"I know they're not the highest in performance,..."
3,Just a warning for winter: The Steam’s Deck ba...,It was probably 35° outside(so figure around 5...,Discussion,Just a warning for winter: The Steam’s Deck ba...
4,"If you're not playing COCOON on the Deck, you'...",,Video,"If you're not playing COCOON on the Deck, you'..."


In [55]:
# Drop records with empty selftext or title
# combined_df = combined_df.loc[(combined_df['title'] != '') & (combined_df['selftext'] != '')]

In [56]:
# After error analysis, try combining some tags together
# combined_df['tag'] = combined_df['tag'].map(lambda x: 'QA/Tech Support' if x in ['Question', 'Tech Support'] else x)
# combined_df['tag'].value_counts()

## Modeling: Pre-Evaluation

### Prepare X, y

In [57]:
from sklearn.preprocessing import LabelEncoder

# Features are the two free-text columns; the target is the flair tag
feature_columns = ['selftext', 'title']
target_column = 'tag'

X = combined_df[feature_columns]
y = combined_df[target_column]

In [58]:
# Preview the feature columns
X.head()

Unnamed: 0,selftext,title
0,My legion go just came in today and dang this ...,Size comparison
1,"I mean, it's a PC right.",Traveling for work and the game wasn't on TV.
2,,"I know they're not the highest in performance,..."
3,It was probably 35° outside(so figure around 5...,Just a warning for winter: The Steam’s Deck ba...
4,,"If you're not playing COCOON on the Deck, you'..."


In [59]:
# Baseline accuracy: share of each tag in y; always predicting the most
# frequent tag would score the largest proportion listed.
y.value_counts(normalize=True)

tag
Question           0.173640
Tech Support       0.109653
Picture            0.083128
Video              0.079300
Meme / Shitpost    0.071097
News               0.070823
Guide              0.069182
MEGATHREAD         0.068635
Configuration      0.068089
Meta               0.066448
Feature Request    0.063713
Discussion         0.047853
Hot Wasabi         0.028439
Name: proportion, dtype: float64

In [60]:
# Set custom stopwords for removing some of the words that are too specific
# Stop words are compared against individual tokens, so a multi-word tag such
# as 'Meme / Shitpost' lower-cased as one string would never match anything.
# Split every tag with the vectorizer's own default analyzer (lower-casing +
# word tokenization) so each word of each tag name is really filtered out.
tag_analyzer = CountVectorizer().build_analyzer()
tag_tokens = sorted({token for tag in combined_df['tag'].unique() for token in tag_analyzer(tag)})

custom_stop_words = ['steam', 'deck', 'steamdeck'] + tag_tokens # subreddit keyword and tag name
english_stop_words = list(CountVectorizer(stop_words='english').get_stop_words())
all_stop_words = custom_stop_words + english_stop_words

In [61]:
# Prepare vectorizer
# Four variants: count vs. TF-IDF, each with the built-in English stop words
# or with the extended custom list.
vectorizer_list = dict(
    CVEC_Eng_Stop=CountVectorizer(stop_words='english'),
    TVEC_Eng_Stop=TfidfVectorizer(stop_words='english'),
    CVEC_Custom_Stop=CountVectorizer(stop_words=all_stop_words),
    TVEC_Custom_Stop=TfidfVectorizer(stop_words=all_stop_words),
)

# Vectorized feature matrix per variant, keyed by the variant name
X_vec_list = {}

for name, vectorizer in vectorizer_list.items():
    # Apply the variant to 'selftext' and 'title' separately; any other column is dropped
    column_vectorizer = ColumnTransformer(
        transformers=[
            ('selftext_vec', vectorizer, 'selftext'),
            ('title_vec', vectorizer, 'title'),
        ],
        remainder='drop',
        n_jobs=-1,
    )

    # NOTE(review): fitted on the whole of X, i.e. before any train/test split
    # that is applied to these matrices later.
    X_vec = column_vectorizer.fit_transform(X)
    X_vec_list[name] = X_vec

### Pre-evaluate models for further selection + optimization

In [69]:
# Function for generating scores to pre-evaluate models for further selection + optimization
def pre_evaluate_models(est, f1_average='micro', cv=5, random_state=42):
    """Fit ``est`` on every matrix in ``X_vec_list`` and tabulate its scores.

    For each entry of the module-level ``X_vec_list`` the matrix is split into
    train and test parts (stratified on the module-level ``y``), ``est`` is
    fitted on the training part, and six scores are recorded.

    Parameters
    ----------
    est : estimator
        Object exposing ``fit`` and ``predict``. It is refitted in place for
        every vectorizer variant.
    f1_average : str, default 'micro'
        ``average`` argument passed to ``f1_score``. With single-label
        multiclass targets the micro average is identical to accuracy, so
        pass e.g. ``'macro'`` or ``'weighted'`` for an informative F1.
    cv : int, default 5
        ``cv`` argument passed to ``cross_val_score``.
    random_state : int, default 42
        Seed for ``train_test_split``.

    Returns
    -------
    pandas.DataFrame
        One row per ``"<vectorizer name> <str(est)>"`` with the columns
        ``cv_train``, ``cv_test``, ``accuracy_train``, ``accuracy_test``,
        ``f1_train`` and ``f1_test``.
    """
    row_names = []
    score_rows = []

    for vec_name, X_vec in X_vec_list.items():
        # Train-Test split
        X_train, X_test, y_train, y_test = train_test_split(
            X_vec, y, random_state=random_state, stratify=y
        )

        row_name = vec_name + ' ' + str(est)
        # Train
        print('Training', row_name, '...')
        est.fit(X_train, y_train)

        # Predict once per split and reuse the predictions for every metric
        y_train_pred = est.predict(X_train)
        y_test_pred = est.predict(X_test)

        # Scoring
        print('Scoring', row_name, '...\n')

        row_names.append(row_name)
        score_rows.append({
            'cv_train': cross_val_score(est, X_train, y_train, cv=cv).mean(),
            # NOTE(review): this cross-validates on the held-out split itself
            # rather than scoring the model fitted above -- confirm intent.
            'cv_test': cross_val_score(est, X_test, y_test, cv=cv).mean(),
            'accuracy_train': accuracy_score(y_train, y_train_pred),
            'accuracy_test': accuracy_score(y_test, y_test_pred),
            'f1_train': f1_score(y_train, y_train_pred, average=f1_average),
            'f1_test': f1_score(y_test, y_test_pred, average=f1_average),
        })

    # Build the frame once instead of enlarging it cell by cell
    return pd.DataFrame(score_rows, index=row_names)

In [23]:
# Loop pre-evaluation on various estimators, put results in dataframe
# estimator_list = [
#     LogisticRegression(n_jobs=-1),
#     MultinomialNB(),
#     # DecisionTreeClassifier(),
#     # BaggingClassifier(n_jobs=-1),
#     RandomForestClassifier(min_samples_leaf=5, n_jobs=-1),
#     AdaBoostClassifier()
# ]

# pre_evaluation_result = pd.DataFrame()
# for estimator in estimator_list:
#     pre_evaluation_result = pd.concat([pre_evaluation_result, pre_evaluate_models(estimator)], axis=0)

In [24]:
# pre_evaluation_result

Results of the previous code blocks (execution skipped because of the long runtime):

![Pre Evaluation Result](../image/pre_evaluation_result.png)

In [63]:
# Baseline for model comparison: class proportions of y (largest value = majority-class accuracy)
y.value_counts(normalize=True)

tag
Question           0.173640
Tech Support       0.109653
Picture            0.083128
Video              0.079300
Meme / Shitpost    0.071097
News               0.070823
Guide              0.069182
MEGATHREAD         0.068635
Configuration      0.068089
Meta               0.066448
Feature Request    0.063713
Discussion         0.047853
Hot Wasabi         0.028439
Name: proportion, dtype: float64

## Prepare vectorized data for selected model optimization

In [64]:
# Create custom tokenizer, to do tokenize + lemmatize
import spacy

# spaCy pipeline shared by every tokenize_lemmatize call; loaded on first use
_SPACY_NLP = None


def tokenize_lemmatize(text):
    """Tokenize ``text`` with spaCy and return lower-cased lemmas.

    Punctuation tokens, stop words and tokens that are not purely alphabetic
    are dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased lemma of every kept token, in document order.
    """
    global _SPACY_NLP
    # Loading the model is expensive; do it once instead of once per document
    # (this function is called for every row by the vectorizers).
    if _SPACY_NLP is None:
        _SPACY_NLP = spacy.load("en_core_web_sm")

    doc = _SPACY_NLP(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_stop and token.is_alpha]
    return tokens

### cvec_lemmatized.csv

In [27]:
# Count Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', CountVectorizer(tokenizer=tokenize_lemmatize), 'selftext'),
#                 ('title_vec', CountVectorizer(tokenizer=tokenize_lemmatize), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_cvec = ct_vectorizer.fit_transform(X)

In [28]:
# Save to csv
# cvec_lemmatized_df = pd.concat([pd.DataFrame(X_cvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# cvec_lemmatized_df.to_csv('../data/cvec_lemmatized.csv', index=False)

### tvec_lemmatized.csv

In [29]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [30]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized.csv', index=False)

### tvec_lemmatized_150.csv

In [31]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=150), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=150), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [32]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_150.csv', index=False)

### tvec_lemmatized_250_max_df_0.2.csv

In [33]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=250, max_df=0.2), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=250, max_df=0.2), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [34]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_250_max_df_0.2.csv', index=False)

### tvec_lemmatized_500_max_df_0.2.csv

In [35]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=500, max_df=0.2), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=500, max_df=0.2), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [36]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_500_max_df_0.2.csv', index=False)

### tvec_lemmatized_1000.csv

In [None]:
# TF-IDF Vectorizing + Lemmatize
def lemmatized_tfidf(max_features=1000, **tfidf_kwargs):
    """Return a fresh TfidfVectorizer that tokenizes with ``tokenize_lemmatize``.

    ``token_pattern=None`` is passed because the pattern is not used when a
    custom tokenizer is supplied; leaving the default makes scikit-learn warn.
    Extra keyword arguments (e.g. ``max_df``, ``ngram_range``) are forwarded.

    NOTE(review): TfidfVectorizer lower-cases text before calling the
    tokenizer by default, so spaCy sees lower-cased input -- confirm this is
    intended.
    """
    return TfidfVectorizer(
        tokenizer=tokenize_lemmatize,
        token_pattern=None,
        max_features=max_features,
        **tfidf_kwargs,
    )


# One independent vectorizer per text column; other columns are dropped
ct_vectorizer = ColumnTransformer([
                ('selftext_vec', lemmatized_tfidf(max_features=1000), 'selftext'),
                ('title_vec', lemmatized_tfidf(max_features=1000), 'title')
            ],
            remainder='drop',
            n_jobs=-1
        )

X_tvec = ct_vectorizer.fit_transform(X)

In [None]:
# Save to csv
# pd.concat(axis=1) aligns rows by index label. Give the feature frame the
# index of y so each feature row is paired with its own tag even when y's
# index is not a fresh 0..n-1 range (e.g. after filtering rows without a reset).
tvec_features_df = pd.DataFrame(
    X_tvec.toarray(),
    columns=ct_vectorizer.get_feature_names_out(),
    index=y.index,
)
tvec_lemmatized_df = pd.concat([tvec_features_df, y], axis=1)
tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_1000.csv', index=False)

### tvec_lemmatized_1500.csv

In [None]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=1500), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=1500), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [None]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_1500.csv', index=False)

### tvec_lemmatized_3000.csv

In [None]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=3000), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=3000), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [None]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_3000.csv', index=False)

### tvec_lemmatized_5000.csv

In [None]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=5000), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=5000), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [None]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_5000.csv', index=False)

### tvec_lemmatized_10000.csv

In [None]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=10000), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=10000), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [None]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_10000.csv', index=False)

### tvec_lemmatized_1000_bigram.csv

In [None]:
# TF-IDF Vectorizing + Lemmatize
# ct_vectorizer = ColumnTransformer([
#                 ('selftext_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=1000, ngram_range=(1, 2)), 'selftext'),
#                 ('title_vec', TfidfVectorizer(tokenizer=tokenize_lemmatize, max_features=1000, ngram_range=(1, 2)), 'title')
#             ],
#             remainder='drop',
#             n_jobs=-1
#         )

# X_tvec = ct_vectorizer.fit_transform(X)

In [None]:
# Save to csv
# tvec_lemmatized_df = pd.concat([pd.DataFrame(X_tvec.toarray(), columns=ct_vectorizer.get_feature_names_out()), y], axis=1)
# tvec_lemmatized_df.to_csv('../data/tvec_lemmatized_1000_bigram.csv', index=False)