In [40]:
# import libraries
import requests
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [16]:
nba_url = 'https://www.reddit.com/r/nba.json'
lebron_url = 'https://www.reddit.com/r/lebron.json'
micheal_url = 'https://www.reddit.com/r/michaeljordan.json'
header = {'User-agent': 'subreddit get requests'}

In [2]:
# define function to get num pages of posts from a subreddit, start collecting at a defined after
def reddit_scraper(url, num, after = None):
    posts = []
    # loop through the num pages, each subreddit .json returns 25 posts 
    for page in range(num):
        # initiate params modifier for posts if there no defined after
        if after == None:
            params = {}
        # add in after id for each loop following to ensure no duplicate posts
        else:
            params = {'after': after}
        # call our get request for the posts
        res = requests.get(url, params=params, headers=header)
        # check status code, 200 means posts were successfully downloaded
        if res.status_code == 200:
            # convert request to .json
            new_json = res.json()
            # extend list from the 'children' dictionary for each request
            posts.extend(new_json['data']['children'])
            # update after id
            after = new_json['data']['after']
        else:
            # print status code if not 200
            print(res.status_code)
            break
        # wait 1 second
        time.sleep(1)
        
    # create a new dataframe with the 'data' from each post
    new_df = pd.DataFrame([post['data'] for post in posts])
    
    # print final value of after
    print(f'Final value of after parameter: {after}')
    
    # return the dataframe
    return new_df

In [8]:
new_nba_df = reddit_scraper(nba_url, 10)


Final value of after parameter: t3_1bhxese


In [9]:
# check shape of scraped dataframe
new_nba_df

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,post_hint,preview,url_overridden_by_dest,author_cakeday
0,,nba,"Here is a place to have in depth, x's and o's,...",t2_6vjwa,False,,0,False,[SERIOUS NEXT DAY THREAD] Post-Game Discussion...,"[{'e': 'text', 't': 'Discussion'}]",...,https://www.reddit.com/r/nba/comments/1bk4utf/...,10114648,1.711022e+09,0,,False,,,,
1,,nba,[https://preview.redd.it/ri8yzud7jcyb1.png?wid...,t2_6vjwa,False,,0,False,Celebrating 15 Years of r/NBA: Community Funds...,"[{'e': 'text', 't': 'Announcement'}]",...,https://www.reddit.com/r/nba/comments/1bjubqt/...,10114648,1.710984e+09,0,,False,self,{'images': [{'source': {'url': 'https://extern...,,
2,,nba,,t2_n846u,False,,0,False,[Highlight] Harden finds an open Kawhi for thr...,"[{'e': 'text', 't': 'Highlight'}]",...,https://streamable.com/87asjf,10114648,1.710993e+09,6,{'oembed': {'provider_url': 'https://streamabl...,False,rich:video,{'images': [{'source': {'url': 'https://extern...,https://streamable.com/87asjf,
3,,nba,,t2_znx17,False,,0,False,Cavs’ coach JB Bickerstaff on sports gambling:...,[],...,https://streamable.com/b32hso,10114648,1.710983e+09,5,{'oembed': {'provider_url': 'https://streamabl...,False,rich:video,{'images': [{'source': {'url': 'https://extern...,https://streamable.com/b32hso,
4,,nba,,t2_8nnxmjvi9,False,,0,False,[Youngmisuk] James Harden said he was trying t...,[],...,https://x.com/NotoriousOHM/status/177067672292...,10114648,1.710998e+09,0,,False,link,{'images': [{'source': {'url': 'https://extern...,https://x.com/NotoriousOHM/status/177067672292...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,,nba,Hi all! Long time lurker here. I don’t know if...,t2_l9eyom59,False,,0,False,I Created Interactive NBA Data Dashboards Goin...,[],...,https://www.reddit.com/r/nba/comments/1bikh2z/...,10114648,1.710855e+09,0,,False,,,,
248,,nba,Having a discussion with my friend and wanted ...,t2_u5gfi,False,,0,False,Which duo is better? Embiid and Maxey or Butle...,[],...,https://www.reddit.com/r/nba/comments/1bjzcsv/...,10114648,1.711000e+09,0,,False,,,,
249,,nba,,t2_dqso0a2u,False,,0,False,"34 yr old DeMar DeRozan, in his 15th season, i...",[],...,https://www.nba.com/stats/players/traditional?...,10114648,1.710821e+09,0,,False,,,https://www.nba.com/stats/players/traditional?...,
250,,nba,||\t\t\n|:-:|\t\t\n|[](/DET) **94 - 119** [](...,t2_m2gp7bs4,False,,0,False,[Post Game Thread] The Boston Celtics (54-14) ...,"[{'e': 'text', 't': 'Post Game Thread'}]",...,https://www.reddit.com/r/nba/comments/1bi8crt/...,10114648,1.710812e+09,0,,False,self,{'images': [{'source': {'url': 'https://extern...,,


In [12]:
lebron_df = reddit_scraper(lebron_url, 10)
jordan_df = reddit_scraper(micheal_url, 10)

Final value of after parameter: t3_zecirs
Final value of after parameter: None


In [17]:
jordan_df = reddit_scraper(micheal_url, 10)
jordan_df

Final value of after parameter: t3_4umvrk


Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,is_video,post_hint,url_overridden_by_dest,preview,link_flair_template_id,is_gallery,media_metadata,gallery_data,crosspost_parent_list,crosspost_parent
0,,michaeljordan,,t2_14018z,False,,0,False,Reminder: No sales/self-promotion/spam allowed...,[],...,False,,,,,,,,,
1,,michaeljordan,Related subreddits to check out:\n\n-\tr/TheBi...,t2_14018z,False,,0,False,Reminder: Though content about former Washingt...,[],...,False,,,,,,,,,
2,,michaeljordan,,t2_okx30dau9,False,,0,False,1996 Bowman's Best PSA GEM MT 10! The Goat! 🐐🏆,[],...,False,image,https://i.redd.it/hw43tvg7b6pc1.jpeg,{'images': [{'source': {'url': 'https://previe...,b5bbcf60-6077-11ee-866d-26a8504243d7,,,,,
3,,michaeljordan,,t2_ivw7orsg,False,,0,False,Michael Jordan • Slow-Motion Slam Dunk From Th...,[],...,False,rich:video,https://youtu.be/b0x9P8XjriY?si=189WEASKfAfo8z42,{'images': [{'source': {'url': 'https://extern...,,,,,,
4,,michaeljordan,Can anyone help with MJ signed basketball auth...,t2_s9emuhv,False,,0,False,Can anyone help with MJ signed basketball auth...,[],...,False,,https://www.reddit.com/gallery/1bh0dsa,,800bd51e-6076-11ee-ad7b-c6e06f3b124d,True,"{'gx9w6znaywoc1': {'status': 'valid', 'e': 'Im...","{'items': [{'media_id': 'joz2twnaywoc1', 'id':...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,,michaeljordan,,t2_11xpa7,False,,0,False,Was Magic Johnson Better Than Michael Jordan?,[],...,False,link,https://policeauctionsblog.wordpress.com/2016/...,{'images': [{'source': {'url': 'https://extern...,,,,,,
248,,michaeljordan,,,False,,0,False,Michael Jordan - First NBA Game Highlights - B...,[],...,False,,https://www.youtube.com/watch?v=lMC7Ai09bm8,,,,,,,
249,,michaeljordan,,t2_10gdfk,False,,0,False,Michael Jordan Wallpaper,[],...,False,,https://www.facebook.com/michaeljordanwallpaper/,,,,,,,
250,,michaeljordan,,t2_vvju5,False,,0,False,Michael Jordan: ‘I Can No Longer Stay Silent’ ...,[],...,False,,https://www.youtube.com/watch?v=59MCKl1NEAY,,,,,,,


In [14]:
lebron_df

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,subreddit_subscribers,created_utc,num_crossposts,media,is_video,crosspost_parent_list,crosspost_parent,media_metadata,is_gallery,gallery_data
0,,lebron,,t2_nf1e2d44s,False,,0,False,LeBron poster I designed! Instagram is @csc.dzn💫,[],...,2369,1.710869e+09,0,,False,,,,,
1,,lebron,,t2_73irj95n,False,,0,False,Where can I buy this?,[],...,2369,1.710688e+09,0,,False,,,,,
2,,lebron,"As you all know, king LBJ dominated the game ...",t2_423ypd4d,False,,0,False,lebron legacy question,[],...,2369,1.710339e+09,0,,False,,,,,
3,,lebron,,t2_tn5msefyp,False,,0,False,what if lebron went to the bulls?,[],...,2369,1.710297e+09,0,,False,,,,,
4,,lebron,,t2_8674ljvh,False,,0,False,LEBRON!,[],...,2369,1.709959e+09,0,,False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,,lebron,,t2_2eezrcdz,False,,0,False,Was Lebrons first movie even good?,[],...,2369,1.672425e+09,0,"{'type': 'youtube.com', 'oembed': {'provider_u...",False,,,,,
246,,lebron,,t2_u3foy2dv,False,,0,False,LeBron can fly,[],...,2369,1.671803e+09,0,"{'type': 'youtube.com', 'oembed': {'provider_u...",False,,,,,
247,,lebron,,t2_16oy4u,False,,0,False,King Lebron Epic Dunk,[],...,2369,1.671451e+09,0,,False,,,,,
248,,lebron,,t2_o3aew,False,,0,False,LeBron reflects on Grant Wahl following his de...,[],...,2369,1.670653e+09,0,,False,,,,,


In [20]:
def extract_features(df):
    df = df[['selftext', 'title', 'subreddit']]
    return df



In [23]:
df_list = [extract_features(lebron_df), extract_features(jordan_df)]


In [24]:
big_df = pd.concat(df_list, ignore_index=True)

In [25]:
big_df

Unnamed: 0,selftext,title,subreddit
0,,LeBron poster I designed! Instagram is @csc.dzn💫,lebron
1,,Where can I buy this?,lebron
2,"As you all know, king LBJ dominated the game ...",lebron legacy question,lebron
3,,what if lebron went to the bulls?,lebron
4,,LEBRON!,lebron
...,...,...,...
497,,Was Magic Johnson Better Than Michael Jordan?,michaeljordan
498,,Michael Jordan - First NBA Game Highlights - B...,michaeljordan
499,,Michael Jordan Wallpaper,michaeljordan
500,,Michael Jordan: ‘I Can No Longer Stay Silent’ ...,michaeljordan


Featurizing Columns

In [28]:


# 2. Apply TF-IDF vectorization to 'title' and 'selftext'
tfidf = TfidfVectorizer()

# Since you want to append them as separate columns, you will vectorize them separately
title_vecs = tfidf.fit_transform(big_df['title'])
selftext_vecs = tfidf.fit_transform(big_df['selftext'])

# Converting sparse matrix to DataFrame (optional step depending on further usage)
# Here we're creating a placeholder DataFrame to illustrate handling of TF-IDF vectors
# The columns names would be numeric and generated based on the TF-IDF feature extraction
title_vecs_df = pd.DataFrame(title_vecs.toarray())
selftext_vecs_df = pd.DataFrame(selftext_vecs.toarray())

Train Test Split

In [39]:
X = hstack([title_vecs, selftext_vecs])
y = pd.get_dummies(big_df["subreddit"]).values
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
import numpy as np

y
y_train_1d = np.argmax(y_train, axis=1)
y_test_1d = np.argmax(y_test, axis=1)

Logistic Model

In [50]:
# Initialize the Logistic Regression model
# Solver 'lbfgs' works well for small datasets, but you might choose another based on your dataset size and characteristics
# max_iter may need to be increased if the model fails to converge
logistic_model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Fit the model to your data
logistic_model.fit(X_train, y_train_1d)
# Use the trained model to make predictions on the test set
y_pred = logistic_model.predict(X_test)
# Print the classification report
print(classification_report(y_test_1d, y_pred))

# Calculate and print the accuracy
accuracy = accuracy_score(y_test_1d, y_pred)
print(f"Accuracy: {accuracy}")


              precision    recall  f1-score   support

           0       0.86      0.77      0.81        48
           1       0.81      0.89      0.85        53

    accuracy                           0.83       101
   macro avg       0.84      0.83      0.83       101
weighted avg       0.83      0.83      0.83       101

Accuracy: 0.8316831683168316


In [53]:
subreddit_names = pd.get_dummies(big_df["subreddit"]).columns
y_test_categories = np.argmax(y_test, axis=1)
# y_pred_categories is already in the correct format if LogisticRegression.predict() was used
y_pred_categories = y_pred  # Assuming y_pred is the output from logistic_model.predict(X_test)

# Continue with the Accuracy and Classification Report
print("Classification Report:\n", classification_report(y_test_categories, y_pred_categories, target_names=subreddit_names))
print("Accuracy:", accuracy_score(y_test_categories, y_pred_categories))

# For displaying samples, ensure you adjust according to your DataFrame's indexing if needed


Classification Report:
                precision    recall  f1-score   support

       lebron       0.86      0.77      0.81        48
michaeljordan       0.81      0.89      0.85        53

     accuracy                           0.83       101
    macro avg       0.84      0.83      0.83       101
 weighted avg       0.83      0.83      0.83       101

Accuracy: 0.8316831683168316


In [57]:
import random

# Continuing from the previous setup, where test_indices is assumed to hold indices of X_test in big_df
# Here we'll select a few specific indices for demonstration. In practice, you'd use actual indices from X_test.

# Sample a few indices for display, ensure you have a method to relate X_test back to big_df
sample_indices = random.sample(test_indices, 5)

for i, idx in enumerate(sample_indices):
    # Extracting both 'title' and 'selftext' from the original DataFrame
    original_title = big_df.iloc[idx]['title']  # Title text
    original_text = big_df.iloc[idx]['selftext']  # Selftext
    predicted_label = subreddit_names[y_pred_categories[i]]  # Predicted subreddit
    actual_label = subreddit_names[y_test_categories[i]]  # Actual subreddit
    
    print(f"Instance {i+1}:")
    print(f"Title: {original_title}")
    print(f"Selftext: {original_text}")
    print(f"Predicted Subreddit: {predicted_label}")
    print(f"Actual Subreddit: {actual_label}")
    print("-" * 60)


Instance 1:
Title: Is Lebron now the Goat?
Selftext: Peronsally for me he has now surpassed Jordan as the GOAT!
Predicted Subreddit: michaeljordan
Actual Subreddit: michaeljordan
------------------------------------------------------------
Instance 2:
Title: here’s a photo of me slam dunking. not really jordan but his signature move tho. (i’m 13 yo)
Selftext: 
Predicted Subreddit: lebron
Actual Subreddit: lebron
------------------------------------------------------------
Instance 3:
Title: "Man if y'all don't come and get y'alls goat"
Selftext: 
Predicted Subreddit: lebron
Actual Subreddit: michaeljordan
------------------------------------------------------------
Instance 4:
Title: Interesting video 🤔
Selftext: 
Predicted Subreddit: michaeljordan
Actual Subreddit: lebron
------------------------------------------------------------
Instance 5:
Title: LeBron's Path to #1 on the All-Time scoring list ft. Everyone I could think of (I hope you like Gladys Knight)
Selftext: 
Predicted Subr

In [62]:
# Ensure you're using the correct vectorizer instance that was fit on the training data
feature_names = tfidf.get_feature_names_out()

# Assuming logistic_model is your trained model
num_classes = logistic_model.coef_.shape[0]
top_features = 10

for class_index in range(num_classes):
    coefficients = logistic_model.coef_[class_index]
    # Ensure we only consider as many features as we have names for
    top_indices = np.argsort(np.abs(coefficients))[-top_features:]
    
    print(f"Class {class_index}: Most influential words")
    for index in top_indices:
        # Safeguard against out-of-bounds access
        if index < len(feature_names):
            print(f" {feature_names[index]} (Coefficient: {coefficients[index]:.4f})")
        else:
            print(" Index out of bounds, skipped.")
    print("-" * 40)


Class 0: Most influential words
 cruz (Coefficient: -0.6776)
 eastern (Coefficient: 0.7688)
 Index out of bounds, skipped.
 idea (Coefficient: -1.0269)
 cracking (Coefficient: -1.3324)
 conte (Coefficient: -2.3782)
 doesn (Coefficient: 2.4015)
 double (Coefficient: 2.4641)
 core (Coefficient: 3.2165)
 damming (Coefficient: -3.7906)
----------------------------------------


Now Trying With Neural Net

In [63]:
import torch
from scipy.sparse import csr_matrix
from torch.utils.data import TensorDataset, DataLoader

# Function to convert sparse matrix to tensor
def sparse_to_tensor(sparse_matrix):
    sparse_matrix = sparse_matrix.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_matrix.row, sparse_matrix.col)).astype(np.int64)
    )
    values = torch.from_numpy(sparse_matrix.data)
    shape = torch.Size(sparse_matrix.shape)
    return torch.sparse_coo_tensor(indices, values, shape)

# Convert X and y to PyTorch tensors
X_train_tensor = sparse_to_tensor(X_train)
X_test_tensor = sparse_to_tensor(X_test)
y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor.to_dense(), y_train_tensor)
test_dataset = TensorDataset(X_test_tensor.to_dense(), y_test_tensor)

batch_size = 64  # Adjust based on your computational resources
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [64]:
import torch.nn as nn
import torch.nn.functional as F

class TextClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, output_dim)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)  # Softmax applied at the output layer

# Determine input and output dimensions
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]  # Number of classes

# Initialize the model
model = TextClassifier(input_dim, output_dim)


In [65]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
epochs = 10  # Number of epochs to train for
for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        loss = criterion(predictions, torch.max(y_batch, 1)[1])  # Assuming y_batch is one-hot encoded
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 0.6898412108421326
Epoch 2, Loss: 0.6662548780441284
Epoch 3, Loss: 0.6242161393165588
Epoch 4, Loss: 0.4987649917602539
Epoch 5, Loss: 0.3927186131477356
Epoch 6, Loss: 0.33178654313087463
Epoch 7, Loss: 0.3353603780269623
Epoch 8, Loss: 0.32081499695777893
Epoch 9, Loss: 0.31420770287513733
Epoch 10, Loss: 0.31545817852020264


In [66]:
# Evaluate the model
model.eval()  # Set the model to evaluation mode

# Tracking variables
correct_predictions = 0
total_predictions = 0

# No gradient updates needed for evaluation
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        predictions = model(X_batch)
        _, predicted_labels = torch.max(predictions, 1)
        _, actual_labels = torch.max(y_batch, 1)
        
        total_predictions += y_batch.size(0)
        correct_predictions += (predicted_labels == actual_labels).sum().item()

accuracy = correct_predictions / total_predictions
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 80.20%


In [67]:
import numpy as np

# Assuming y_train and y_test are one-hot encoded, convert them back to label-encoded format
y_train_labels = np.argmax(y_train, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors

# Fit the model to the training data
# Note: For high-dimensional sparse data, converting to dense might be memory-intensive
# Consider using TruncatedSVD or similar techniques to reduce dimensionality if necessary
knn.fit(X_train, y_train_labels)
from sklearn.metrics import accuracy_score

# Predict the labels for the test set
y_pred = knn.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test_labels, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 79.21%


Naive Bayes

In [69]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Initialize the Multinomial Naive Bayes classifier
mnb = MultinomialNB()

# Fit the classifier to the training data
# Note: MultinomialNB works with sparse matrices, so there's no need to convert them to dense
mnb.fit(X_train, y_train_labels)
# Predict the labels for the test set
y_pred = mnb.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test_labels, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 82.18%


KNN, Logistic Regression, and Neural Net All performed similarily. Naive Bayes performed the best