In [40]:
# import libraries
import requests
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [95]:
nba_url = 'https://www.reddit.com/r/nba.json'
lebron_url = 'https://www.reddit.com/r/lebron.json'
micheal_url = 'https://www.reddit.com/r/michaeljordan.json'
kobe_url = "https://www.reddit.com/r/KobeBryant24.json"
header = {'User-agent': 'subreddit get requests'}

In [2]:
# define function to get num pages of posts from a subreddit, start collecting at a defined after
def reddit_scraper(url, num, after = None):
    posts = []
    # loop through the num pages, each subreddit .json returns 25 posts 
    for page in range(num):
        # initiate params modifier for posts if there no defined after
        if after == None:
            params = {}
        # add in after id for each loop following to ensure no duplicate posts
        else:
            params = {'after': after}
        # call our get request for the posts
        res = requests.get(url, params=params, headers=header)
        # check status code, 200 means posts were successfully downloaded
        if res.status_code == 200:
            # convert request to .json
            new_json = res.json()
            # extend list from the 'children' dictionary for each request
            posts.extend(new_json['data']['children'])
            # update after id
            after = new_json['data']['after']
        else:
            # print status code if not 200
            print(res.status_code)
            break
        # wait 1 second
        time.sleep(1)
        
    # create a new dataframe with the 'data' from each post
    new_df = pd.DataFrame([post['data'] for post in posts])
    
    # print final value of after
    print(f'Final value of after parameter: {after}')
    
    # return the dataframe
    return new_df

Make a function here that you can input a player name, and it will return a subreddit url for further df creation. Need some kind of regular expression usage to match player names with closest available subreddit

In [102]:
def find_url(player_name):
    pass

Scraping subreddits for players. In the future, add a function here to type in a players name, and automatically return a dataframe associated with the subreddit of that players name.

In [96]:
lebron_df = reddit_scraper(lebron_url, 10)
jordan_df = reddit_scraper(micheal_url, 10)
kobe_df = reddit_scraper(kobe_url, 10)

Final value of after parameter: t3_zhky9j
Final value of after parameter: t3_4uvx6f
Final value of after parameter: t3_13m4sin


In [103]:
def extract_features(df):
    df = df[['selftext', 'title', 'subreddit']]
    return df



In [104]:
df_list = [extract_features(lebron_df), extract_features(jordan_df), extract_features(kobe_df)]


In [226]:

def make_big_df_from_urls(player_url_list):
    df_list = []
    for url in player_url_list:
        new_df = reddit_scraper(url)
        df_list.append(new_df)

    big_df = pd.concat(df_list, ignore_index=True)
    big_df = extract_features(big_df)
    return big_df
    


In [105]:
big_df = pd.concat(df_list, ignore_index=True)

Cleaning Before We Featurize

In [168]:
big_df = big_df.dropna(subset=['selftext', 'title'], how='all')

Featurizing Columns

In [275]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize two separate vectorizers
tfidf_title = TfidfVectorizer(max_df=0.95, min_df=2, max_features=5000)
tfidf_selftext = TfidfVectorizer(max_df=0.95, min_df=2, max_features=5000)
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, max_features=5000)

# Fit and transform separately
title_vecs = tfidf_title.fit_transform(big_df['title'])
selftext_vecs = tfidf_selftext.fit_transform(big_df['selftext'])

# Combine the vectors
X = hstack([title_vecs, selftext_vecs])



Experiment: Trying combining both text column into one and then vectorizing

Conclusion: When using the combined column, most probable words for associated columns are more what you would expect. ie kobe for kobe. However, overall accuracy does go down

In [276]:
big_df['selftext'] = big_df['selftext'].fillna('')
big_df['title'] = big_df['title'].fillna('')

# Concatenate 'selftext' and 'title' into a new column 'combined_text'
big_df['combined_text'] = big_df['title'] + " " + big_df['selftext']


In [289]:
big_df.to_csv('Kobe_Jordan_Lebron.csv', index=False)

In [277]:
combined_vecs = tfidf.fit_transform(big_df['combined_text'])

Train Test Split

In [278]:

X = hstack([combined_vecs])
y = pd.get_dummies(big_df["subreddit"]).values
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train


In [279]:
import numpy as np

y
y_train_1d = np.argmax(y_train, axis=1)
y_test_1d = np.argmax(y_test, axis=1)

Logistic Model

In [280]:
# Initialize the Logistic Regression model
# Solver 'lbfgs' works well for small datasets, but you might choose another based on your dataset size and characteristics
# max_iter may need to be increased if the model fails to converge
logistic_model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Fit the model to your data
logistic_model.fit(X_train, y_train_1d)
# Use the trained model to make predictions on the test set
y_pred = logistic_model.predict(X_test)
# Print the classification report
print(classification_report(y_test_1d, y_pred))

# Calculate and print the accuracy
accuracy = accuracy_score(y_test_1d, y_pred)
print(f"Accuracy: {accuracy}")


              precision    recall  f1-score   support

           0       0.84      0.84      0.84        43
           1       0.78      0.90      0.83        50
           2       0.88      0.76      0.81        58

    accuracy                           0.83       151
   macro avg       0.83      0.83      0.83       151
weighted avg       0.83      0.83      0.83       151

Accuracy: 0.8278145695364238


In [248]:
subreddit_names = pd.get_dummies(big_df["subreddit"]).columns
y_test_categories = np.argmax(y_test, axis=1)
# y_pred_categories is already in the correct format if LogisticRegression.predict() was used
y_pred_categories = y_pred  # Assuming y_pred is the output from logistic_model.predict(X_test)

# Continue with the Accuracy and Classification Report
print("Classification Report:\n", classification_report(y_test_categories, y_pred_categories, target_names=subreddit_names))
print("Accuracy:", accuracy_score(y_test_categories, y_pred_categories))

# For displaying samples, ensure you adjust according to your DataFrame's indexing if needed


Classification Report:
                precision    recall  f1-score   support

 KobeBryant24       0.88      0.81      0.84        43
       lebron       0.75      0.92      0.83        50
michaeljordan       0.90      0.78      0.83        58

     accuracy                           0.83       151
    macro avg       0.84      0.84      0.84       151
 weighted avg       0.84      0.83      0.83       151

Accuracy: 0.8344370860927153


In [249]:
import random

# Continuing from the previous setup, where test_indices is assumed to hold indices of X_test in big_df
# Here we'll select a few specific indices for demonstration. In practice, you'd use actual indices from X_test.

# Sample a few indices for display, ensure you have a method to relate X_test back to big_df
sample_indices = random.sample(test_indices, 5)

for i, idx in enumerate(sample_indices):
    # Extracting both 'title' and 'selftext' from the original DataFrame
    original_title = big_df.iloc[idx]['title']  # Title text
    original_text = big_df.iloc[idx]['selftext']  # Selftext
    predicted_label = subreddit_names[y_pred_categories[i]]  # Predicted subreddit
    actual_label = subreddit_names[y_test_categories[i]]  # Actual subreddit
    
    print(f"Instance {i+1}:")
    print(f"Title: {original_title}")
    print(f"Selftext: {original_text}")
    print(f"Predicted Subreddit: {predicted_label}")
    print(f"Actual Subreddit: {actual_label}")
    print("-" * 60)


Instance 1:
Title: Who Embarrassed Michael Jordan The Worst Shaq or Allen Iverson
Selftext: 
Predicted Subreddit: KobeBryant24
Actual Subreddit: KobeBryant24
------------------------------------------------------------
Instance 2:
Title: The coin flip that changed the fate of Magic Johnson and Michael Jordan - Epicbuzzer
Selftext: 
Predicted Subreddit: KobeBryant24
Actual Subreddit: KobeBryant24
------------------------------------------------------------
Instance 3:
Title: During timeout, LeBron James working a bit on his footwork in the post. LeBron also instructing Max Christie on various spin moves
Selftext: 
Predicted Subreddit: michaeljordan
Actual Subreddit: michaeljordan
------------------------------------------------------------
Instance 4:
Title: Eigener edit
Selftext: 
Predicted Subreddit: lebron
Actual Subreddit: lebron
------------------------------------------------------------
Instance 5:
Title: Im Michael Now
Selftext: 
Predicted Subreddit: lebron
Actual Subreddit: leb

In [250]:
# Ensure you're using the correct vectorizer instance that was fit on the training data
feature_names = tfidf.get_feature_names_out()

# Assuming logistic_model is your trained model
num_classes = logistic_model.coef_.shape[0]
top_features = 10

for class_index in range(num_classes):
    coefficients = logistic_model.coef_[class_index]
    # Ensure we only consider as many features as we have names for
    top_indices = np.argsort(np.abs(coefficients))[-top_features:]
    
    print(f"Class {class_index}: Most influential words")
    for index in top_indices:
        # Safeguard against out-of-bounds access
        if index < len(feature_names):
            print(f" {feature_names[index]} (Coefficient: {coefficients[index]:.4f})")
        else:
            print(" Index out of bounds, skipped.")
    print("-" * 40)


Class 0: Most influential words
 slowly (Coefficient: 0.9162)
 braves (Coefficient: -1.0524)
 clear (Coefficient: -1.1637)
 chng (Coefficient: -1.3002)
 championships (Coefficient: 1.4877)
 broke (Coefficient: -1.5943)
 scoring (Coefficient: 1.6822)
 86 (Coefficient: 1.6947)
 came (Coefficient: -2.0552)
 business (Coefficient: 4.3713)
----------------------------------------
Class 1: Most influential words
 fails (Coefficient: 0.8564)
 send (Coefficient: 0.9144)
 86 (Coefficient: -1.0151)
 chng (Coefficient: -1.0944)
 bucket (Coefficient: 1.2493)
 clear (Coefficient: -1.4564)
 broke (Coefficient: -1.8793)
 braves (Coefficient: 2.1718)
 business (Coefficient: -2.4146)
 came (Coefficient: 3.5274)
----------------------------------------
Class 2: Most influential words
 bag (Coefficient: 0.7625)
 dm (Coefficient: 0.7654)
 fails (Coefficient: -0.8226)
 scoring (Coefficient: -0.8343)
 braves (Coefficient: -1.1194)
 came (Coefficient: -1.4721)
 business (Coefficient: -1.9568)
 chng (Coeffici

Now Trying With Neural Net

In [251]:
import torch
from scipy.sparse import csr_matrix
from torch.utils.data import TensorDataset, DataLoader

# Function to convert sparse matrix to tensor
def sparse_to_tensor(sparse_matrix):
    sparse_matrix = sparse_matrix.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_matrix.row, sparse_matrix.col)).astype(np.int64)
    )
    values = torch.from_numpy(sparse_matrix.data)
    shape = torch.Size(sparse_matrix.shape)
    return torch.sparse_coo_tensor(indices, values, shape)

# Convert X and y to PyTorch tensors
X_train_tensor = sparse_to_tensor(X_train)
X_test_tensor = sparse_to_tensor(X_test)
y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor.to_dense(), y_train_tensor)
test_dataset = TensorDataset(X_test_tensor.to_dense(), y_test_tensor)

batch_size = 64  # Adjust based on your computational resources
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [252]:
import torch.nn as nn
import torch.nn.functional as F

class TextClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, output_dim)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)  # Softmax applied at the output layer

# Determine input and output dimensions
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]  # Number of classes

# Initialize the model
model = TextClassifier(input_dim, output_dim)


In [253]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
epochs = 10  # Number of epochs to train for
for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        loss = criterion(predictions, torch.max(y_batch, 1)[1])  # Assuming y_batch is one-hot encoded
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 1.092287540435791
Epoch 2, Loss: 1.0435715913772583
Epoch 3, Loss: 0.9451636672019958
Epoch 4, Loss: 0.7866892218589783
Epoch 5, Loss: 0.6858813762664795
Epoch 6, Loss: 0.5712226629257202
Epoch 7, Loss: 0.649848997592926
Epoch 8, Loss: 0.5653098821640015
Epoch 9, Loss: 0.618776798248291
Epoch 10, Loss: 0.5668134689331055


In [254]:
# Evaluate the model
model.eval()  # Set the model to evaluation mode

# Tracking variables
correct_predictions = 0
total_predictions = 0

# No gradient updates needed for evaluation
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        predictions = model(X_batch)
        _, predicted_labels = torch.max(predictions, 1)
        _, actual_labels = torch.max(y_batch, 1)
        
        total_predictions += y_batch.size(0)
        correct_predictions += (predicted_labels == actual_labels).sum().item()

accuracy = correct_predictions / total_predictions
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 74.83%


KNN classifier, performed the worst at the moment 

In [255]:
import numpy as np

# Assuming y_train and y_test are one-hot encoded, convert them back to label-encoded format
y_train_labels = np.argmax(y_train, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors

# Fit the model to the training data
# Note: For high-dimensional sparse data, converting to dense might be memory-intensive
# Consider using TruncatedSVD or similar techniques to reduce dimensionality if necessary
knn.fit(X_train, y_train_labels)
from sklearn.metrics import accuracy_score

# Predict the labels for the test set
y_pred = knn.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test_labels, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 43.71%


Naive Bayes

In [256]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Initialize the Multinomial Naive Bayes classifier
mnb = MultinomialNB(alpha=4)

# Fit the classifier to the training data
# Note: MultinomialNB works with sparse matrices, so there's no need to convert them to dense
mnb.fit(X_train, y_train_labels)
# Predict the labels for the test set
y_pred = mnb.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test_labels, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 80.79%


KNN, Logistic Regression, and Neural Net All performed similarily. Naive Bayes performed the best.

In [257]:
feature_names = tfidf.get_feature_names_out()
log_probabilities = mnb.feature_log_prob_


In [258]:
def print_top_words_per_class(classifier, vectorizer, class_labels, num_top_words=10):
    """
    Prints the top words that are most indicative of each class according to the classifier,
    along with the actual class names.

    Parameters:
    - classifier: A trained Multinomial Naive Bayes classifier.
    - vectorizer: The vectorizer used to transform the text data.
    - class_labels: A list of class names corresponding to the classifier's classes.
    - num_top_words: The number of top words to display for each class.
    """
    feature_names = vectorizer.get_feature_names_out()
    log_probabilities = classifier.feature_log_prob_
    
    for i, class_log_prob in enumerate(log_probabilities):
        class_name = class_labels[i]  # Use the actual class name
        print(f"{class_name}: Most influential words")
        top_indices = class_log_prob.argsort()[-num_top_words:]
        
        for index in reversed(top_indices):  # Print from highest to lowest
            if index < len(feature_names):  # Check index to ensure within bounds
                print(f"  {feature_names[index]}: {class_log_prob[index]:.4f}")
            else:
                print(" Index out of bounds, skipped.")
        print("-" * 40)

# To get the class labels (subreddit names) in the correct order
class_labels = list(pd.get_dummies(big_df["subreddit"]).columns)

# Example usage
print_top_words_per_class(mnb, tfidf, class_labels, num_top_words=1)


KobeBryant24: Most influential words
  business: -5.3440
----------------------------------------
lebron: Most influential words
  came: -5.4222
----------------------------------------
michaeljordan: Most influential words
  broke: -5.6316
----------------------------------------


In [262]:
def input_sentence_to_prediction(text_sentence, model):
    pass

In [266]:
def input_sentence_to_prediction(model, tfidf_title, tfidf_selftext, text_title, text_sentence):
    """
    Converts text input into a prediction using the specified model and vectorizers.

    Parameters:
    - model: The trained classification model.
    - tfidf_title: The TF-IDF vectorizer used during training for titles.
    - tfidf_selftext: The TF-IDF vectorizer used during training for selftext.
    - text_title: The input text title to predict.
    - text_sentence: The input text sentence (selftext) to predict.
    
    Returns:
    - The predicted class for the input text.
    """
    # Vectorize the input text title and text sentence using the corresponding TF-IDF vectorizers
    title_vec = tfidf_title.transform([text_title])
    sentence_vec = tfidf_selftext.transform([text_sentence])
    
    # Combine the vectors to match the training data structure
    combined_input_vec = hstack([title_vec, sentence_vec])
    
    # Predict the class for the combined input vector
    prediction = model.predict(combined_input_vec)
    
    return prediction


In [297]:
model = mnb  # Your trained MultinomialNB model

text_title = "Things "  # Example text for the title
text_sentence = "Lebron Lebron kobe kobe lebron kobe" 
prediction = input_sentence_to_prediction(model, tfidf_title, tfidf_selftext, text_title, text_sentence)

# Assuming your model outputs numerical class labels, you might want to map these back to class names.
# If `class_labels` is a list of class names in the order they were encoded:
predicted_class = class_labels[prediction[0]]  # prediction[0] because model.predict returns an array of predictions
print("Predicted class:", predicted_class)

Predicted class: KobeBryant24
