In [3]:
# import libraries
import requests
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

nba_url = 'https://www.reddit.com/r/nba.json'
lebron_url = 'https://www.reddit.com/r/lebron.json'
micheal_url = 'https://www.reddit.com/r/michaeljordan.json'
kobe_url = "https://www.reddit.com/r/KobeBryant24.json"
header = {'User-agent': 'subreddit get requests'}

In [4]:
# define function to get num pages of posts from a subreddit, start collecting at a defined after
def reddit_scraper(url, num, after = None):
    posts = []
    # loop through the num pages, each subreddit .json returns 25 posts 
    for page in range(num):
        # initiate params modifier for posts if there no defined after
        if after == None:
            params = {}
        # add in after id for each loop following to ensure no duplicate posts
        else:
            params = {'after': after}
        # call our get request for the posts
        res = requests.get(url, params=params, headers=header)
        # check status code, 200 means posts were successfully downloaded
        if res.status_code == 200:
            # convert request to .json
            new_json = res.json()
            # extend list from the 'children' dictionary for each request
            posts.extend(new_json['data']['children'])
            # update after id
            after = new_json['data']['after']
        else:
            # print status code if not 200
            print(res.status_code)
            break
        # wait 1 second
        time.sleep(1)
        
    # create a new dataframe with the 'data' from each post
    new_df = pd.DataFrame([post['data'] for post in posts])
    
    # print final value of after
    print(f'Final value of after parameter: {after}')
    
    # return the dataframe
    return new_df

In [5]:
lebron_df = reddit_scraper(lebron_url, 10)
jordan_df = reddit_scraper(micheal_url, 10)
kobe_df = reddit_scraper(kobe_url, 10)

Final value of after parameter: t3_zz7k1n
Final value of after parameter: t3_4uvx6f
Final value of after parameter: t3_13m4sin


In [6]:
def extract_features(df):
    df = df[['selftext', 'title', 'subreddit']]
    return df

In [7]:
df_list = [extract_features(lebron_df), extract_features(jordan_df), extract_features(kobe_df)]

In [8]:
big_df = pd.concat(df_list, ignore_index=True)

In [9]:
big_df = big_df.dropna(subset=['selftext', 'title'], how='all')

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize two separate vectorizers
tfidf_title = TfidfVectorizer(max_df=0.95, min_df=2, max_features=5000)
tfidf_selftext = TfidfVectorizer(max_df=0.95, min_df=2, max_features=5000)
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, max_features=5000)

# Fit and transform separately
title_vecs = tfidf_title.fit_transform(big_df['title'])
selftext_vecs = tfidf_selftext.fit_transform(big_df['selftext'])

# Combine the vectors
X = hstack([title_vecs, selftext_vecs])
y = pd.get_dummies(big_df["subreddit"]).values

In [14]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(big_df, test_size=0.1)


In [20]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)


In [21]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "bert-base-uncased"
num_labels = len(train_df['subreddit'].unique())

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
print(train_dataset.column_names)

['selftext', 'title', 'subreddit', '__index_level_0__']


In [23]:
def tokenize_function(examples):
    return tokenizer(examples["selftext"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/676 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset
)

trainer.train()


In [None]:
trainer.evaluate()


In [1]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained("bert-base-cased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [4]:
import pandas as pd

In [12]:
df = pd.read_csv("Kobe_Jordan_Lebron.csv")

In [13]:
df

Unnamed: 0,selftext,title,subreddit,combined_text
0,\n,LEBRON ON SACRIFICES YOU HAVE TO MAKE TO BE TH...,lebron,LEBRON ON SACRIFICES YOU HAVE TO MAKE TO BE TH...
1,,LeBron poster I designed! Instagram is @csc.dznüí´,lebron,LeBron poster I designed! Instagram is @csc.dznüí´
2,,Where can I buy this?,lebron,Where can I buy this?
3,"As you all know, king LBJ dominated the game ...",lebron legacy question,lebron,"lebron legacy question As you all know, king ..."
4,,what if lebron went to the bulls?,lebron,what if lebron went to the bulls?
...,...,...,...,...
747,,This is pretty wild!,KobeBryant24,This is pretty wild!
748,So I have a picture of me when I was a year an...,Looking for some help!!,KobeBryant24,Looking for some help!! So I have a picture of...
749,,Kobe‚Äôs 8 Passengers,KobeBryant24,Kobe‚Äôs 8 Passengers
750,&amp;#x200B;\n\nhttps://preview.redd.it/n15rpi...,Made this [ig-@wassuppixel],KobeBryant24,Made this [ig-@wassuppixel] &amp;#x200B;\n\nht...


In [15]:
import pandas as pd
import torch

# Example DataFrame

# Placeholder for the output features
features = []

# Disable gradient calculation for inference
with torch.no_grad():
    for text in df['combined_text']:
        # Tokenize text and convert to tensors
        encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        # Get model output
        output = model(**encoded_input)
        
        # The `last_hidden_state` is the sequence of hidden-states at the output of the last layer
        # You might want to use `output.pooler_output` for getting the [CLS] token representation which is commonly used for classification tasks
        # Here, we're using the mean of the last hidden state as the feature representation
        feature = output.last_hidden_state.mean(dim=1)
        
        # Append the feature representation to our list
        features.append(feature)

# Convert the list of tensors to a single tensor
features_tensor = torch.cat(features, dim=0)

# At this point, `features_tensor` contains the feature representation for each text in the DataFrame


tensor([[-0.0565,  0.0135, -0.0037,  ...,  0.5759,  0.3810,  0.2654],
        [ 0.2761,  0.0664, -0.0598,  ...,  0.1589,  0.3966,  0.2621],
        [ 0.1405,  0.0747,  0.0179,  ...,  0.0734,  0.2343,  0.0433],
        ...,
        [ 0.3011, -0.2029, -0.2811,  ...,  0.1102,  0.1982, -0.3990],
        [ 0.1808, -0.0697,  0.1216,  ...,  0.1223,  0.1646,  0.2742],
        [ 0.2529, -0.2798,  0.1280,  ...,  0.2531,  0.3956,  0.6909]])

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

# Load the dataset (assuming it's already loaded in df)
# df = pd.read_csv("your_dataset.csv")

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and encoding the dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Assuming 'subreddit' is categorical and needs to be converted to integers
label_dict = {label: index for index, label in enumerate(df['subreddit'].unique())}
df['label'] = df['subreddit'].map(label_dict)

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['combined_text'].values, df['label'].values, test_size=0.1)

# Create the dataset
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [18]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict)  # Number of output labels
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers import AdamW
from torch import nn
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch}, Loss: {loss.item()}")

    # Evaluation step could be added here




AttributeError: 'list' object has no attribute 'to'

In [22]:

model.train()
for batch in train_loader:
    # Ensure batch is a dictionary
    if not isinstance(batch, dict):
        raise ValueError("Expected batch to be a dictionary.")

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # Clear any previously calculated gradients
    optimizer.zero_grad()

    # Forward pass
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    # Backward pass and optimize
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    
    print(f"Current Loss: {loss.item()}")


Current Loss: 1.2010806798934937
Current Loss: 1.1881211996078491
Current Loss: 1.2479181289672852
Current Loss: 1.1865103244781494
Current Loss: 1.1670149564743042
Current Loss: 1.0979158878326416
Current Loss: 1.1490139961242676
Current Loss: 1.124993085861206
Current Loss: 1.0458067655563354
Current Loss: 1.129644513130188
Current Loss: 1.0861319303512573
Current Loss: 1.1049641370773315
Current Loss: 1.172973394393921
Current Loss: 1.0778130292892456
Current Loss: 1.085856318473816
Current Loss: 1.112165927886963
Current Loss: 1.085579752922058
Current Loss: 1.1484427452087402
Current Loss: 1.1397459506988525
Current Loss: 1.0893964767456055
Current Loss: 1.087742567062378
Current Loss: 1.109257698059082
Current Loss: 1.0647058486938477
Current Loss: 1.058986783027649
Current Loss: 1.124523401260376
Current Loss: 1.0965850353240967
Current Loss: 1.044784426689148
Current Loss: 1.0558671951293945
Current Loss: 1.0312113761901855
Current Loss: 1.0537627935409546
Current Loss: 1.09706