In [1]:
import praw
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import pandas as pd
import datetime

import secrets

## Set up Reddit API object

In [3]:
SUBREDDITS = "wallstreetbets"

reddit = praw.Reddit(
    client_id=secrets.REDDIT_API_CLIENT_ID,
    client_secret=secrets.REDDIT_API_CLIENT_SECRET,
    user_agent=secrets.REDDIT_API_USER_AGENT
)
subreddit = reddit.subreddit(SUBREDDITS)

In [11]:
subreddit

Subreddit(display_name='wallstreetbets+investing+options+pennystocks+options+stocks')

## Get top 1000 submissions from Wallstreetbets

In [5]:
top_submissions = []
for submission in subreddit.top(limit=1000):
    top_submissions.append(submission)

print(len(top_submissions))

975


## Iterate over submissions to parse and fetch 10000 comments and replies in those threads

In [None]:
from praw.models import MoreComments

NUM_COMMENTS = 100000
comments = 0
conversations = []

for submission in top_submissions:
    for top_level_comment in submission.comments:
        convo = []
        if isinstance(top_level_comment, MoreComments):
            continue
        convo.append(top_level_comment.body)
        for reply in top_level_comment.replies:
            if isinstance(reply, MoreComments):
                continue
            convo.append(reply.body)
        
        conversations.append(convo)
        
        comments = comments + 1
        print(comments)
    if comments > NUM_COMMENTS:
        break

## Preprocess conversations by turning text to lowercase and adding special spacing as per documentation

In [43]:
preprocessed_conversations = []

for convo in conversations:
    the_convo = []
    for comment in convo:
        c = comment.lower().replace(".", " .").replace("?", " ?").replace("!", " !")
        the_convo.append(c)
    preprocessed_conversations.append(the_convo)

## Flatten conversations so that a pool of all comments is available during training

In [44]:
flat_convos = []

for convo in preprocessed_conversations:
    for comment in convo:
        flat_convos.append(comment)

## Time to train!

In [45]:
import random

# Takes predefined number of comments and returns from flat convos so that random comments can be fed to the model during training
def create_random_candidates():
    candidates = []
    for _ in range(3):
        candidates.append(random.choice(flat_convos))
    return candidates

In [47]:
def get_length_of_utterances_by_words(utterances):
    total_words = 0
    for obj in utterances:
        for candidate in obj["candidates"]:
            num_words = len(candidate.split(" "))
            total_words = total_words + num_words
        for history in obj["history"]:
            num_words = len(history.split(" "))
            total_words = total_words + num_words
    
    return total_words

def get_length_of_utterances_by_char(utterances):
    total_words = 0
    for obj in utterances:
        for candidate in obj["candidates"]:
            num_words = len(candidate)
            total_words = total_words + num_words
        for history in obj["history"]:
            num_words = len(history)
            total_words = total_words + num_words
    
    return total_words

def get_length_of_personality_by_words(personality):
    total_words = 0
    for p in personality:
        total_words = total_words + len(p.split(" "))
    
    return total_words

def get_length_of_personality_by_char(personality):
    total_words = 0
    for p in personality:
        total_words = total_words + len(p)
    
    return total_words

In [58]:
import copy

MAX_TENSOR_SIZE = 350

personality = [
    "i like stocks",
    "i am from wallstreetbets",
    "i like reddit",
    "i love holding the stock"
]

# Training data will have a specific format as specified in this article: https://towardsdatascience.com/how-to-train-your-chatbot-with-simple-transformers-da25160859f4
training_data = []

for convo in preprocessed_conversations:
    the_convo = {}
    utterances = []
    convo_len = len(convo)
    history = []
    total_words = 0
    if convo_len > 1:
        for i, comment in enumerate(convo):
            convo_obj = {}
            if i % 2 == 0:
                if convo_len >= i+2:
                    len_words = len(comment) + len(convo[i+1])
                    total_words = total_words + len_words
                    if total_words > MAX_TENSOR_SIZE:
                        continue # Just skip for now
                    else:
                        history.append(comment)
                        convo_obj["candidates"] = create_random_candidates()
                        convo_obj["candidates"].append(convo[i+1])
                        convo_obj["history"] = copy.deepcopy(history)
                        utterances.append(convo_obj)
                        total_words = get_length_of_utterances_by_char(utterances) + get_length_of_personality_by_char(personality)
                        if len(utterances) > 0 and total_words < MAX_TENSOR_SIZE:
                            the_convo["personality"] = personality
                            the_convo["utterances"] = utterances
                            training_data.append(the_convo)

len(training_data)

4688

In [59]:
import json

with open("data/minimal_train.json", "w") as f:
    json.dump(training_data, f)

In [17]:
from simpletransformers.conv_ai import ConvAIModel

train_args = {
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "use_early_stopping": True
}

model = ConvAIModel("gpt", "gpt_personachat_cache", use_cuda=True, args=train_args)

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [None]:
model.train_model("data/minimal_train.json")

In [62]:
model = ConvAIModel("gpt", "outputs")

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [86]:
personality = [
    "i like stocks",
    "i am from wallstreetbets",
    "i like reddit",
    "i love holding the stock"
]

response, history = model.interact_single(message="should I buy meme stocks?", history=[], personality=personality)
print(response)
response, history = model.interact_single(message="are you a millionaire?", history=history, personality=personality)
print(response)
response, history = model.interact_single(message="what stock will go to the moon?", history=history, personality=personality)
print(response)
response, history = model.interact_single(message="should I buy GME or AMC?", history=history, personality=personality)
print(response)

no, we need to be smart enough to know
no, but i can t help it
that s what i want to know
amc.
