### Imports

In [36]:
import psycopg2
import pandas as pd 
import numpy as np 
from datetime import datetime, timedelta
import asyncio
from collections import Counter

import sys
sys.path.append("/Users/shriramsunder/Projects/WSB/")

from wsb import Gather
from prefect_utils import extract_submissions
from prefect_utils import get_submission_search_urls

from airflow_utils import update_submissions_praw
from airflow_utils import update_comments_praw 

from utils import add_time_series_slider

from tqdm.auto import tqdm
import plotly.graph_objects as go

import matplotlib.pyplot as plt 
%matplotlib inline

import nest_asyncio
nest_asyncio.apply()

import logging
import praw
from psaw import PushshiftAPI
tqdm.pandas()

import swifter

# Only surface errors from the `psaw` logger on a stream handler.
logger = logging.getLogger('psaw')
logger.setLevel(logging.ERROR)

# `logging.getLogger` returns the same logger object on every call, so
# re-running this cell used to attach one more StreamHandler each time and
# every record was then emitted once per attached handler. Reuse a plain
# StreamHandler when one is already attached.
handler = next(
    (h for h in logger.handlers if type(h) is logging.StreamHandler),
    None,
)
if handler is None:
    handler = logging.StreamHandler()
    logger.addHandler(handler)

### Plots

In [2]:
# Shared `Gather` instance (imported from the project's `wsb` module). Later
# cells call its `get_submission_status_mat_view()` and
# `get_sqlalchemy_engine()` methods.
gather_wsb = Gather()

In [3]:
# Fetch the submission-status view, parse its `date` column into datetimes,
# then index the frame by that column in ascending order.
submissions_status_df = (
    gather_wsb.get_submission_status_mat_view()
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
    .sort_index()
)

In [20]:
# Plot `submissions_count` against the date index, pass the figure through the
# project's `add_time_series_slider` helper, and render it.
fig = add_time_series_slider(
    fig=go.Figure(
        data=[
            go.Scatter(
                x=submissions_status_df.index,
                y=submissions_status_df["submissions_count"],
            )
        ]
    )
)
fig.show()

### Submissions analysis

In [37]:
# NOTE(review): machine-specific absolute path — this cell only works where
# that directory exists. Presumably it is where `polygonio_api_calls` lives;
# confirm, and consider making the location configurable.
sys.path.append("/Users/shriramsunder/Projects/Waves - All TimeSeries Related/")

# NOTE(review): this import sits outside the imports cell at the top of the
# notebook; of the three names, only `get_all_tickers` is used in the cells shown.
from polygonio_api_calls import get_all_tickers, generate_urls, download_all

In [32]:
# Read the selected submission columns through the engine returned by
# `gather_wsb.get_sqlalchemy_engine(local=True)`, convert `created_utc`
# (interpreted as epoch seconds) into a `date` column, and index the rows by it
# in ascending order.
query = "SELECT created_utc, author, id, title, selftext FROM submissions;"
submissions_df = (
    pd.read_sql_query(sql=query, con=gather_wsb.get_sqlalchemy_engine(local=True))
    .assign(date=lambda frame: pd.to_datetime(frame["created_utc"], unit="s"))
    .drop(columns=["created_utc"])
    .set_index("date")
    .sort_index(ascending=True)
)

In [67]:
### Who submits the most per day/month/week/year, besides the auto bots?
### How many submissions have their author deleted per day? And in total?
### How many submissions have their selftext deleted per day? And in total?
### How many of these submissions contain tickers? Which tickers?

In [122]:
# Fetch the ticker symbols through the project's `get_all_tickers` helper.
tickers = get_all_tickers(active=True, limit=1000, only_tickers=True)
# Symbols containing "X:" or "C:" are set aside as currencies; everything else
# is kept, de-duplicated and sorted, as stocks.
currencies = [symbol for symbol in tickers if ("X:" in symbol) or ("C:" in symbol)]
stocks = sorted(set(tickers) - set(currencies))

# Symbols to ignore: they coincide with everyday words, single letters or
# month abbreviations (the duplicate "SAVE" entry has been removed).
exceptions = ["A", "BUT", "CAN", "CANT", "SO", "MAKE",
              "THEN", "WHY", "WHO", "E", "I", "O", "U",
              "YOU", "THEY", "PUT", "TO", "AN", "BOT",
              "FOR", "JAN", "FEB", "MAR", "APR", "MAY",
              "JUN", "JUL", "AUG", "SEP", "OCT", "NOV",
              "DEC", "BY", "UP", "IS", "IT", "AUTO",
              "KIDS", "GET", "JOBS", "BIG", "SAVE",
              "JUST", "PAY", "WANT", "OUT", "LIFE",
              "NEW", "LOAN", "BE", "FUN", "HE", "NOW", "COOL", "ON",
              "ARE", "STAR", "FREE", "OR", "GOOD", "GO", "WE", "FORM", "FROM",
              "WELL", "TEAM", "LOVE", "TECH", "PLAY", "USER", "ALL", "ONE", "MOVE",
             ]

# A set lookup keeps the filter linear in the number of tickers; the result has
# the same members, in the same order, as scanning the list for every ticker.
excluded = frozenset(exceptions)
proper_tickers = [ele for ele in tickers if ele not in excluded]

INFO    2021-12-06 21:01:01,801 calls:get_all_tickers:39 : -- Fetching all tickers ...


INFO:WavesLogger:-- Fetching all tickers ...


In [107]:
def count_tickers(proper_tickers: list, title: str) -> "dict[str, int] | None":
    """Count how often each known ticker appears as a word in a title.

    The title is upper-cased and split on whitespace. A word counts as a
    mention only when it equals a ticker exactly, so "$GME" or "GME," is not
    matched, and tickers are expected to be upper-case.

    Args:
        proper_tickers: Ticker symbols to look for. A list, set or frozenset
            is accepted; a set/frozenset is used as-is, which avoids
            rebuilding the lookup on every call.
        title: Submission title. Anything that is not a string (for example
            None or NaN) is treated as containing no tickers.

    Returns:
        A dict mapping each ticker found to the number of times it occurs in
        the title, or None when no ticker is found.
    """
    # No `del count_tickers` guard is needed before this definition: `def`
    # simply rebinds the name when the cell is re-run.
    if not isinstance(title, str):
        return None

    if isinstance(proper_tickers, (set, frozenset)):
        known = proper_tickers
    else:
        known = set(proper_tickers)

    # Count over the words of the title rather than over the ticker list:
    # iterating the ticker list reported 1 for every match, however many times
    # the ticker was actually mentioned.
    found = Counter(word for word in title.upper().split() if word in known)
    return dict(found) if found else None

In [123]:
# Tag every title with the tickers it mentions; `count_tickers` yields None for
# titles without a match.
submissions_df["ticker_count"] = submissions_df["title"].swifter.apply(
    lambda title: count_tickers(proper_tickers=proper_tickers, title=title)
)
# Fill missing values across the whole frame with NaN. `np.nan` is used because
# the `np.NaN` alias was removed in NumPy 2.0.
submissions_df = submissions_df.fillna(value=np.nan)


This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.



Pandas Apply:   0%|          | 0/152236 [00:00<?, ?it/s]

In [124]:
# Show the id and ticker tally of every submission with a non-null `ticker_count`.
submissions_df.loc[submissions_df["ticker_count"].notna(), ["id", "ticker_count"]]

Unnamed: 0_level_0,id,ticker_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-06-04 21:34:58,ukwhl,{'VS': 1}
2012-06-22 00:14:21,verkm,{'VXX': 1}
2013-07-11 11:03:31,1i2ndg,{'RMTI': 1}
2013-11-06 18:22:12,1q1gyl,{'HIMX': 1}
2013-12-23 17:11:22,1tjgcx,"{'DDD': 1, 'ONCS': 1}"
...,...,...
2021-12-06 15:22:25,ra9ojr,{'NVDA': 1}
2021-12-06 15:23:50,ra9pp8,{'QQQ': 1}
2021-12-06 15:33:11,ra9xff,{'BABA': 1}
2021-12-06 15:39:47,raa2ri,{'BABA': 1}


In [None]:
# Per-day ticker mention totals. The original cell was unfinished: the lambda
# had no body (a SyntaxError), and a DatetimeIndex has no `.dt` accessor, so
# `.index.dt.date` would raise AttributeError; `.index.date` gives the dates.
# NOTE(review): summing the per-title tallies is an assumed completion of the
# missing lambda body — confirm it matches the intended analysis.
submissions_df.groupby(submissions_df.index.date).apply(
    lambda day_df: dict(
        sum((Counter(tally) for tally in day_df["ticker_count"].dropna()), Counter())
    )
)