In [None]:
import pandas as pd
import numpy as np
import requests 
import collections
import os
from datetime import datetime

# API + Model Functions 

## Todo: 
* utlize different models to improve sentiment analysis 


In [None]:
API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"
headers = {"Authorization": "Bearer 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"}

#API Function 
def roberta(payload):

	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

#Runs API and retrieves max sentiment + label 

def get_sentiment(string): 
    if string:
        label_dict = {'LABEL_0': "Negative", "LABEL_1": "Neutral", "LABEL_2": "Positive"} 
        query = {"inputs": string}
        
        #output is in a nested list
        print(query)
        output = roberta(query) 
        print(output)
        sentiments = output[0] 
        #returns the label with the max score!!!
        max_label = max(sentiments, key = lambda x: x['score'])['score']

        #return label_dict[max_label]
        return max_label
    return 

vectorized_sentiment = np.vectorize(get_sentiment) 


In [None]:
get_sentiment("hi")

{'inputs': 'hi'}
[[{'label': 'LABEL_0', 'score': 0.20451165735721588}, {'label': 'LABEL_1', 'score': 0.5521488189697266}, {'label': 'LABEL_2', 'score': 0.24333952367305756}]]


0.5521488189697266

In [None]:
#runs the sentiment in batches 
#bins format - [start index, end index]
def free_sentiment(data, bins):
    sentiment = []
    for i in range(len(bins)): 
        if i < len(bins) - 1:  
            cut_data = data[bins[i] : bins[i + 1]]
            text = cut_data['full_text'] 
            try: 
                output = text.apply(get_sentiment) 
                sentiment.extend(output.values)
            #error when you run out free runs     
            except KeyError: 
                print(f"indexes:{bins[i]}:{bins[i + 1]} did not run")
    return sentiment
            

In [None]:
#drop na values, outputs analyziable output 
def process_data(path, text_column): 
    
    data = pd.read_csv(path).dropna(subset = [text_column])
    #reindex dropped itemsb
    data = data.reset_index(drop=True)
    new = data[[text_column]].copy()
    return new


In [None]:
data = process_data("/work/stocktweets 03-08-2022 045358.csv", "text")
data.head()

Unnamed: 0,text
0,@HODL_247 That looks like a stress test for me...
1,@lilgurtec @NocbroNation @WalkingDead_AMC @AMC...
2,$qqq $tsla stay Sucker Free ü§∑ü§∑‚Äç‚ôÇÔ∏èü§∑üòÜüòÜüòÜ ‚û° https:...
3,@POTATO_INMYASS so can someone confirm did AMC...
4,"@AMCApe304 80%...plus 15% gme, 2% koss...diver..."


In [None]:
# DO NOT RUN 
first_batch = free_sentiment(data, [0, 1001])

In [None]:
test['huggingFace_sentiment'] = first_batch

In [None]:
test.to_csv("first_batch.csv")

# Data Processing

### Reddit Data Processing

In [None]:
def redditProcess(path): 
    #do not need to preprocess text since they can't be null
    data = pd.read_csv(path) 
    values = pd.unique(data["title"])

    #using most frequenst sentiment
    #change later
    sentiment = vectorized_sentiment(values)
    counter = collections.Counter(sentiment) 

   
    most_common = counter.most_common()[0][0]

    return most_common

vectorized_redditProcess = np.vectorize(redditProcess)

In [None]:
#figure out better way to do this 
stocks = ["AMC", "GME", "HOOD", "TLRY", "TSLA"]

In [None]:
def list_full_paths(directory):
    return [os.path.join(directory, file) for file in os.listdir(directory)]
 
dirs = list_full_paths("/work/Files/Reddit Files")
dirs

['/work/Files/Reddit Files/rposts_amc_3_29_2022.csv',
 '/work/Files/Reddit Files/rposts_gme_3_29_2022.csv',
 '/work/Files/Reddit Files/rposts_tsla_3_29_2022.csv',
 '/work/Files/Reddit Files/rposts_hood_3_29_2022.csv',
 '/work/Files/Reddit Files/rposts_tlry_3_29_2022.csv']

In [None]:
def reddit_sentiment(path, stock_names, date_string): 
    
    dirs = list_full_paths(path) 
    dates = np.full(shape = (len(dirs), ), fill_value = date_string)
    sentiment = vectorized_redditProcess(dirs)

    dict = {"stock": stock_names, "reddit_sentiment" : sentiment, "date": dates}
    output = pd.DataFrame(dict) 

    return output


In [None]:
stocks = ["AMC", "GME", "HOOD", "TLRY", "TSLA"]
reddit = reddit_sentiment("/work/Files/Reddit Files", stocks, "03/29/2022")

In [None]:
reddit.to_csv("reddit_sentiment.csv")

In [None]:
reddit

Unnamed: 0,stock,reddit_sentiment,date
0,AMC,Neutral,03/29/2022
1,GME,Neutral,03/29/2022
2,HOOD,Neutral,03/29/2022
3,TLRY,Neutral,03/29/2022
4,TSLA,Negative,03/29/2022


### Twitter

In [None]:
def twitterProcess(path): 
    data = process_data(path, "full_text") 
    if len(data) > 500: 
        sampled = data.sample(500) 
        vals = sampled["full_text"].values
    else: 
        vals = data['full_text'].values 

    #applies sentiment on val values
    sentiment = vectorized_sentiment(vals)
    
    #most frequent
    counter = collections.Counter(sentiment) 
    most_common = counter.most_common()[0][0]

    return most_common


vectorized_twitterProcess = np.vectorize(twitterProcess)

In [None]:
dirs = list_full_paths("/work/Files/Twitter Files")
dirs

['/work/Files/Twitter Files/tweepy 03-28-2022 TSLA.csv',
 '/work/Files/Twitter Files/tweepy 03-28-2022 GME.csv',
 '/work/Files/Twitter Files/tweepy 03-28-2022 TLRY.csv',
 '/work/Files/Twitter Files/tweepy 03-28-2022 HOOD.csv',
 '/work/Files/Twitter Files/tweepy 03-28-2022 AMC.csv']

In [None]:
def twitterSentiment(path, stock_names, date_string):
    dirs = list_full_paths(path) 
    dates = np.full(shape = (len(dirs), ), fill_value = date_string)
    
    sentiment = vectorized_twitterProcess(dirs) 
    dict = {"stock": stock_names, "twitter_sentiment" : sentiment, "date": dates}
    output = pd.DataFrame(dict) 

    return output


In [None]:
stocks = ["AMC", "GME", "HOOD", "TLRY", "TSLA"] 
twitter_sentiment = twitterSentiment("/work/Files/Twitter Files", stocks, "03/28/2022")

[[{'label': 'LABEL_0', 'score': 0.8063413500785828}, {'label': 'LABEL_1', 'score': 0.18046934902668}, {'label': 'LABEL_2', 'score': 0.01318931020796299}]]
{'inputs': 'More strange deals going on. #amc #amcshortsqueeze $amc #AMCNOTLEAVING https://t.co/M2XSJkgfeQ'}
[[{'label': 'LABEL_0', 'score': 0.4970993399620056}, {'label': 'LABEL_1', 'score': 0.4367676377296448}, {'label': 'LABEL_2', 'score': 0.06613298505544662}]]
{'inputs': "@BigSipInc @mongollidtrade I look at it this way, prog is in my investment portfolio so short term doesn't matter too much. I am playing this and a couple others very long, like $sens.\n It wasn't that earnings were bad they just didn't announce partnerships. $amc $hymc today though!"}
[[{'label': 'LABEL_0', 'score': 0.07010607421398163}, {'label': 'LABEL_1', 'score': 0.39509662985801697}, {'label': 'LABEL_2', 'score': 0.5347973108291626}]]
{'inputs': '@MrZackMorris Told you 5 days ago that $AMC was about to pop, and would you look at that, +40% today. Send it 

In [None]:
twitter_sentiment

Unnamed: 0,stock,twitter_sentiment,date
0,AMC,Neutral,03/28/2022
1,GME,Neutral,03/28/2022
2,HOOD,Neutral,03/28/2022
3,TLRY,Neutral,03/28/2022
4,TSLA,Neutral,03/28/2022


In [None]:
twitter_sentiment.to_csv("twitter_sentiment.csv")

### combining reddit and twitter data


In [None]:
twitter = pd.read_csv("twitter_sentiment.csv") 
reddit = pd.read_csv("reddit_sentiment.csv")

In [None]:
reddit

Unnamed: 0.1,Unnamed: 0,stock,reddit_sentiment,date
0,0,AMC,Neutral,03/29/2022
1,1,GME,Neutral,03/29/2022
2,2,HOOD,Neutral,03/29/2022
3,3,TLRY,Neutral,03/29/2022
4,4,TSLA,Negative,03/29/2022


In [None]:
twitter

Unnamed: 0.1,Unnamed: 0,stock,twitter_sentiment,date
0,0,AMC,Neutral,03/28/2022
1,1,GME,Neutral,03/28/2022
2,2,HOOD,Neutral,03/28/2022
3,3,TLRY,Neutral,03/28/2022
4,4,TSLA,Neutral,03/28/2022


In [None]:
final = pd.concat([twitter, reddit])[["stock", "date", "twitter_sentiment", "reddit_sentiment"]].fillna("")
final

Unnamed: 0,stock,date,twitter_sentiment,reddit_sentiment
0,AMC,03/28/2022,Neutral,
1,GME,03/28/2022,Neutral,
2,HOOD,03/28/2022,Neutral,
3,TLRY,03/28/2022,Neutral,
4,TSLA,03/28/2022,Neutral,
0,AMC,03/29/2022,,Neutral
1,GME,03/29/2022,,Neutral
2,HOOD,03/29/2022,,Neutral
3,TLRY,03/29/2022,,Neutral
4,TSLA,03/29/2022,,Negative


In [None]:
final["total_sentiment"] = final["twitter_sentiment"] + final["reddit_sentiment"]
    

In [None]:
final

Unnamed: 0,stock,date,twitter_sentiment,reddit_sentiment,total_sentiment
0,AMC,03/28/2022,Neutral,,Neutral
1,GME,03/28/2022,Neutral,,Neutral
2,HOOD,03/28/2022,Neutral,,Neutral
3,TLRY,03/28/2022,Neutral,,Neutral
4,TSLA,03/28/2022,Neutral,,Neutral
0,AMC,03/29/2022,,Neutral,Neutral
1,GME,03/29/2022,,Neutral,Neutral
2,HOOD,03/29/2022,,Neutral,Neutral
3,TLRY,03/29/2022,,Neutral,Neutral
4,TSLA,03/29/2022,,Negative,Negative


In [None]:
final.to_csv("TwitterReddit_sentiment.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=643f0a0a-e649-4860-b73b-f3561d8b41c9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>