In [None]:
#imports 
import os
from collections import Counter
from time import sleep

import numpy as np
import pandas as pd
import requests

# API Functions

In [None]:
# Hugging Face Inference API credentials.
# The token is taken from the HF_API_TOKEN environment variable when it is set;
# the literal is kept only as a backward-compatible fallback.
# NOTE(review): a token committed to a notebook should be treated as leaked --
# revoke it and drop the fallback once the environment variable is configured.
headers = {"Authorization": "Bearer " + (os.environ.get("HF_API_TOKEN") or "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")}

# Model 1: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
model1 = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest"

# Model 2: https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis
# label dict not needed, output displays score + sentiment
model2 = "https://api-inference.huggingface.co/models/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

# Model 3: https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english
model3 = "https://api-inference.huggingface.co/models/distilbert-base-uncased-finetuned-sst-2-english"


def get_sentiment(string, model, type = None, retry_delay = 20, max_retries = None):
    """Classify one piece of text with a Hugging Face Inference API model.

    The request is authenticated with the module-level ``headers`` dict.
    The highest-scoring entry of the model's answer is used.

    Parameters
    ----------
    string : str
        Text to run through the model.
    model : str
        Inference API URL of the model (see model1/model2/model3).
    type : {None, "score", "label"}, optional
        Selects the return value: "score" -> score only, "label" -> label
        only, anything else -> the (label, score) pair.
    retry_delay : float, optional
        Seconds to wait before retrying a failed request (default 20).
    max_retries : int or None, optional
        Maximum number of retries. None (default) retries indefinitely.

    Returns
    -------
    The lower-cased label, the score rounded to 3 decimals, or both as a
    ``(label, score)`` tuple, depending on ``type``.

    Raises
    ------
    RuntimeError
        If ``max_retries`` is set and every attempt failed.
    """
    payload = {"inputs": string}
    attempts = 0

    while True:
        try:
            # access model + obtain output
            response = requests.post(model, headers = headers, json = payload)
            output = response.json()[0]

            best = max(output, key = lambda x: x['score'])
            label = best['label'].lower()
            score = np.round(best['score'], decimals = 3)
            break
        # Only the failures a bad or unavailable response can produce are
        # retried: transport errors, an undecodable body (ValueError), or a
        # body without the expected list-of-lists shape (e.g. an error dict).
        # Anything else is a programming error and is allowed to propagate.
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as err:
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise RuntimeError(
                    "get_sentiment: no usable response from %s after %d attempts" % (model, attempts)
                ) from err
            sleep(retry_delay)

    # desired output
    if type == "score":
        return score
    if type == "label":
        return label

    return label, score
    

# Element-wise wrapper so get_sentiment can be applied to an array of texts
# (run_model calls it as vec_sentiment(text, model)).
vec_sentiment = np.vectorize(get_sentiment)

# MAIN FUNCTION

In [None]:
# Load the tweet export that process_Twitter is run on further down.
# The functions below read the columns 'ticker', 'full_text_preprocessed',
# 'favorite_count', 'retweet_count' and 'user.verified' from it.
raw_data = pd.read_csv("/work/Files/Twitter Files/twitter 04-06-2022.csv") 

In [None]:
def run_model(ticker_df, random_state = None):
    """Score a sample of one ticker's tweets with all three sentiment models.

    Sampling rules (engaged = at least 2 favorites or at least 1 retweet):
      * more than 150 engaged tweets -> random sample of 150 of them;
      * fewer than 10 engaged tweets -> random sample of a quarter of ALL of
        the ticker's tweets, but never fewer than one row;
      * otherwise                    -> every engaged tweet.

    Parameters
    ----------
    ticker_df : pandas.DataFrame
        Tweets of a single ticker with the columns 'text', 'favorite_count'
        and 'retweet_count' (plus whatever get_top20 / get_just20 read).
    random_state : optional
        Passed to ``DataFrame.sample`` to make the sampling reproducible.
        None (default) keeps the sampling unseeded.

    Returns
    -------
    tuple
        ``(mentions, data, top20, just20)``: the number of rows in
        ``ticker_df``, the sampled frame with a '<model>Sentiment' and a
        '<model>Score' column per model, and the results of get_top20 and
        get_just20 for ``ticker_df``.
    """
    named_models = [('model1', model1), ('model2', model2), ('model3', model3)]
    mentions = len(ticker_df)

    top20 = get_top20(ticker_df)
    just20 = get_just20(ticker_df)

    is_engaged = (ticker_df['favorite_count'] >= 2) | (ticker_df['retweet_count'] >= 1)
    ticker_df_filt = ticker_df[is_engaged].copy()

    if len(ticker_df_filt) > 150:
        data = ticker_df_filt.sample(150, random_state = random_state)
    elif len(ticker_df_filt) < 10:
        # A quarter of all mentions. For fewer than 4 mentions this would be
        # 0 rows, and an empty sample breaks the vectorized model call, so
        # take at least one row (and never more than exist).
        sample_size = min(mentions, max(1, mentions // 4))
        data = ticker_df.sample(sample_size, random_state = random_state)
    else:
        data = ticker_df_filt.copy()

    text = data['text'].values

    for name, model in named_models:
        if len(text) == 0:
            # np.vectorize cannot be called on size-0 input without otypes.
            sentiment, score = [], []
        else:
            sentiment, score = vec_sentiment(text, model)
        data[name + "Sentiment"] = sentiment
        data[name + "Score"] = score

    return mentions, data, top20, just20
    

In [None]:
#returns dictionary of dataframes for each stock ticker
def split_by_ticker(data):
    """Split the tweet table into one DataFrame per stock ticker.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'ticker' column. A 'full_text_preprocessed' column,
        if present, is renamed to 'text' in every returned frame.

    Returns
    -------
    dict
        Maps each ticker, in order of first appearance, to a new DataFrame
        holding that ticker's rows (original index kept). Rows whose ticker
        is missing (NaN/None) are left out, so no empty frame is produced.
    """
    columns = {'full_text_preprocessed': 'text'}
    # One pass over the data; sort=False keeps first-appearance order.
    # rename() returns a new frame, so the caller's data is never modified.
    return {
        ticker: group.rename(columns = columns)
        for ticker, group in data.groupby('ticker', sort = False)
    }

In [None]:
def get_top20(tweets):
    """Return the text of the (up to) 20 highest-ranked tweets.

    Tweets are ranked by 'user.verified', then 'retweet_count', then
    'favorite_count', all in descending order.
    """
    ranking_keys = ['user.verified', 'retweet_count', 'favorite_count']
    ranked = tweets.sort_values(by = ranking_keys, ascending = False)
    return ranked['text'].values[:20]

In [None]:
def get_just20(tweets):
    """Return tweet texts: all of them, or a random 20 if there are more than 20.

    The random draw is made without replacement.
    """
    texts = tweets['text']

    # Small inputs are returned whole, in their original order.
    if len(tweets) <= 20:
        return texts.values

    return texts.sample(n= 20,  replace=False).values

In [None]:
def determine_majority(df):
    """Return a copy of ``df`` with a 'Majority_Sentiment' column added.

    For each row the label given most often by the three models is used;
    when all three disagree, the first counted label (model1's) is used.
    """
    ballots = zip(df['model1Sentiment'], df['model2Sentiment'], df['model3Sentiment'])
    winners = []
    for ballot in ballots:
        ranked = Counter(ballot).most_common()
        winners.append(ranked[0][0])

    result = df.copy()
    result['Majority_Sentiment'] = winners
    return result

In [None]:
def process_Sentiment(sentimment_Data):
    """Summarize the three models' predictions into one overall sentiment.

    Parameters
    ----------
    sentimment_Data : pandas.DataFrame
        Needs the columns 'model{1,2,3}Sentiment' (labels such as
        'negative' / 'positive') and 'model{1,2,3}Score'.

    Returns
    -------
    dict
        'Negative Average'  : mean score of all 'negative' predictions
                              (NaN when there are none),
        'Positive Average'  : mean score of all 'positive' predictions
                              (NaN when there are none),
        'Overall Sentiment' : 'Negative' or 'Positive'; 'Neutral' only when
                              there is no negative and no positive prediction.

    Decision rule when both classes are present: if the negative/positive
    counts are within (0.75, 1.25) of each other, compare the plain averages;
    otherwise compare the averages weighted by each class's share of all
    predictions. A ratio below 1 means 'Positive', above 1 'Negative', and an
    exact tie is broken at random.
    """
    def _scores_for(label):
        # Scores of every prediction, across the three models, carrying `label`.
        per_model = [
            sentimment_Data[sentimment_Data[prefix + 'Sentiment'] == label][prefix + 'Score'].values
            for prefix in ('model1', 'model2', 'model3')
        ]
        return np.concatenate(per_model, axis = None)

    neg_arr = _scores_for('negative')
    pos_arr = _scores_for('positive')
    num_neg, num_pos = len(neg_arr), len(pos_arr)

    # An empty class has no meaningful average; report NaN without tripping
    # numpy's empty-slice warning.
    neg_avg = np.average(neg_arr) if num_neg else np.nan
    pos_avg = np.average(pos_arr) if num_pos else np.nan

    if num_neg == 0 and num_pos == 0:
        # Nothing was classified as negative or positive.
        final_sentimment = 'Neutral'
    elif num_pos == 0:
        # Dividing by num_pos here used to raise ZeroDivisionError.
        final_sentimment = 'Negative'
    elif num_neg == 0:
        final_sentimment = 'Positive'
    else:
        total_vals = len(sentimment_Data) * 3
        count_ratio = num_neg / num_pos

        if count_ratio > 0.75 and count_ratio < 1.25:
            sentimment_ratio = neg_avg / pos_avg
        else:
            neg_weight, pos_weight = num_neg / total_vals, num_pos / total_vals
            sentimment_ratio = (neg_avg * neg_weight) / (pos_avg * pos_weight)

        if sentimment_ratio == 1:
            final_sentimment = str(np.random.choice(['Negative', 'Positive']))
        elif sentimment_ratio < 1:
            final_sentimment = 'Positive'
        else:
            final_sentimment = 'Negative'

    output_dict = {'Negative Average' : neg_avg, 'Positive Average' : pos_avg, 'Overall Sentiment' : final_sentimment}
    return output_dict

In [None]:
def process_Twitter(data):
    """Run the sentiment pipeline for every ticker and tabulate the results.

    The input is split per ticker with split_by_ticker; each ticker's frame
    goes through run_model and then process_Sentiment. The returned
    DataFrame has one row per ticker with the mention count, the negative and
    positive averages, the overall sentiment and the two tweet selections
    returned by run_model.
    """
    frames_by_ticker = split_by_ticker(data)

    # Column-wise accumulator; key order is the column order of the result.
    summary = {
        "Ticker": [],
        "Mentions": [],
        "Negative Average": [],
        "Positive Average": [],
        "Overall Sentiment": [],
        "Top 20 Tweets": [],
        "20 Quality Tweets": [],
    }

    for ticker, ticker_frame in frames_by_ticker.items():
        mention_count, scored, top_tweets, sampled_tweets = run_model(ticker_frame)
        stats = process_Sentiment(scored)

        summary["Ticker"].append(ticker)
        summary["Mentions"].append(mention_count)
        summary["Negative Average"].append(stats['Negative Average'])
        summary["Positive Average"].append(stats['Positive Average'])
        summary["Overall Sentiment"].append(stats['Overall Sentiment'])
        summary["Top 20 Tweets"].append(top_tweets)
        summary["20 Quality Tweets"].append(sampled_tweets)

    return pd.DataFrame(summary)

In [None]:
# Run the whole pipeline on the loaded tweets.
# NOTE(review): the saved output of this cell ends in
# "ZeroDivisionError: division by zero", so the table shown by the next cell
# presumably comes from a different run -- confirm before relying on it.
output = process_Twitter(raw_data)

[[{'label': 'Negative', 'score': 0.7864415049552917}, {'label': 'Neutral', 'score': 0.18161430954933167}, {'label': 'Positive', 'score': 0.031944192945957184}]]
[[{'label': 'Negative', 'score': 0.2174510806798935}, {'label': 'Neutral', 'score': 0.6718700528144836}, {'label': 'Positive', 'score': 0.11067892611026764}]]
[[{'label': 'Negative', 'score': 0.041672900319099426}, {'label': 'Neutral', 'score': 0.8681623339653015}, {'label': 'Positive', 'score': 0.09016476571559906}]]
[[{'label': 'Negative', 'score': 0.14725720882415771}, {'label': 'Neutral', 'score': 0.5612739324569702}, {'label': 'Positive', 'score': 0.2914688289165497}]]
[[{'label': 'Negative', 'score': 0.024468176066875458}, {'label': 'Neutral', 'score': 0.5341298580169678}, {'label': 'Positive', 'score': 0.44140198826789856}]]
[[{'label': 'Negative', 'score': 0.5299965739250183}, {'label': 'Neutral', 'score': 0.453073114156723}, {'label': 'Positive', 'score': 0.01693030074238777}]]
[[{'label': 'Negative', 'score': 0.003819

ZeroDivisionError: division by zero

In [None]:
# Display the per-ticker summary table.
output

Unnamed: 0,Ticker,Mentions,Negative Average,Positive Average,Overall Sentiment,Top 20 Tweets,20 Quality Tweets
0,$GME,624,0.917613,0.87462,Negative,[gme flow putcall ratio call premium million ...,[lol you think people are gonna sell amc while...
1,$TSLA,2376,0.903769,0.920804,Negative,[fortunate timing for tesla opening two ev fac...,"[tsla crashing, tsla cnbc faked us by making i..."
2,$TWTR,2222,0.906162,0.902414,Negative,[musk admits his twitter investment isnt passi...,[twtr mania if you think there is a takeover i...
3,$AMC,3100,0.887694,0.885642,Negative,[hey charlesyou recommended the weekly amc put...,"[amc oops blew your load to early huh ken, amc..."
4,$SPY,4866,0.904432,0.918925,Negative,[when spy gets short term oversold amp comes i...,[spy watching it test avwap lower range of th...
5,$HMHC,31,0.9516,0.9285,Negative,[amusing to see rwallstreetbets going after th...,[houghton mifflin harcourt company traded m sh...
6,$DWAC,237,0.930353,0.916121,Negative,[dwac flow putcall ratio call premium million...,[dwac s delayed issued press release on april ...
7,$AMD,743,0.942752,0.87244,Negative,[amd inbound for another retest of the price ...,[amd like it here for a long just need a boun...
8,$SST,383,0.929826,0.890256,Negative,[sst calls purchased last week returned toda...,"[sst with da bounce, sst that is all, join our..."
9,$AAPL,671,0.931726,0.89989,Negative,[aapl has surpassed million option contracts t...,[crazy bc stocks have already fallen or more ...


In [None]:
# Load a CSV from the Outputs folder for inspection.
# NOTE(review): presumably a summary saved by an earlier run of this pipeline
# (it has a 'Top 20 Tweets' column) -- confirm.
data = pd.read_csv("/work/Modeling/Outputs/twitter 04-02-2022.csv")

In [None]:
# Show the 'Top 20 Tweets' column of the loaded CSV.
# NOTE(review): each entry appears to be the string form of a list (it starts
# with "['"), not an actual list -- parse it before treating it as one.
data['Top 20 Tweets']

0     ['about to drop this  gme pokey editions stay ...
1     ['my wife is my guest to the cyber rodeo and i...
2     ['twtr got the break and retest but left us wi...
3     ['this is not accurate naked shorting abuses o...
4     ['spy resistance turned support turned resista...
5     ['hmhc open interest increases  on the week to...
6     ['donald trumps social media app was supposed ...
7     ['amd head and shoulders forming on the weekly...
8     ['im just saying it seems like theres some sen...
9     ['the market can be confusing but following un...
10    ['amzn super strong in aprilpay no mind to sep...
11    ['via heat map of how mega cap stocks b market...
12    ['and now a thread on stupidity see whiny twee...
13    ['netflix bought m carbon credits in this cut ...
14    ['coach i need to see another qqq pump commerc...
15    ['i took the time to normalize pltr quarterly ...
16    ['fb longterm trendline holds as support with ...
17    ['the market has a way of making you doubt

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=643f0a0a-e649-4860-b73b-f3561d8b41c9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>