# Twitter Preprocessing + Model

In [None]:
#imports 
import os

import numpy as np
import pandas as pd
import requests

# Model Functions

In [None]:
# Hugging Face Inference API credentials.
# SECURITY: the access token used to be hard-coded on this line. A token that
# has been committed to source must be treated as leaked: revoke it and supply
# a fresh one through the HF_TOKEN environment variable instead.
_hf_token = os.environ.get("HF_TOKEN", "").strip()
# With no token configured, no Authorization header is sent.
# NOTE(review): the API may reject or rate-limit unauthenticated calls - confirm.
headers = {"Authorization": "Bearer " + _hf_token} if _hf_token else {}

# Model 1: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

# This model answers with generic LABEL_n names; map them to sentiment names.
model1_dict = {'LABEL_0': "Negative", "LABEL_1": "Neutral", "LABEL_2": "Positive"}

model1 = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"

# Model 2: https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

# No label dict needed: the output already carries score + sentiment name.
model2 = "https://api-inference.huggingface.co/models/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

# Model 3: https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english
model3 = "https://api-inference.huggingface.co/models/distilbert-base-uncased-finetuned-sst-2-english"


def get_sentiment(string, model, type = None):
    """Classify one piece of text with a hosted Hugging Face model.

    Parameters
    ----------
    string : text to run through the model.
    model : model endpoint URL (model1 / model2 / model3 defined above).
    type : "score" to return only the score, "label" to return only the
        label; any other value (default None) returns both. The name shadows
        the builtin but is kept so existing keyword callers keep working.

    Returns
    -------
    The lower-cased winning label, its score rounded to 3 decimals, or the
    pair (label, score), depending on `type`.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If the response body is not a non-empty list of candidate lists.
    """
    # Uses the module-level `headers`, so the credential lives in one place.
    response = requests.post(model, headers = headers, json = {"inputs": string})
    response.raise_for_status()

    body = response.json()
    if not isinstance(body, list) or not body:
        raise ValueError("Unexpected response from " + str(model) + ": " + repr(body))
    candidates = body[0]

    # Model 1 answers with generic LABEL_n names; translate them first so the
    # neutral entry can be recognised by name for every model.
    translate = model == model1
    scored = []
    for entry in candidates:
        name = entry['label']
        if translate:
            name = model1_dict.get(name, name)
        scored.append((str.lower(name), entry['score']))

    # Remove the neutral label by name rather than by position: the position
    # of the neutral entry depends on how the API orders its results.
    decisive = [pair for pair in scored if pair[0] != 'neutral'] or scored

    label, best_score = max(decisive, key = lambda pair: pair[1])
    score = np.round(best_score, decimals = 3)

    # desired output
    if type == "score":
        return score
    if type == "label":
        return label

    return label, score
    

# Element-wise version of get_sentiment for arrays of texts. Called with the
# default `type`, get_sentiment returns (label, score), so run_model unpacks
# this wrapper's result into a labels array and a scores array.
vec_sentiment = np.vectorize(get_sentiment)

# MAIN FUNCTION

In [None]:
def process_Twitter(data):
    """Build the per-ticker sentiment summary for one batch of tweets.

    The tweets are split by ticker, each ticker's tweets are scored with
    run_model, and the scores are aggregated with process_Sentiment.

    Returns a DataFrame with one row per ticker and the columns Ticker,
    Mentions, Negative Average, Positive Average, Overall Sentiment,
    Top 20 Tweets and 20 Quality Tweets.
    """
    frames_by_ticker = split_by_ticker(data)

    # One list per output column; the Ticker list also drives the loop.
    summary = {
        "Ticker": list(frames_by_ticker.keys()),
        "Mentions": [],
        "Negative Average": [],
        "Positive Average": [],
        "Overall Sentiment": [],
        "Top 20 Tweets": [],
        "20 Quality Tweets": [],
    }

    for symbol in summary["Ticker"]:
        count, scored, top_tweets, sampled_tweets = run_model(frames_by_ticker[symbol])
        stats = process_Sentiment(scored)

        summary["Mentions"].append(count)
        for column in ("Negative Average", "Positive Average", "Overall Sentiment"):
            summary[column].append(stats[column])
        summary["Top 20 Tweets"].append(top_tweets)
        summary["20 Quality Tweets"].append(sampled_tweets)

    return pd.DataFrame(summary)

In [None]:
#returns dictionary of dataframes for each stock ticker
def split_by_ticker(data):
    """Split `data` into one DataFrame per distinct value of its 'ticker' column.

    Each returned frame is an independent copy of the matching rows, with the
    'full_text_preprocessed' column renamed to 'text'. Keys follow the order
    in which the tickers first appear.
    """
    renamed = {'full_text_preprocessed': 'text'}
    ticker_column = data['ticker']

    return {
        symbol: data[ticker_column == symbol].copy().rename(columns = renamed)
        for symbol in ticker_column.unique()
    }


In [None]:
#have this return top20 (tweets), just20 (filter on quality)

def run_model(ticker_df, max_samples = 150):
    """Score one ticker's tweets with all three sentiment models.

    Parameters
    ----------
    ticker_df : DataFrame of tweets for a single ticker; its 'text' column is
        what gets sent to the models.
    max_samples : largest number of tweets to score. Larger frames are
        randomly down-sampled to this many rows (default 150).

    Returns
    -------
    (mentions, data, top20, just20)
        mentions - number of rows in `ticker_df` before sampling
        data     - the scored rows, with a '<model>Sentiment' and a
                   '<model>Score' column added for model1, model2 and model3
        top20    - result of get_top20(ticker_df)
        just20   - result of get_just20(ticker_df)
    """
    model_names = ((model1, 'model1'), (model2, 'model2'), (model3, 'model3'))
    mentions = len(ticker_df)

    top20 = get_top20(ticker_df)
    just20 = get_just20(ticker_df)

    if mentions > max_samples:
        data = ticker_df.sample(max_samples)
    else:
        # Work on a copy so the new columns are not written into the caller's frame.
        data = ticker_df.copy()

    text = data['text'].values

    for model, name in model_names:
        if len(text):
            sentiment, score = vec_sentiment(text, model)
        else:
            # np.vectorize refuses size-0 input; an empty frame gets empty columns.
            sentiment, score = [], []
        data[name + "Sentiment"] = sentiment
        data[name + "Score"] = score

    return mentions, data, top20, just20
    

In [None]:
def process_Sentiment(sentimment_Data):
    """Aggregate the three models' per-tweet results into one verdict.

    Parameters
    ----------
    sentimment_Data : DataFrame with 'model1Sentiment' / 'model1Score' (and
        likewise for model2 and model3) columns, as produced by run_model.

    Returns
    -------
    dict with keys
        'Negative Average'  - mean score over every 'negative' prediction of
                              any model (NaN when there are none)
        'Positive Average'  - same for 'positive' predictions
        'Overall Sentiment' - 'Negative' or 'Positive', whichever side has the
                              higher average. A side with no predictions
                              counts as 0, and an exact tie (including "no
                              predictions at all") is broken at random.
    """
    def _scores_for(label):
        # Pool the scores of every model that assigned `label` to a tweet.
        per_model = [
            sentimment_Data.loc[sentimment_Data[prefix + 'Sentiment'] == label,
                                prefix + 'Score'].values
            for prefix in ('model1', 'model2', 'model3')
        ]
        return np.concatenate(per_model, axis = None)

    neg_arr = _scores_for('negative')
    pos_arr = _scores_for('positive')

    # Averaging an empty array only yields NaN plus a RuntimeWarning; report
    # NaN explicitly instead.
    neg_avg = np.average(neg_arr) if len(neg_arr) else np.nan
    pos_avg = np.average(pos_arr) if len(pos_arr) else np.nan

    # Compare the averages directly instead of dividing them: a NaN or zero
    # average must not silently decide the verdict.
    neg_strength = neg_avg if len(neg_arr) else 0.0
    pos_strength = pos_avg if len(pos_arr) else 0.0

    if neg_strength > pos_strength:
        final_sentiment = 'Negative'
    elif neg_strength < pos_strength:
        final_sentiment = 'Positive'
    else:
        final_sentiment = str(np.random.choice(['Negative', 'Positive']))

    output_dict = {'Negative Average' : neg_avg, 'Positive Average' : pos_avg, 'Overall Sentiment' : final_sentiment}
    return output_dict

In [None]:
def get_top20(tweets):
    """Return the text of the 20 highest-ranked tweets.

    Tweets are ranked in descending order by 'user.verified', then
    'retweet_count', then 'favorite_count'. Fewer than 20 texts are returned
    when `tweets` has fewer rows. The input frame is not modified.
    """
    ranking_keys = ['user.verified', 'retweet_count', 'favorite_count']
    ranked = tweets.sort_values(by = ranking_keys, ascending = False)
    return ranked['text'].values[:20]

In [None]:
def get_just20(tweets):
    """Return up to 20 tweet texts from the 'text' column.

    Frames with 20 rows or fewer are returned whole, in their existing order;
    larger frames are reduced to a random sample of 20 texts drawn without
    replacement.
    """
    texts = tweets['text']

    if len(tweets) <= 20:
        return texts.values

    return texts.sample(n = 20, replace = False).values


In [None]:
# Run the full pipeline on the 04-02-2022 tweet export and write the
# per-ticker summary produced by process_Twitter to the outputs folder.
raw_data = pd.read_csv("/work/twitter 04-02-2022.csv") 
processed = process_Twitter(raw_data)
processed.to_csv("/work/Modeling/Outputs/twitter 04-02-2022.csv")

In [None]:
# Run the full pipeline on the 04-03-2022 tweet export and write the
# per-ticker summary produced by process_Twitter to the outputs folder.
raw_data = pd.read_csv("/work/twitter 04-03-2022.csv") 
processed = process_Twitter(raw_data)
processed.to_csv("/work/Modeling/Outputs/twitter 04-03-2022.csv")

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [None]:
# Run the full pipeline on the 04-04-2022 tweet export and write the
# per-ticker summary produced by process_Twitter to the outputs folder.
raw_data = pd.read_csv("/work/twitter 04-04-2022.csv") 
processed = process_Twitter(raw_data)
processed.to_csv("/work/Modeling/Outputs/twitter 04-04-2022.csv")

In [None]:
# Run the full pipeline on the 04-05-2022 tweet export and write the
# per-ticker summary produced by process_Twitter to the outputs folder.
raw_data = pd.read_csv("/work/twitter 04-05-2022.csv") 
processed = process_Twitter(raw_data)
processed.to_csv("/work/Modeling/Outputs/twitter 04-05-2022.csv")

In [None]:
# Run the full pipeline on the 04-06-2022 tweet export and write the
# per-ticker summary produced by process_Twitter to the outputs folder.
raw_data = pd.read_csv("/work/twitter 04-06-2022.csv") 
processed = process_Twitter(raw_data)
processed.to_csv("/work/Modeling/Outputs/twitter 04-06-2022.csv")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=643f0a0a-e649-4860-b73b-f3561d8b41c9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>