In [2]:
import csv
import pickle

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Read the tweet timeline CSV into a DataFrame and preview its first rows
tweets_csv_path = "C:\\Users\\Connor\\Desktop\\Coding\\nft_market_a\\scripts_\\twitter_nft_timeline.csv"
df = pd.read_csv(tweets_csv_path)
df.head()

Unnamed: 0,Tweet ID,Username,User ID,Tweet Date,Text
0,1658583425047994368,SoldaDDH,1496876147380006915,2023-05-16 21:21:18+00:00,@factions_nft @_MatrixNFT @0xPolygonLabs YEAH
1,1658583423705808897,Kresusofficial,1582118621429391361,2023-05-16 21:21:17+00:00,🎨 Dive into the #NFT world with Kresus! \n\nMi...
2,1658583409609035776,JohnDaWinters,1450824144908652551,2023-05-16 21:21:14+00:00,XRP Ledger Among Top 10 NFT Chains with Over 1...
3,1658583403753533441,deLarg0,113960166,2023-05-16 21:21:13+00:00,@ulices_vc_nft caileeee
4,1658583392873488387,ContentLawyer,209614352,2023-05-16 21:21:10+00:00,"First Free Spacebrat #NFT Minted Over 300,000 ..."


In [4]:
# Summarise the frame: row count, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Tweet ID    17 non-null     int64 
 1   Username    17 non-null     object
 2   User ID     17 non-null     int64 
 3   Tweet Date  17 non-null     object
 4   Text        17 non-null     object
dtypes: int64(2), object(3)
memory usage: 808.0+ bytes


In [5]:
def process_text(tweet):
    """Mask user mentions and links in a tweet.

    The text is split on single space characters only, so tokens separated by
    other whitespace (for example a newline) stay glued together and are
    examined as one token.

    Args:
        tweet: The raw tweet text (a string).

    Returns:
        The tweet with every token that starts with '@' and is longer than one
        character replaced by '@user', every other token that starts with
        'http' replaced by 'http', and all remaining tokens unchanged, joined
        back together with single spaces.
    """
    def _mask(token):
        # A lone '@' is not a mention, hence the length check
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return "http"
        return token

    return " ".join(_mask(token) for token in tweet.split(' '))

In [6]:
# Preview the first rows again; in the cells shown here df has not been modified since it was loaded
df.head()

Unnamed: 0,Tweet ID,Username,User ID,Tweet Date,Text
0,1658583425047994368,SoldaDDH,1496876147380006915,2023-05-16 21:21:18+00:00,@factions_nft @_MatrixNFT @0xPolygonLabs YEAH
1,1658583423705808897,Kresusofficial,1582118621429391361,2023-05-16 21:21:17+00:00,🎨 Dive into the #NFT world with Kresus! \n\nMi...
2,1658583409609035776,JohnDaWinters,1450824144908652551,2023-05-16 21:21:14+00:00,XRP Ledger Among Top 10 NFT Chains with Over 1...
3,1658583403753533441,deLarg0,113960166,2023-05-16 21:21:13+00:00,@ulices_vc_nft caileeee
4,1658583392873488387,ContentLawyer,209614352,2023-05-16 21:21:10+00:00,"First Free Spacebrat #NFT Minted Over 300,000 ..."


In [7]:
# Run the cleaning function over every tweet and keep the result in a new
# column alongside the raw text
df['processed_text'] = df['Text'].map(process_text)

# Show the cleaned tweets
print(df['processed_text'])

0                                @user @user @user YEAH
1     🎨 Dive into the #NFT world with Kresus! \n\nMi...
2     XRP Ledger Among Top 10 NFT Chains with Over 1...
3                                        @user caileeee
4     First Free Spacebrat #NFT Minted Over 300,000 ...
5     @user dammnn welcome diabetes :P ahhaha GM and...
6     I've just created an NFT Collection on Goerli ...
7     Ubisoft Wants To Sell Ezio's 'Digital Soul' An...
8     #NFT Time! http one day volume is 34.68 $eth\n...
9     Another, and probably my last Nft stream on wh...
10        Let’s $SWTS, let’s $MOB, let’s Goo @user http
11                   @user @user $OXBT let’s fucking go
12                         l wanna buy cool #NFT 😍😍💸💸⬇️
13    @user @user You know it! @user was a rug and t...
14    @user @user Oh fxck!!! I need to go undock my ...
15    @user @user Thank you so much for joining us t...
16                        @user Accept the challenge. k
Name: processed_text, dtype: object


In [8]:
# Model setup
# Checkpoint name shared by the tokenizer and the classifier
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

# Pre-trained tokenizer for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(roberta)

# Sequence-classification model for the same checkpoint
model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Class names, looked up by the position of a score in the model output
labels = ['Negative', 'Neutral', 'Positive']

In [10]:
# Sentiment analysis
# Average the per-tweet class probabilities over the whole data set and report
# the label with the highest mean probability.

# Running total of the per-tweet probabilities, shape (1, number of labels)
total_scores = np.zeros((1, len(labels)))

# Inference only: no_grad() stops PyTorch from recording an autograd graph for
# every forward pass, which saves memory and time.
with torch.no_grad():
    for tweet_proc in df['processed_text']:
        # Tokenize the tweet and return PyTorch tensors. Over-long texts are
        # cut to 512 tokens (the input limit of RoBERTa-base checkpoints) so a
        # single long tweet cannot make the forward pass fail.
        encoded_tweet = tokenizer(
            tweet_proc, return_tensors='pt', truncation=True, max_length=512
        )

        # Pass everything the tokenizer produced (input IDs, attention mask)
        # to the model by keyword
        output = model(**encoded_tweet)

        # Convert the logits to a NumPy array of shape (1, number of labels)
        logits = output.logits.numpy()

        # Turn the logits into probabilities. scipy's softmax shifts by the
        # maximum before exponentiating, so large logits cannot overflow the
        # way a hand-written exp(x) / sum(exp(x)) can.
        scores = softmax(logits, axis=1)

        # Add the scores to the total scores
        total_scores += scores

# Calculate the average scores
average_scores = total_scores / len(df['processed_text'])

# Find the index of the highest average score
max_index = np.argmax(average_scores)

print(f"{labels[max_index]}: {average_scores[0][max_index]}")

Positive: 0.43559683295076385


In [12]:
# Write the model to a local directory with save_pretrained
model_dir = 'C:\\Users\\Connor\\Desktop\\Coding\\nft_market_a\\models\\roberta_sentiment_model'
model.save_pretrained(model_dir)