In [4]:
"""This part imports essential libraries and modules for the sentiment analysis procedure.
- `pandas`: Utilised for data manipulation and analysis, particularly for managing twitter data in DataFrame format.
- `torch`: The PyTorch library utilised for managing deep learning models, specifically for loading and executing the RoBERTa model.
- `RobertaTokenizer` and `RobertaForSequenceClassification`: Pre-trained tokeniser and model from the Hugging Face Transformers library, designed specifically for sentiment classification utilising the RoBERTa architecture.
- `softmax`: A PyTorch function that transforms raw model outputs (logits) into probabilities.
- `ast`: Offers utilities for parsing and evaluating Python expressions, advantageous for managing stored string data as dictionaries or lists.
- `re`: A module for regular expressions utilised in text processing and purification.
- `datetime`: A module for managing date and time functions, including the conversion of twitter timestamps."""

import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.nn.functional import softmax
import ast
import re
from datetime import datetime


tokenizer = RobertaTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
model = RobertaForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')

df = pd.read_csv('New_file.csv')  

df.drop_duplicates(inplace=True)

df.dropna(subset=['Content'], inplace=True)

"""This function `clean_tweet` refines raw tweet text by eliminating non-textual components that are irrelevant to sentiment analysis.

- `re.sub(r'http\S+', '', text)`: Eliminates URLs from the tweet content.
- `re.sub(r'@\w+', '', text)`: Eliminates mentions (e.g., @username) to concentrate on the content.
- `re.sub(r'#\w+', '', text)`: Eliminates hashtags while retaining the other content.
- `re.sub(r'[^A-Za-z0-9\s]+', '', text)`: Eliminates special characters, retaining only alphanumeric characters and spaces.
- `text.lower()`: Transforms the text to lowercase for consistency.
- `text.strip()`: Eliminates any preceding or succeeding whitespace.
The purified text is thereafter provided, prepared for sentiment analysis.
"""
def clean_tweet(text):
    text = re.sub(r'http\S+', '', text)  
    text = re.sub(r'@\w+', '', text)  
    text = re.sub(r'#\w+', '', text)  
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text) 
    text = text.lower()  
    return text.strip()

df['Content'] = df['Content'].apply(lambda x: clean_tweet(str(x)))

def hashtags_conversion(ht):
    try:
        return ast.literal_eval(ht) if isinstance(ht, str) else ht
    except:
        return []

df['Hashtags'] = df['Hashtags'].apply(hashtags_conversion)


def check_crypto(content, hashtags, crypto):
    text = (content if isinstance(content, str) else '').lower()
    hashtags_text = ' '.join(hashtags) if isinstance(hashtags, list) else ''
    return crypto in text or crypto in hashtags_text.lower()

"""The `get_sentiment` function forecasts the sentiment of the supplied text via a pre-trained RoBERTa model.

If the supplied text is null or invalid, it produces a default neutral emotion score of [0.33, 0.34, 0.33].
Utilises `tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)` to tokenise the input text and format it for model processing as PyTorch tensors compatible with RoBERTa.
- `model(**encoded_input)`: Inputs the tokenised data into the RoBERTa model for sentiment analysis.
- `softmax(output.logits, dim=-1)[0].tolist()`: Implements the softmax function on the model's output logits, transforming them into probability scores for negative, neutral, and positive sentiments.
The function yields the anticipated probabilities, indicating the emotion distribution."""

def predict_sentiment(text):
    if not text:
        return [0.33, 0.34, 0.33]  
    tokenised_input = text_tokeniser(cleaned_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    sentiment_output = sentiment_classifier(**tokenised_input)
    sentiment_probabilities = softmax(sentiment_output.logits, dim=-1)[0].tolist() 
    return sentiment_probabilities

"""This code enables the user to specify a certain date and subsequently receives up to 10 random tweets from that date utilising the select_random_tweets function. 
Subsequently, it employs a sentiment analysis function (get_sentiment) on each tweet to ascertain whether the sentiment is negative, neutral, or positive. 
The algorithm computes the average sentiment of the selected tweets and generates the forecasted sentiment scores for the specified date.
The user is informed if no tweets are accessible for the selected date."""

def select_random_tweets(df, date, num_tweets=10):
    df['Date'] = pd.to_datetime(df['Date']).dt.date 
    selected_date = pd.to_datetime(date).date()  
    date_tweets = df[df['Date'] == selected_date]  
    return date_tweets.sample(n=min(num_tweets, len(date_tweets))) 


results = []
cryptos = ['bitcoin', 'ethereum', 'ripple', 'litecoin', 'cardano']
for crypto in cryptos:
    crypto_df = df[df.apply(lambda x: check_crypto(x['Content'], x['Hashtags'], crypto), axis=1)]
    crypto_df['sentiment'] = crypto_df['Content'].apply(predict_sentiment)
    avg_sentiment = crypto_df['sentiment'].apply(pd.Series).mean().tolist()
    results.append({
        'Crypto': crypto,
        'Negative': avg_sentiment[0],
        'Neutral': avg_sentiment[1],
        'Positive': avg_sentiment[2]
    })


results_df = pd.DataFrame(results)
print(results_df)

user_date = input("Please enter the date (YYYY-MM-DD) for which you want to analyze tweets: ")

random_tweets = select_random_tweets(df, user_date)
if random_tweets.empty:
    print(f"No tweets found for the date {user_date}.")
else:
    print("Random Tweets from the Specified Date:")
    print(random_tweets[['Date', 'Content']])

    random_tweets['predicted_sentiment'] = random_tweets['Content'].apply(get_sentiment)
    
    average_sentiment = random_tweets['predicted_sentiment'].apply(pd.Series).mean().tolist()
    
    print("Predicted Sentiment for the Selected Tweets:")
    print(f"Negative: {average_sentiment[0]:.4f}")
    print(f"Neutral: {average_sentiment[1]:.4f}")
    print(f"Positive: {average_sentiment[2]:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crypto_df['sentiment'] = crypto_df['Content'].apply(get_sentiment)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crypto_df['sentiment'] = crypto_df['Content'].apply(get_sentiment)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crypto_df['sentiment'] = crypto_df['Content'].apply(get_sentiment)
A va

     Crypto  Negative   Neutral  Positive
0   bitcoin  0.161379  0.652599  0.186022
1  ethereum  0.093167  0.738724  0.168109
2    ripple  0.174270  0.725894  0.099836
3  litecoin  0.067808  0.560893  0.371299
4   cardano  0.133603  0.688160  0.178237
Please enter the date (YYYY-MM-DD) for which you want to analyze tweets: 2022-11-30
Random Tweets from the Specified Date:
            Date                                            Content
6746  2022-11-30                                                   
4954  2022-11-30  roboto games secures 15 million in series a fu...
8715  2022-11-30  zepverse is the multiverse \nwhere you can\nex...
8645  2022-11-30                                                   
4583  2022-11-30                                       when  pump 1
2930  2022-11-30  south korean prosecutors seek arrest warrants ...
6048  2022-11-30  whale alert in  usdt 12070704  120472210 from ...
1339  2022-11-30  1  1693674 257\n\ndetails\nchange 42369\n24h l...
9944  2022-11

In [2]:
df.head()

Unnamed: 0,Column1,Date,Username,Content,URL,Hashtags
0,0,2022-11-30 11:53:21+00:00,0xEthereumYoda,#Ethereum price update: \n\n#ETH $1269.23 USD\...,https://twitter.com/0xEthereumYoda/status/1597...,"[Ethereum, ETH, Bitcoin, BTC, altcoin, cryptoc..."
1,1,2022-11-30 11:53:21+00:00,Lawrenc32984128,@mtmalinen @ecb Do well to understand that eve...,https://twitter.com/Lawrenc32984128/status/159...,[BTC]
2,2,2022-11-30 11:53:21+00:00,NITESHP55784410,Kayla #Ethereum Harold #世界杯 Egbert #百家乐 Lavern...,https://twitter.com/NITESHP55784410/status/159...,"[Ethereum, 世界杯, 百家乐, 世界杯买球]"
3,3,2022-11-30 11:53:20+00:00,817coin,#Bitcoin https://t.co/2koLlCvCri https://t.co/...,https://twitter.com/817coin/status/15979217174...,[Bitcoin]
4,4,2022-11-30 11:53:18+00:00,slamtoken,"Yesterday we made a 3,000 SLAM buyback and loc...",https://twitter.com/slamtoken/status/159792171...,"[slamtoken, bnb, bsc]"
