## 1. Load libraries & data

In [9]:
# Import required libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download every NLTK resource the notebook relies on in one place:
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' / 'punkt_tab' back word_tokenize (recent NLTK releases load the
#     'punkt_tab' tables, older ones the pickled 'punkt' models)
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the CSV into a DataFrame
df = pd.read_csv('tweets-data.csv')

# Inspect the first few rows
df.head()

[nltk_data] Downloading package stopwords to /Users/yael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner


## 2. Define Sentiment Function

We create a function get_sentiment(text) that returns a tuple (label, score) where:

- `label` is "positive" when the compound score is ≥ 0.05, "negative" when it is ≤ -0.05, and "neutral" otherwise
- `score` is the VADER compound score

In [10]:
# Initialize the VADER sentiment analyzer once at module level;
# get_sentiment reads this global on every call.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text: str, pos_threshold: float = 0.05,
                  neg_threshold: float = -0.05) -> tuple:
    """
    Compute VADER sentiment for a given text.

    The text is scored with the module-level `analyzer` and the resulting
    compound score is bucketed into a label.

    Args:
        text: the text to score.
        pos_threshold: compound scores at or above this value are labelled
            'positive' (default 0.05).
        neg_threshold: compound scores at or below this value are labelled
            'negative' (default -0.05).
    Returns:
        sentiment_label (str): 'positive', 'neutral', or 'negative'
        compound_score (float): the VADER compound score
    Raises:
        ValueError: if neg_threshold is greater than pos_threshold, because
            the two bands would then overlap.
    """
    if neg_threshold > pos_threshold:
        raise ValueError(
            "neg_threshold must not be greater than pos_threshold"
        )

    compound = analyzer.polarity_scores(text)['compound']

    # The positive check runs first, so a score that satisfies both bounds
    # (only possible when the thresholds are equal) is labelled 'positive'.
    if compound >= pos_threshold:
        label = 'positive'
    elif compound <= neg_threshold:
        label = 'negative'
    else:
        label = 'neutral'
    return label, compound

## 3. Clean Tweet Text

We define clean_text(text) to:

- Convert to lowercase
- Remove URLs, mentions (@user), hashtags (#tag)
- Strip out non-alphabetic characters
- Tokenize and remove English stopwords
- Reconstruct the cleaned sentence

In [11]:
# Prepare stopword list: built once as a set, which clean_text checks
# every token against.
stop_words = set(stopwords.words('english'))

def clean_text(text: str, keep_words=('no', 'nor', 'not')) -> str:
    """
    Clean tweet text by:
      - lowercasing
      - removing URLs, mentions, hashtags
      - removing non-letter characters
      - tokenizing and removing stopwords (except `keep_words`)
      - rejoining cleaned tokens

    Args:
        text: raw tweet text. Anything that is not a str (for example the
            NaN pandas uses for a missing cell) is treated as empty.
        keep_words: lowercase tokens that are never dropped, even when they
            are in `stop_words`. Defaults to the negators 'no', 'nor' and
            'not', which NLTK's English stopword list contains; removing
            them would turn "not good" into "good" before sentiment scoring.
            Pass () to remove every stopword.
    Returns:
        The surviving tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''

    kept = frozenset(keep_words)

    text = text.lower()
    # The dot after "www" is escaped so that only "www.<something>" is treated
    # as a URL; an unescaped '.' would also swallow e.g. "www now".
    text = re.sub(r'http\S+|www\.\S+', '', text)     # remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)            # remove mentions/hashtags
    text = re.sub(r'[^a-z\s]', '', text)             # remove non-letters
    tokens = word_tokenize(text)
    tokens = [tok for tok in tokens if tok in kept or tok not in stop_words]
    return ' '.join(tokens)

# Make sure the punkt_tab tokenizer tables are available, then derive the
# cleaned column by running clean_text over every tweet
nltk.download('punkt_tab')
df['cleaned_text'] = df['Tweets'].map(clean_text)

[nltk_data] Downloading package punkt_tab to /Users/yael/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 4. Apply Sentiment Function and Add Columns

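Finally, we apply `get_sentiment` to the `cleaned_text` column, unpack the resulting `(label, score)` tuples, and store them in two new columns: `sentiment_label` and `sentiment_score`. Note that VADER's lexicon is built for English, so non-English tweets mostly come out as neutral with a score of 0.0 (see rows 1 and 4 below).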

In [12]:
# Score every cleaned tweet; each element of `sentiments` is a (label, score) tuple
sentiments = df['cleaned_text'].map(get_sentiment)

# Split the tuples into one column per component
df['sentiment_label'] = sentiments.map(lambda pair: pair[0])
df['sentiment_score'] = sentiments.map(lambda pair: pair[1])

# Display the updated DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag,cleaned_text,sentiment_label,sentiment_score
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner,billion dollar,neutral,0.0
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner,pobrecito es discapacitado,neutral,0.0
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner,news eir daily alert addressed people armed ju...,positive,0.296
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner,messi day,neutral,0.0
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner,il passaggio chiave di machiavelli era questo ...,neutral,0.0
