## 1. Load libraries & data

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline

# Download NLTK resources (only the first time)
nltk.download('stopwords')
nltk.download('punkt')

# Load the CSV into a DataFrame
df = pd.read_csv('tweets-data.csv')

# Peek at the first rows to confirm the 'tweet' column exists
df.head()

[nltk_data] Downloading package stopwords to /Users/yael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner


## 2. Clean the Tweets Text

Define a function ```clean_text``` that:

- Converts to lowercase
- Removes URLs, mentions (@user), and hashtags (#tag)
- Strips out non-alphabetic characters
- Tokenizes and removes English stopwords
- Reconstructs the cleaned sentence

In [4]:
# Prepare English stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    """
    Clean tweet text by:
      - lowercasing
      - removing URLs, mentions, hashtags
      - removing non-letter characters
      - tokenizing and removing stopwords
      - rejoining cleaned tokens into a sentence
    """
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)     # remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)             # remove mentions/hashtags
    text = re.sub(r'[^a-z\s]', '', text)              # remove non-alphabetic chars
    tokens = word_tokenize(text)                      # tokenize
    tokens = [tok for tok in tokens if tok not in stop_words and len(tok) > 1]
    return ' '.join(tokens)

# Apply cleaning to the DataFrame
df['cleaned_text'] = df['Tweets'].astype(str).apply(clean_text)

# Show the cleaned text
df[['Tweets', 'cleaned_text']].head()

Unnamed: 0,Tweets,cleaned_text
0,@jacksonhinklle #wagner with 6.2 billion dolla...,billion dollar
1,Pobrecito es discapacitado\n#Reddetuiterosdemo...,pobrecito es discapacitado
2,News from the EIR Daily Alert\n\n“#Putin Addre...,news eir daily alert addressed people armed ju...
3,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,messi day
4,Il passaggio chiave di Machiavelli era questo ...,il passaggio chiave di machiavelli era questo ...


## 3. Sentiment Analysis with a Transformers Pipeline

We initialize the sentiment-analysis pipeline (which by default uses a distilbert-based model fine-tuned on SST-2). Then we apply it in batch to our cleaned tweets.

In [8]:
from transformers import pipeline

# Initialize the pipeline as before
classifier = pipeline('sentiment-analysis')

# Apply the classifier with truncation to avoid sequences >512 tokens
results = classifier(
    df['cleaned_text'].tolist(),
    batch_size=32,
    truncation=True,     # truncate any input longer than model max_length
    max_length=512       # optional: explicitly cap at 512 tokens
)

# Unpack the results into new DataFrame columns
df['sentiment_label'] = [res['label'] for res in results]
df['sentiment_score'] = [res['score'] for res in results]

# Check the DataFrame
df.head()

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag,cleaned_text,sentiment_label,sentiment_score
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner,billion dollar,POSITIVE,0.999244
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner,pobrecito es discapacitado,NEGATIVE,0.574673
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner,news eir daily alert addressed people armed ju...,NEGATIVE,0.878521
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner,messi day,NEGATIVE,0.999102
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner,il passaggio chiave di machiavelli era questo ...,POSITIVE,0.8503
