# Load the CSV
We load the CSV file created by the scraper in the first part.


In [None]:
from pathlib import Path

import pandas as pd
from pandas import DataFrame

from utils import get_path

# Resolve the location of the tweet CSV export.
tweet_path: Path = get_path("./tweets_backup_15-02.csv")

# Read the export with explicit column types. `created_at` is parsed with a
# fixed format string, and the tweet `id` column becomes the index.
df: DataFrame = pd.read_csv(
    tweet_path,
    dtype={
        'id': int,
        'username': 'string',
        'text': 'string',
        'retweets': int,
        'likes': int,
    },
    parse_dates=['created_at'],
    date_format='%a %b %d %H:%M:%S %z %Y',
).set_index('id')

# Print the column/dtype overview, then show summary statistics for the frame.
df.info()
df.describe()

## General info

In [None]:
# Preview the first five rows of the loaded tweets.
df.head(n=5)

## Numerics

In [None]:
# Ten tweets with the highest retweet counts, limited to the columns of interest.
most_retweeted = df[["username", "text", "retweets"]].nlargest(10, "retweets")
most_retweeted

In [None]:

# Ten tweets with the highest like counts, limited to the columns of interest.
most_liked = df[["username", "text", "likes"]].nlargest(10, "likes")
most_liked


## Time analysis
We try to understand how the tweets are distributed over time.

In [None]:
# Derive calendar-date and hour-of-day columns from the parsed timestamps.
created = df["created_at"].dt
df["date"] = created.date
df["hour"] = created.hour
df.head()

In [None]:
# Number of tweets (non-missing texts) for each calendar date.
daily_counts = df.groupby("date")["text"].count()
tweets_per_day = daily_counts.rename("tweets").to_frame()
tweets_per_day


In [None]:
# Number of tweets (non-missing texts) for each hour of the day.
hourly_counts = df.groupby("hour")["text"].count()
tweets_per_hour = hourly_counts.rename("tweets").to_frame()
tweets_per_hour

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the hourly tweet counts, labelled through the returned Axes.
ax = tweets_per_hour.plot.bar(figsize=(10, 5), color="royalblue")
ax.set_xlabel("Hour of the Day")
ax.set_ylabel("Tweet Count")
ax.set_title("Tweet Frequency by Hour")
plt.show()

## Most active users

In [None]:
# Tweet count per username, most frequent first; keep the top ten.
tweets_by_user = df["username"].value_counts()
active_users = tweets_by_user.iloc[:10]
active_users

## Words Analysis

### Frequency

In [None]:
import re
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stopword corpus, then load the Italian stopword list
nltk.download('stopwords')
stopwords_it = set(stopwords.words("italian"))

# Combine all tweets into a single lowercase text. Missing texts are dropped
# first: str.join raises TypeError on non-string items, and the other text
# analyses in this notebook (hashtags, artist counts) skip missing values too.
all_text = " ".join(df["text"].dropna()).lower()

# Extract words (only those longer than 6 characters & not in stopwords)
words = [
    word for word in re.findall(r'\b\w+\b', all_text)
    if len(word) > 6 and word not in stopwords_it
]

# Count word occurrences and keep the 20 most common as a two-column table
word_freq = Counter(words).most_common(20)
word_df = pd.DataFrame(word_freq, columns=["word", "frequency"])
word_df


In [None]:
# Plot the word frequencies with seaborn
import seaborn as sns
# Horizontal bar chart of the most frequent words.
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=word_df, x="frequency", y="word", palette="viridis")

# Axis labels, title, and dashed grid lines along the x axis
ax.set_xlabel("Count")
ax.set_ylabel("Word")
ax.set_title("Top 20 Most Frequent Words in Sanremo Tweets")
ax.grid(axis="x", linestyle="--", alpha=0.7)

# Render the figure
plt.show()

### Hashtags

In [None]:
from collections import Counter
import re


def extract_words(text):
    """Return every hashtag found in *text*, lowercased.

    A hashtag is a ``#`` followed by one or more word characters. Matches are
    returned in order of appearance, duplicates included.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r"#\w+", lowered)]

# One list of hashtags per tweet; tweets with missing text are skipped.
all_hashtags = df["text"].dropna().apply(extract_words)

# Tally every hashtag across all tweets and show the ten most common.
hashtags = Counter()
for tags in all_hashtags:
    hashtags.update(tags)
hashtags.most_common(10)

## Sentiment Analysis

In [None]:
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of *text* (-1 to 1).

    Non-string input (e.g. a missing tweet text) yields NaN instead of raising,
    so the function can be applied to a column that contains missing values.
    NaN results are ignored by pandas summary statistics.

    NOTE(review): TextBlob's default analyzer is, as far as I know, built on an
    English lexicon, while this notebook filters Italian stopwords -- confirm
    that the scores are meaningful for these tweets.
    """
    if not isinstance(text, str):
        return float("nan")
    return TextBlob(text).sentiment.polarity

# Score every tweet and store the result in a new column.
df["sentiment"] = df["text"].map(get_sentiment)

# Summary statistics of the sentiment scores.
df["sentiment"].describe()


In [None]:
# Five tweets with the highest sentiment score.
sentiment_view = df[["username", "text", "sentiment"]]
sentiment_view.sort_values(by="sentiment", ascending=False).head(5)

In [None]:
# Five tweets with the lowest sentiment score.
sentiment_view = df[["username", "text", "sentiment"]]
sentiment_view.sort_values(by="sentiment", ascending=True).head(5)

## Final Analysis

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
import re

# Name fragments used to spot each artist in the tweet text.
# NOTE(review): some entries (e.g. "Bella", "Clara", "Gaia") may also occur as
# ordinary words or first names, so their counts could include unrelated
# tweets -- confirm whether that matters for the analysis.
nomi_artisti = [
    "Lauro", "Bresh", "Brunori", "Clara", "Coma_Cose", "Elodie", "Killa", "Fedez",
    "Michielin", "Gabbani", "Gaia", "Giorgia", "Irama", "Thiele", "Corsi", "Bella",
    "Ranieri", "Modà", "Noemi", "Olly", "Rkomi", "Hunt", "Villain", "Toscano",
    "Brancale", "Shablo", "Cristicchi", "Kolors", "Effe", "Peyote"
]

# For each name, count the tweets whose text contains it as a whole word
# (case-insensitive; missing texts count as no match). Names are escaped so
# they are always matched literally, even if one contains regex metacharacters.
frequenze = {
    nome: df['text'].str.contains(rf'\b{re.escape(nome)}\b', case=False, na=False).sum()
    for nome in nomi_artisti
}

df_frequenze = pd.DataFrame(frequenze.items(), columns=['artist', 'frequency'])
df_frequenze = df_frequenze[df_frequenze['frequency'] > 0]  # Drop names with 0 occurrences

# Sort by frequency, most mentioned first
df_frequenze = df_frequenze.sort_values(by='frequency', ascending=False)

# Bar chart
plt.figure(figsize=(12, 6))
sns.barplot(data=df_frequenze, x='artist', y='frequency', palette='magma')
plt.xticks(rotation=45, ha='right')
plt.title("Frequency artists in tweets")
plt.xlabel("Artists")
plt.ylabel("Number of tweets")
plt.show()