# Setting Up

In [1]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0


In [2]:
import pandas as pd
import re
import json
import emoji
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Loading the Data

In [7]:
def load_twitter_data(file_paths):

    all_tweets = []

    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # Remove the JavaScript variable declaration part
            json_str = re.search(r'\[.*\]', content, re.DOTALL)

            if json_str:
                tweets = json.loads(json_str.group())

                # Twitter data is usually nested, with tweet info in 'tweet' key
                if isinstance(tweets, list) and tweets and 'tweet' in tweets[0]:
                    tweets = [t['tweet'] for t in tweets]

                all_tweets.extend(tweets)
            else:
                print(f"No JSON data found in file: {file_path}")

        except Exception as e:
            print(f"Error loading Twitter data from {file_path}: {str(e)}")

    if all_tweets:
        # Convert to DataFrame and clean up
        df = pd.DataFrame(all_tweets)

        # Convert tweet creation timestamps to datetime
        if 'created_at' in df.columns:
            df['created_at'] = pd.to_datetime(df['created_at'])

        return df
    return None

# Example usage
file_paths = [
    "tweets.js",  # First file
    "tweets-part1.js"  # Second file - replace with your actual filename
]

# Load data from both files
df = load_twitter_data(file_paths)

  df['created_at'] = pd.to_datetime(df['created_at'])


# Analyzing Tweet Data

## EDA

In [None]:
df.columns

Index(['edit_info', 'retweeted', 'source', 'entities', 'display_text_range',
       'favorite_count', 'id_str', 'truncated', 'retweet_count', 'id',
       'created_at', 'favorited', 'full_text', 'lang',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_status_id', 'in_reply_to_screen_name',
       'in_reply_to_user_id_str', 'possibly_sensitive', 'extended_entities',
       'withheld_in_countries'],
      dtype='object')

In [None]:
df

Unnamed: 0,edit_info,retweeted,source,entities,display_text_range,favorite_count,id_str,truncated,retweet_count,id,...,full_text,lang,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_status_id,in_reply_to_screen_name,in_reply_to_user_id_str,possibly_sensitive,extended_entities,withheld_in_countries
0,{'initial': {'editTweetIds': ['185301969974484...,False,"<a href=""http://twitter.com/download/android"" ...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 71]",2,1853019699744845864,False,0,1853019699744845864,...,מתוך 11 מיילים של העבודה שיש לי עכשיו באינבוק...,iw,,,,,,,,
1,{'initial': {'editTweetIds': ['185301486845602...,False,"<a href=""http://twitter.com/download/android"" ...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 47]",1,1853014868456022237,False,0,1853014868456022237,...,@nimi220 התכוונתי לשנוץ\nהלוואי יכולתי לשנות קצת,iw,1853014278749462721,920125644679401473,1853014278749462721,nimi220,920125644679401473,,,
2,{'initial': {'editTweetIds': ['185301453849014...,False,"<a href=""http://twitter.com/download/android"" ...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 68]",1,1853014538490146965,False,0,1853014538490146965,...,@nimi220 זה מלווה במשכורת בדיחה\nולרוב אני קם ...,iw,1853014074679795954,920125644679401473,1853014074679795954,nimi220,920125644679401473,,,
3,{'initial': {'editTweetIds': ['185301287978550...,False,"<a href=""http://twitter.com/download/android"" ...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 35]",3,1853012879785508932,False,0,1853012879785508932,...,אני ליטרלי בדרך לעבודה ב 12 בצהריים,iw,1852935945089130882,776414940492099584,1852935945089130882,zekharia1234,776414940492099584,,,
4,{'initial': {'editTweetIds': ['185300984164985...,False,"<a href=""http://twitter.com/download/android"" ...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 47]",0,1853009841649856779,False,0,1853009841649856779,...,@Jibr1sh @Tal202020 לא חיבוק זה יותר חשוב מאוכל,iw,1852981651162144961,1610535458181713923,1852981651162144961,Jibr1sh,1610535458181713923,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68996,{'initial': {'editTweetIds': ['166870558082586...,False,"<a href=""http://twitter.com/download/android"" ...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 30]",1,1668705580825862145,False,0,1668705580825862145,...,@Johnathan_Sp ממש מותק של בחור,iw,1668704603473412096,2862158494,1668704603473412096,Johnathan_spn,2862158494,,,
68997,{'initial': {'editTweetIds': ['166870500410489...,False,"<a href=""http://twitter.com/download/android"" ...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 32]",1,1668705004104892416,False,0,1668705004104892416,...,@shimonh1988 הם מגיעים אליי לבד🥰,iw,1668704026983034886,1276206295784308742,1668704026983034886,shimonh1988,1276206295784308742,,,
68998,{'initial': {'editTweetIds': ['166870368828887...,False,"<a href=""http://twitter.com/download/android"" ...","{'hashtags': [], 'symbols': [], 'user_mentions...","[0, 38]",2,1668703688288870400,False,0,1668703688288870400,...,@EithanSchon @Datiya_Israelit מושלםםםם,iw,1668645715835252736,1230057406614450176,1668645715835252736,EithanSchon,1230057406614450176,,,
68999,{'initial': {'editTweetIds': ['166870350212877...,False,"<a href=""http://twitter.com/download/android"" ...","{'user_mentions': [], 'urls': [], 'symbols': [...","[0, 34]",29,1668703502128775174,False,0,1668703502128775174,...,איזה חמוד🥰 https://t.co/zhyTCQtAP1,iw,,,,,,False,{'media': [{'expanded_url': 'https://twitter.c...,


In [None]:
df.shape

(69001, 26)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69001 entries, 0 to 69000
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   edit_info                  69001 non-null  object             
 1   retweeted                  69001 non-null  bool               
 2   source                     69001 non-null  object             
 3   entities                   69001 non-null  object             
 4   display_text_range         69001 non-null  object             
 5   favorite_count             69001 non-null  int64              
 6   id_str                     69001 non-null  object             
 7   truncated                  69001 non-null  bool               
 8   retweet_count              69001 non-null  int64              
 9   id                         69001 non-null  object             
 10  created_at                 69001 non-null  datetime64[ns, UTC]
 11  fa

## Yearly Distribution of Tweets and Likes

In [None]:
# Extract year and create yearly statistics
yearly_stats = df.groupby(df['created_at'].dt.year).agg({
    'full_text': 'count',  # Count of tweets
    'favorite_count': 'sum'  # Sum of likes
}).reset_index()

yearly_stats.columns = ['year', 'tweet_count', 'total_likes']

# Function to format large numbers
def format_number(n):
    n = float(n)  # Convert to float first
    if n >= 1_000_000:
        return f'{n/1_000_000:.1f}M'
    elif n >= 1_000:
        return f'{n/1_000:.1f}K'
    else:
        return str(int(n))

# Create subplots
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('Total Tweets per Year', 'Total Likes per Year'),
    vertical_spacing=0.15
)

# Add bar for tweet counts
fig.add_trace(
    go.Bar(
        x=yearly_stats['year'],
        y=yearly_stats['tweet_count'],
        text=yearly_stats['tweet_count'].apply(format_number),
        textposition='auto',
        name='Tweets',
        marker_color='orange',
        hovertemplate="Year: %{x}<br>" +
                     "Tweets: %{y:,.0f}<br>" +
                     "<extra></extra>"
    ),
    row=1, col=1
)

# Add bar for like counts
fig.add_trace(
    go.Bar(
        x=yearly_stats['year'],
        y=yearly_stats['total_likes'],
        text=yearly_stats['total_likes'].apply(format_number),
        textposition='auto',
        name='Likes',
        marker_color='darkred',
        hovertemplate="Year: %{x}<br>" +
                     "Likes: %{y:,.0f}<br>" +
                     "<extra></extra>"
    ),
    row=2, col=1
)

# Update layout
fig.update_layout(
    title=dict(
        text="Yearly Distribution of Tweets and Likes",
        x=0.5,
        font=dict(size=20)
    ),
    showlegend=False,
    width=1000,
    height=800,
    plot_bgcolor='white'
)

# Update axes
for i in range(1, 3):
    fig.update_yaxes(
        title_text="Count" if i == 1 else "Likes",
        row=i, col=1,
        gridcolor='lightgray',
        showgrid=True,
        tickformat=",d"
    )
    fig.update_xaxes(
        title_text="Year",
        row=i, col=1,
        tickmode='linear'
    )

# Show plot
fig.show()

# Print detailed statistics
print("\nYearly Statistics:")
print("\nYear    Tweets    Likes     Avg Likes/Tweet")
print("-" * 45)
for _, row in yearly_stats.iterrows():
    avg_likes = float(row['total_likes']) / float(row['tweet_count'])
    print(f"{int(row['year'])}    {format_number(row['tweet_count'])}    {format_number(row['total_likes'])}    {avg_likes:.1f}")

# Calculate year-over-year changes
yearly_stats['tweet_growth'] = yearly_stats['tweet_count'].astype(float).pct_change() * 100
yearly_stats['likes_growth'] = yearly_stats['total_likes'].astype(float).pct_change() * 100

print("\nYear-over-Year Changes:")
print("\nYear    Tweet Growth    Likes Growth")
print("-" * 40)
for _, row in yearly_stats.iterrows():
    if not pd.isna(row['tweet_growth']):
        print(f"{int(row['year'])}    {row['tweet_growth']:>+8.1f}%    {row['likes_growth']:>+8.1f}%")

# Peak statistics
most_tweets_year = yearly_stats.loc[yearly_stats['tweet_count'].astype(float).idxmax()]
most_likes_year = yearly_stats.loc[yearly_stats['total_likes'].astype(float).idxmax()]

print("\nPeak Statistics:")
print(f"Most active year: {int(most_tweets_year['year'])} with {format_number(most_tweets_year['tweet_count'])} tweets")
print(f"Most liked year: {int(most_likes_year['year'])} with {format_number(most_likes_year['total_likes'])} likes")


Yearly Statistics:

Year    Tweets    Likes     Avg Likes/Tweet
---------------------------------------------
2016    497    26    0.1
2017    59    0    0.0
2018    59    0    0.0
2019    31    1    0.0
2020    763    254    0.3
2021    28.0K    51.2K    1.8
2022    14.9K    33.8K    2.3
2023    11.9K    41.7K    3.5
2024    12.8K    59.1K    4.6

Year-over-Year Changes:

Year    Tweet Growth    Likes Growth
----------------------------------------
2017       -88.1%      -100.0%
2018        +0.0%        +nan%
2019       -47.5%        +inf%
2020     +2361.3%    +25300.0%
2021     +3564.5%    +20040.9%
2022       -46.5%       -33.9%
2023       -20.5%       +23.1%
2024        +7.7%       +41.8%

Peak Statistics:
Most active year: 2021 with 28.0K tweets
Most liked year: 2024 with 59.1K likes


## Traffic per Tweet

In [None]:
df['favorite_count']=df['favorite_count'].astype(int)
value_counts=df['favorite_count'].value_counts().sort_index()
fig=px.bar(value_counts,x=value_counts.index,y=value_counts.values)
fig.update_layout(xaxis_title='Number of Likes',yaxis_title='Number of tweets',title='Likes per Tweet')
fig.show()

In [None]:
df['retweet_count']=df['retweet_count'].astype(int)
value_counts=df['retweet_count'].value_counts().sort_index()
fig=px.bar(value_counts,x=value_counts.index,y=value_counts.values)
fig.update_layout(xaxis_title='Number of Retweets',yaxis_title='Number of tweets',title='Retweets per Tweet')
fig.show()

## Most Common Replies

In [25]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re

# Function to extract username from tweet text (from replies)
def extract_username_from_reply(text):
   if isinstance(text, str):
       # Look for @username at the start of the tweet
       match = re.match(r'^@(\w+)', text)
       return match.group(1) if match else None
   return None

# Add username column to likes_df
likes_df['username'] = likes_df['fullText'].apply(extract_username_from_reply)

# Filter out specified username and get top 20
filtered_likes = likes_df[likes_df['username'] != 'zekharia1234']
filtered_likes = filtered_likes[filtered_likes['username'].notna()]
top_users = filtered_likes['username'].value_counts().head(20)

# Create visualization
fig = go.Figure()

# Add bar trace
fig.add_trace(
   go.Bar(
       x=top_users.index,
       y=top_users.values,
       text=top_users.values,
       textposition='auto',
       marker_color='orange',
       hovertemplate="User: @%{x}<br>" +
                    "Likes: %{y}<br>" +
                    "<extra></extra>"
   )
)

# Update layout
fig.update_layout(
   title={
       'text': "Top 20 Most Liked Users in Replies",
       'x': 0.5,
       'xanchor': 'center',
       'font': {'size': 20}
   },
   xaxis_title={
       'text': "Username",
       'font': {'size': 14}
   },
   yaxis_title={
       'text': "Number of Likes",
       'font': {'size': 14}
   },
   xaxis_tickangle=45,
   height=600,
   width=1000,
   showlegend=False,
   plot_bgcolor='white',
   yaxis=dict(
       gridcolor='lightgray',
       zeroline=True,
       zerolinecolor='lightgray'
   )
)

fig.show()

# Print statistics
print("\nReply Likes Analysis:")
print(f"Total likes analyzed: {len(filtered_likes)}")
print(f"Number of unique users: {len(top_users)}")
print(f"Top 20 users account for {(top_users.sum() / len(filtered_likes) * 100):.1f}% of analyzed likes")

# Show percentage of your tweets that are replies
total_tweets = len(df)
reply_percentage = (len(replies) / total_tweets) * 100
print(f"\nPercentage of tweets that are replies: {reply_percentage:.1f}%")

reply_counts = replies['in_reply_to_screen_name'].value_counts()

# Some additional statistics
print("\nReply Statistics:")
print(f"Total replies: {len(replies)}")
print(f"Unique users replied to: {len(reply_counts)}")
print(f"Average replies per user: {len(replies) / len(reply_counts):.2f}")



Reply Likes Analysis:
Total likes analyzed: 217068
Number of unique users: 20
Top 20 users account for 18.4% of analyzed likes

Percentage of tweets that are replies: 78.8%

Reply Statistics:
Total replies: 54381
Unique users replied to: 3708
Average replies per user: 14.67


In [None]:
# Filter for replies and ensure created_at is datetime
replies = df[df['in_reply_to_screen_name'].notna()].copy()
replies['year'] = replies['created_at'].dt.year

# Group by year and username, count replies
yearly_replies = replies.groupby(['year', 'in_reply_to_screen_name']).size().reset_index(name='reply_count')

# Get unique years
years = sorted(yearly_replies['year'].unique())
num_years = len(years)

# Create subplots - one for each year
fig = make_subplots(
    rows=num_years, cols=1,
    subplot_titles=[f"Top 10 Users Replied to in {year}" for year in years],
    vertical_spacing=0.1
)

# Define color scale - from light orange to dark red
def get_color_scale(n):
    """Generate n colors from light orange to dark red"""
    colors = []
    for i in range(n):
        # Calculate color based on position
        r = int(255 - (i * (255 - 183) / (n-1)))  # From 255 to 183
        g = int(183 - (i * 183 / (n-1)))          # From 183 to 0
        b = int(77 - (i * 77 / (n-1)))            # From 77 to 0
        colors.append(f'rgb({r},{g},{b})')
    return colors

# For each year
for i, year in enumerate(years, 1):
    year_data = yearly_replies[yearly_replies['year'] == year]
    top_users = year_data.nlargest(10, 'reply_count')

    # Sort by reply count and get colors
    top_users = top_users.sort_values('reply_count', ascending=True)
    colors = get_color_scale(len(top_users))

    fig.add_trace(
        go.Bar(
            x=top_users['in_reply_to_screen_name'],
            y=top_users['reply_count'],
            name=str(year),
            marker_color=colors,
            text=top_users['reply_count'],
            textposition='auto',
            hovertemplate="User: %{x}<br>" +
                         "Replies: %{y}<br>" +
                         "<extra></extra>"
        ),
        row=i, col=1
    )

    # Update layout for each subplot
    fig.update_xaxes(tickangle=45, row=i, col=1)
    fig.update_yaxes(title_text="Number of Replies", row=i, col=1)

# Update overall layout
fig.update_layout(
    height=300 * num_years,  # Adjust height based on number of years
    width=1000,
    showlegend=False,
    title_text="Yearly Reply Patterns",
    title_x=0.5,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(size=12)
)

# Update axes lines
fig.update_xaxes(showgrid=False, showline=True, linewidth=1, linecolor='lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray',
                 showline=True, linewidth=1, linecolor='lightgray')

# Show interactive plot
fig.show()

# Print statistics
print("\nYearly Reply Statistics:")
for year in years:
    year_data = yearly_replies[yearly_replies['year'] == year]
    total_replies = year_data['reply_count'].sum()
    unique_users = len(year_data)
    print(f"\n=== {year} ===")
    print(f"Total replies: {total_replies}")
    print(f"Unique users replied to: {unique_users}")
    print(f"Average replies per user: {total_replies/unique_users:.2f}")


Yearly Reply Statistics:

=== 2016 ===
Total replies: 208
Unique users replied to: 71
Average replies per user: 2.93

=== 2019 ===
Total replies: 7
Unique users replied to: 4
Average replies per user: 1.75

=== 2020 ===
Total replies: 474
Unique users replied to: 237
Average replies per user: 2.00

=== 2021 ===
Total replies: 22171
Unique users replied to: 1759
Average replies per user: 12.60

=== 2022 ===
Total replies: 12926
Unique users replied to: 932
Average replies per user: 13.87

=== 2023 ===
Total replies: 9033
Unique users replied to: 1173
Average replies per user: 7.70

=== 2024 ===
Total replies: 9562
Unique users replied to: 1146
Average replies per user: 8.34


In [21]:
# Filter for replies and ensure created_at is datetime
replies = df[df['in_reply_to_screen_name'].notna()].copy()
replies['year'] = replies['created_at'].dt.year

# Group by year and username, count replies
yearly_replies = replies.groupby(['year', 'in_reply_to_screen_name']).size().reset_index(name='reply_count')

# Get unique years
years = sorted(yearly_replies['year'].unique())
num_years = len(years)

# Create subplots - one for each year
fig = make_subplots(
   rows=num_years, cols=1,
   subplot_titles=[f"Top 10 Users Replied to in {year}" for year in years],
   vertical_spacing=0.1
)

# Define color scale - from light orange to dark red
def get_color_scale(n):
   """Generate n colors from light orange to dark red"""
   colors = []
   for i in range(n):
       # Calculate color based on position
       r = int(255 - (i * (255 - 183) / (n-1)))  # From 255 to 183
       g = int(183 - (i * 183 / (n-1)))          # From 183 to 0
       b = int(77 - (i * 77 / (n-1)))            # From 77 to 0
       colors.append(f'rgb({r},{g},{b})')
   return colors

# For each year
for i, year in enumerate(years, 1):
   year_data = yearly_replies[yearly_replies['year'] == year]
   # Filter out zekharia1234 and get top 10
   year_data_filtered = year_data[year_data['in_reply_to_screen_name'] != 'zekharia1234']
   top_users = year_data_filtered.nlargest(10, 'reply_count')

   # Sort by reply count and get colors
   top_users = top_users.sort_values('reply_count', ascending=True)
   colors = get_color_scale(len(top_users))

   fig.add_trace(
       go.Bar(
           x=top_users['in_reply_to_screen_name'],
           y=top_users['reply_count'],
           name=str(year),
           marker_color=colors,
           text=top_users['reply_count'],
           textposition='auto',
           hovertemplate="User: %{x}<br>" +
                        "Replies: %{y}<br>" +
                        "<extra></extra>"
       ),
       row=i, col=1
   )

   # Update layout for each subplot
   fig.update_xaxes(tickangle=45, row=i, col=1)
   fig.update_yaxes(title_text="Number of Replies", row=i, col=1)

# Update overall layout
fig.update_layout(
   height=300 * num_years,  # Adjust height based on number of years
   width=1000,
   showlegend=False,
   title_text="Yearly Reply Patterns",
   title_x=0.5,
   plot_bgcolor='white',
   paper_bgcolor='white',
   font=dict(size=12)
)

# Update axes lines
fig.update_xaxes(showgrid=False, showline=True, linewidth=1, linecolor='lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray',
                showline=True, linewidth=1, linecolor='lightgray')

# Show interactive plot
fig.show()

# Print statistics
print("\nYearly Reply Statistics:")
for year in years:
   year_data = yearly_replies[yearly_replies['year'] == year]
   total_replies = year_data['reply_count'].sum()
   unique_users = len(year_data)
   print(f"\n=== {year} ===")
   print(f"Total replies: {total_replies}")
   print(f"Unique users replied to: {unique_users}")
   print(f"Average replies per user: {total_replies/unique_users:.2f}")


Yearly Reply Statistics:

=== 2016 ===
Total replies: 208
Unique users replied to: 71
Average replies per user: 2.93

=== 2019 ===
Total replies: 7
Unique users replied to: 4
Average replies per user: 1.75

=== 2020 ===
Total replies: 474
Unique users replied to: 237
Average replies per user: 2.00

=== 2021 ===
Total replies: 22171
Unique users replied to: 1759
Average replies per user: 12.60

=== 2022 ===
Total replies: 12926
Unique users replied to: 932
Average replies per user: 13.87

=== 2023 ===
Total replies: 9033
Unique users replied to: 1173
Average replies per user: 7.70

=== 2024 ===
Total replies: 9562
Unique users replied to: 1146
Average replies per user: 8.34


## Tweet Media Analysis

In [None]:
# Check if tweet has media using extended_entities column
df['has_media'] = df['extended_entities'].notna()

# Create year and month columns for time series analysis
df['year_month'] = df['created_at'].dt.strftime('%Y-%m')

# Group by year_month and media presence
monthly_comparison = df.groupby(['year_month', 'has_media']).size().unstack(fill_value=0)

# Calculate engagement metrics
engagement_comparison = df.groupby('has_media').agg({
    'favorite_count': ['count', 'mean', 'median', 'max'],
    'retweet_count': ['count', 'mean', 'median', 'max']
}).round(2)

# Create subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Monthly Tweet Volume Comparison",
        "Average Engagement per Tweet Type",
        "Tweet Distribution",
        "Engagement Box Plot"
    ),
    specs=[[{"type": "scatter"}, {"type": "bar"}],
           [{"type": "pie"}, {"type": "box"}]]
)

# 1. Time series plot
fig.add_trace(
    go.Scatter(
        x=monthly_comparison.index,
        y=monthly_comparison[True],
        name="With Media",
        line=dict(color='#ff7f0e')
    ),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(
        x=monthly_comparison.index,
        y=monthly_comparison[False],
        name="Without Media",
        line=dict(color='#1f77b4')
    ),
    row=1, col=1
)

# 2. Average engagement bar plot
engagement_data = df.groupby('has_media').agg({
    'favorite_count': 'mean',
    'retweet_count': 'mean'
}).round(2)

fig.add_trace(
    go.Bar(
        x=['With Media', 'Without Media'],
        y=engagement_data['favorite_count'],
        name="Avg Likes",
        marker_color='#ff7f0e'
    ),
    row=1, col=2
)

fig.add_trace(
    go.Bar(
        x=['With Media', 'Without Media'],
        y=engagement_data['retweet_count'],
        name="Avg Retweets",
        marker_color='#1f77b4'
    ),
    row=1, col=2
)

# 3. Overall distribution pie chart
total_counts = df['has_media'].value_counts()
fig.add_trace(
    go.Pie(
        labels=['Without Media', 'With Media'],
        values=[total_counts[False], total_counts[True]],
        marker=dict(colors=['#1f77b4', '#ff7f0e'])
    ),
    row=2, col=1
)

# 4. Engagement box plot
fig.add_trace(
    go.Box(
        y=df[df['has_media']]['favorite_count'],
        name="With Media Likes",
        marker_color='#ff7f0e'
    ),
    row=2, col=2
)

fig.add_trace(
    go.Box(
        y=df[~df['has_media']]['favorite_count'],
        name="Without Media Likes",
        marker_color='#1f77b4'
    ),
    row=2, col=2
)

# Update layout
fig.update_layout(
    height=800,
    width=1200,
    showlegend=True,
    title_text="Tweet Media Analysis",
    title_x=0.5,
    template="simple_white"
)

# Update x-axis for time series
fig.update_xaxes(tickangle=45, row=1, col=1)

# Show plot
fig.show()

# Print detailed statistics
print("\nDetailed Statistics:")
print("\n1. Overall Distribution:")
print(df['has_media'].value_counts(normalize=True).multiply(100).round(2).to_frame('percentage'))

print("\n2. Engagement Statistics:")
print("\nTweets with Media:")
print(engagement_comparison.loc[True].round(2))
print("\nTweets without Media:")
print(engagement_comparison.loc[False].round(2))

print("\n3. Top Performing Tweets:")
print("\nMost Liked Tweet with Media:")
most_liked_media = df[df['has_media']].nlargest(1, 'favorite_count')
print(f"Likes: {most_liked_media['favorite_count'].values[0]}")
print(f"Text: {most_liked_media['full_text'].values[0]}")

print("\nMost Liked Tweet without Media:")
most_liked_no_media = df[~df['has_media']].nlargest(1, 'favorite_count')
print(f"Likes: {most_liked_no_media['favorite_count'].values[0]}")
print(f"Text: {most_liked_no_media['full_text'].values[0]}")


Detailed Statistics:

1. Overall Distribution:
           percentage
has_media            
False           93.02
True             6.98

2. Engagement Statistics:

Tweets with Media:
favorite_count  count     4813.00
                mean        11.11
                median       4.00
                max        994.00
retweet_count   count     4813.00
                mean         0.04
                median       0.00
                max          9.00
Name: True, dtype: float64

Tweets without Media:
favorite_count  count     64188.00
                mean          2.07
                median        1.00
                max         819.00
retweet_count   count     64188.00
                mean          0.01
                median        0.00
                max          32.00
Name: False, dtype: float64

3. Top Performing Tweets:

Most Liked Tweet with Media:
Likes: 994
Text: כללית עושה לי פאסיב אגרסיב וואלה יופי https://t.co/3iMUyVQ3mS

Most Liked Tweet without Media:
Likes: 819
Text: י

## Tweets by Weekday and Hour

In [None]:
# Extract hour and day of week
df['hour'] = df['created_at'].dt.hour
df['day_of_week'] = df['created_at'].dt.day_name()

# Create day order from Sunday to Saturday
day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Create pivot table for heatmap
heatmap_data = pd.crosstab(df['day_of_week'], df['hour'])

# Reorder days
heatmap_data = heatmap_data.reindex(day_order)

# Create heatmap
fig = go.Figure(data=go.Heatmap(
   z=heatmap_data.values,
   x=heatmap_data.columns,
   y=heatmap_data.index,
   colorscale='Oranges',
   text=heatmap_data.values,
   texttemplate="%{text}",
   textfont={"size": 12},
   colorbar=dict(
       title="Number of Tweets",
       titleside="right",
       titlefont=dict(size=14)
   ),
   hoverongaps=False,
   hovertemplate="Day: %{y}<br>Hour: %{x}:00<br>Tweets: %{text}<extra></extra>"
))

# Update layout for better visibility
fig.update_layout(
   title=dict(
       text="Tweet Traffic Heatmap by Day and Hour",
       x=0.5,
       font=dict(size=20)
   ),
   width=1200,
   height=600,
   xaxis=dict(
       title="Hour of Day",
       ticktext=[f"{i:02d}:00" for i in range(24)],
       tickvals=list(range(24)),
       tickfont=dict(size=12),
       title_font=dict(size=14)
   ),
   yaxis=dict(
       title="Day of Week",
       tickfont=dict(size=12),
       title_font=dict(size=14),
       autorange="reversed"  # This keeps Sunday at the top
   ),
   plot_bgcolor='white'
)

# Show the plot
fig.show()

# Print some additional statistics
print("\nTraffic Statistics by Day:")
for day in day_order:  # Using new day order
   day_count = len(df[df['day_of_week'] == day])
   busiest_hour = df[df['day_of_week'] == day]['hour'].mode().iloc[0]
   hour_count = len(df[(df['day_of_week'] == day) & (df['hour'] == busiest_hour)])
   print(f"\n{day}:")
   print(f"Total tweets: {day_count}")
   print(f"Busiest hour: {busiest_hour:02d}:00 ({hour_count} tweets)")

   # Calculate peak periods
   day_hours = df[df['day_of_week'] == day]['hour'].value_counts().sort_index()
   morning = day_hours[6:12].sum()
   afternoon = day_hours[12:18].sum()
   evening = day_hours[18:24].sum()
   night = day_hours[0:6].sum()

   print(f"Morning (6-12): {morning} tweets")
   print(f"Afternoon (12-18): {afternoon} tweets")
   print(f"Evening (18-24): {evening} tweets")
   print(f"Night (0-6): {night} tweets")

# Overall peak hours
print("\nOverall Peak Hours:")
peak_hours = df['hour'].value_counts().sort_values(ascending=False).head(5)
for hour, count in peak_hours.items():
   print(f"{hour:02d}:00 - {count} tweets")


Traffic Statistics by Day:

Sunday:
Total tweets: 12115
Busiest hour: 10:00 (783 tweets)
Morning (6-12): 3812 tweets
Afternoon (12-18): 3847 tweets
Evening (18-24): 3137 tweets
Night (0-6): 1319 tweets

Monday:
Total tweets: 10665
Busiest hour: 17:00 (733 tweets)
Morning (6-12): 2882 tweets
Afternoon (12-18): 3882 tweets
Evening (18-24): 3325 tweets
Night (0-6): 576 tweets

Tuesday:
Total tweets: 10720
Busiest hour: 10:00 (707 tweets)
Morning (6-12): 3379 tweets
Afternoon (12-18): 3476 tweets
Evening (18-24): 3145 tweets
Night (0-6): 720 tweets

Wednesday:
Total tweets: 10799
Busiest hour: 20:00 (788 tweets)
Morning (6-12): 3168 tweets
Afternoon (12-18): 3550 tweets
Evening (18-24): 3571 tweets
Night (0-6): 510 tweets

Thursday:
Total tweets: 11150
Busiest hour: 21:00 (722 tweets)
Morning (6-12): 3016 tweets
Afternoon (12-18): 3680 tweets
Evening (18-24): 3724 tweets
Night (0-6): 730 tweets

Friday:
Total tweets: 6689
Busiest hour: 13:00 (870 tweets)
Morning (6-12): 3218 tweets
Aftern

In [None]:
# Extract hour and day of week
df['hour'] = df['created_at'].dt.hour
df['day_of_week'] = df['created_at'].dt.day_name()

# Create day order from Sunday to Saturday
day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Create pivot table for heatmap using sum of favorite_count
heatmap_data = df.pivot_table(
    values='favorite_count',
    index='day_of_week',
    columns='hour',
    aggfunc='sum',
    fill_value=0  # Fill NaN values with 0
).round(0)

# Reorder days
heatmap_data = heatmap_data.reindex(day_order)

# Calculate tweet counts
count_data = pd.crosstab(df['day_of_week'], df['hour'])
count_data = count_data.reindex(day_order)

# Create custom hover text
hover_text = []
for day in day_order:
    day_hover = []
    for hour in range(24):
        total_likes = heatmap_data.loc[day, hour]
        tweet_count = count_data.loc[day, hour]
        avg_likes = total_likes / tweet_count if tweet_count > 0 else 0
        day_hover.append(
            f"Day: {day}<br>" +
            f"Hour: {hour:02d}:00<br>" +
            f"Total Likes: {int(total_likes)}<br>" +
            f"Tweet Count: {tweet_count}<br>" +
            f"Avg Likes/Tweet: {avg_likes:.1f}"
        )
    hover_text.append(day_hover)

# Create heatmap
fig = go.Figure(data=go.Heatmap(
    z=heatmap_data.values,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    colorscale='Oranges',
    text=heatmap_data.values,
    texttemplate="%{text:.0f}",  # Format without decimal places
    textfont={"size": 12},
    colorbar=dict(
        title="Total Likes",
        titleside="right",
        titlefont=dict(size=14)
    ),
    hovertext=hover_text,
    hoverinfo='text'
))

# Update layout for better visibility
fig.update_layout(
    title=dict(
        text="Total Likes Heatmap by Day and Hour",
        x=0.5,
        font=dict(size=20)
    ),
    width=1200,
    height=600,
    xaxis=dict(
        title="Hour of Day",
        ticktext=[f"{i:02d}:00" for i in range(24)],
        tickvals=list(range(24)),
        tickfont=dict(size=12),
        title_font=dict(size=14)
    ),
    yaxis=dict(
        title="Day of Week",
        tickfont=dict(size=12),
        title_font=dict(size=14),
        autorange="reversed"  # This keeps Sunday at the top
    ),
    plot_bgcolor='white'
)

# Show the plot
fig.show()

# Print additional statistics
print("\nLikes Statistics by Day:")
for day in day_order:
    day_data = df[df['day_of_week'] == day]

    total_likes = day_data['favorite_count'].sum()
    tweet_count = len(day_data)
    avg_likes = total_likes / tweet_count if tweet_count > 0 else 0

    print(f"\n{day}:")
    print(f"Total likes: {int(total_likes):,}")
    print(f"Tweet count: {tweet_count:,}")
    print(f"Average likes per tweet: {avg_likes:.1f}")

    # Time period analysis
    morning_data = day_data[day_data['hour'].between(6, 11)]
    afternoon_data = day_data[day_data['hour'].between(12, 17)]
    evening_data = day_data[day_data['hour'].between(18, 23)]
    night_data = day_data[day_data['hour'].between(0, 5)]

    print("\nLikes by time period:")
    print(f"Morning (6-12): {int(morning_data['favorite_count'].sum()):,} likes ({len(morning_data)} tweets)")
    print(f"Afternoon (12-18): {int(afternoon_data['favorite_count'].sum()):,} likes ({len(afternoon_data)} tweets)")
    print(f"Evening (18-24): {int(evening_data['favorite_count'].sum()):,} likes ({len(evening_data)} tweets)")
    print(f"Night (0-6): {int(night_data['favorite_count'].sum()):,} likes ({len(night_data)} tweets)")

# Overall best performing hours
print("\nBest Performing Hours (Total Likes):")
hourly_stats = df.groupby('hour').agg({
    'favorite_count': ['sum', 'size', 'mean']
}).round(1)
hourly_stats.columns = ['total_likes', 'tweet_count', 'avg_likes']
hourly_stats = hourly_stats.sort_values('total_likes', ascending=False)

for hour, row in hourly_stats.head(5).iterrows():
    print(f"{hour:02d}:00 - {int(row['total_likes']):,} total likes "
          f"({int(row['tweet_count']):,} tweets, "
          f"{row['avg_likes']:.1f} avg likes/tweet)")


Likes Statistics by Day:

Sunday:
Total likes: 34,605
Tweet count: 12,115
Average likes per tweet: 2.9

Likes by time period:
Morning (6-12): 11,314 likes (3812 tweets)
Afternoon (12-18): 11,982 likes (3847 tweets)
Evening (18-24): 7,603 likes (3137 tweets)
Night (0-6): 3,706 likes (1319 tweets)

Monday:
Total likes: 28,857
Tweet count: 10,665
Average likes per tweet: 2.7

Likes by time period:
Morning (6-12): 8,145 likes (2882 tweets)
Afternoon (12-18): 11,135 likes (3882 tweets)
Evening (18-24): 8,262 likes (3325 tweets)
Night (0-6): 1,315 likes (576 tweets)

Tuesday:
Total likes: 29,136
Tweet count: 10,720
Average likes per tweet: 2.7

Likes by time period:
Morning (6-12): 9,572 likes (3379 tweets)
Afternoon (12-18): 10,291 likes (3476 tweets)
Evening (18-24): 7,570 likes (3145 tweets)
Night (0-6): 1,703 likes (720 tweets)

Wednesday:
Total likes: 29,759
Tweet count: 10,799
Average likes per tweet: 2.8

Likes by time period:
Morning (6-12): 9,584 likes (3168 tweets)
Afternoon (12-1

## Most Common Emojis

In [None]:
def extract_emojis(text):
    """Extract all emojis from text"""
    if isinstance(text, str):
        return ''.join(c for c in text if c in emoji.EMOJI_DATA)
    return ''

# Extract emojis from tweets
df['emojis'] = df['full_text'].apply(extract_emojis)

# Count all emojis
all_emojis = ''.join(df['emojis'].dropna())
emoji_counts = Counter(all_emojis)

# Get top 20 emojis and their counts
top_emojis = dict(sorted(emoji_counts.items(), key=lambda x: x[1], reverse=True)[:20])

# Create bar chart
fig = go.Figure(data=[
    go.Bar(
        x=list(top_emojis.keys()),
        y=list(top_emojis.values()),
        text=list(top_emojis.values()),
        textposition='auto',
        marker_color='orange'
    )
])

# Update layout
fig.update_layout(
    title={
        'text': 'Top 20 Most Used Emojis',
        'x': 0.5,
        'font': {'size': 20}
    },
    xaxis_title="Emoji",
    yaxis_title="Count",
    width=1000,
    height=500,
    yaxis=dict(
        gridcolor='lightgray'
    ),
    plot_bgcolor='white'
)

# Show plot
fig.show()

# Print detailed statistics
print("\nEmoji Statistics:")
print(f"Total emojis used: {len(all_emojis):,}")
print(f"Unique emojis used: {len(emoji_counts):,}")
print(f"Tweets with emojis: {len(df[df['emojis'].str.len() > 0]):,}")
print(f"Percentage of tweets with emojis: {(len(df[df['emojis'].str.len() > 0]) / len(df) * 100):.1f}%")

print("\nTop 20 Emojis with Counts:")
for emoji_char, count in top_emojis.items():
    print(f"{emoji_char}: {count:,} times")

# Emoji usage over time
df['year_month'] = df['created_at'].dt.strftime('%Y-%m')
monthly_emoji_counts = df[df['emojis'].str.len() > 0].groupby('year_month').size()

# Create time series plot
fig2 = go.Figure(data=[
    go.Scatter(
        x=monthly_emoji_counts.index,
        y=monthly_emoji_counts.values,
        mode='lines+markers',
        line=dict(color='orange'),
        name='Tweets with Emojis'
    )
])

fig2.update_layout(
    title={
        'text': 'Emoji Usage Over Time',
        'x': 0.5,
        'font': {'size': 20}
    },
    xaxis_title="Month",
    yaxis_title="Number of Tweets with Emojis",
    width=1000,
    height=400,
    xaxis=dict(tickangle=45),
    yaxis=dict(gridcolor='lightgray'),
    plot_bgcolor='white'
)

fig2.show()

# Print time-based statistics
print("\nEmoji Usage Patterns:")
# By year
df['year'] = df['created_at'].dt.year
yearly_stats = df.groupby('year').agg({
    'emojis': [
        ('tweets_with_emojis', lambda x: sum(x.str.len() > 0)),
        ('total_tweets', 'size')
    ]
}).round(2)
yearly_stats.columns = yearly_stats.columns.droplevel()

print("\nYearly Emoji Usage:")
for year, row in yearly_stats.iterrows():
    percentage = (row['tweets_with_emojis'] / row['total_tweets'] * 100)
    print(f"{year}: {row['tweets_with_emojis']:,} tweets with emojis out of {row['total_tweets']:,} total tweets ({percentage:.1f}%)")


Emoji Statistics:
Total emojis used: 30,428
Unique emojis used: 267
Tweets with emojis: 7,574
Percentage of tweets with emojis: 11.0%

Top 20 Emojis with Counts:
😂: 5,926 times
😭: 4,464 times
😳: 3,757 times
🤣: 1,859 times
🥳: 1,759 times
🤦: 1,153 times
♂: 1,092 times
❤: 842 times
🤬: 757 times
👍: 652 times
🤩: 624 times
🙏: 513 times
😉: 468 times
💔: 352 times
😍: 333 times
😡: 331 times
🚩: 267 times
🤮: 246 times
🙄: 207 times
😅: 181 times



Emoji Usage Patterns:

Yearly Emoji Usage:
2016: 6 tweets with emojis out of 497 total tweets (1.2%)
2017: 1 tweets with emojis out of 59 total tweets (1.7%)
2018: 0 tweets with emojis out of 59 total tweets (0.0%)
2019: 1 tweets with emojis out of 31 total tweets (3.2%)
2020: 76 tweets with emojis out of 763 total tweets (10.0%)
2021: 1,994 tweets with emojis out of 27,960 total tweets (7.1%)
2022: 2,153 tweets with emojis out of 14,945 total tweets (14.4%)
2023: 1,971 tweets with emojis out of 11,888 total tweets (16.6%)
2024: 1,372 tweets with emojis out of 12,799 total tweets (10.7%)


## Tweets by Language

In [None]:
df.lang.value_counts()

Unnamed: 0_level_0,count
lang,Unnamed: 1_level_1
iw,64574
qme,2694
und,547
en,449
ru,298
qam,239
zxx,65
art,21
in,17
ro,17
