In [None]:
import pandas as pd
import re
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Input files, resolved relative to the notebook's working directory.
FEATURES_CSV_PATH = 'Dataset/Features_For_Traditional_ML_Techniques.csv'
PROCESSED_JSON_PATH = 'Processed/processed_all.json'

# Two separately stored tables; a later cell joins them on the row index.
features_df = pd.read_csv(FEATURES_CSV_PATH)
processed_df = pd.read_json(PROCESSED_JSON_PATH)

In [None]:
# Row counts of both inputs, one per line (features first, then processed).
for frame in (features_df, processed_df):
    print(len(frame))

In [None]:
# Inner join on the row index: only index labels present in both frames survive.
# Column names that exist in both inputs receive the default `_x` / `_y` suffixes.
df = features_df.merge(processed_df, left_index=True, right_index=True)

In [None]:
# Inspect the merged column names (including any `_x` / `_y` suffixed duplicates).
df.columns

In [None]:
# Columns kept for the analysis, in display order.
selected_columns = [
    # text and original binary label
    'statement_x', 'BinaryNumTarget',
    # account-level metrics
    'followers_count', 'friends_count', 'favourites_count', 'statuses_count',
    'listed_count', 'following',
    # tweet-level engagement and entities
    'mentions', 'quotes', 'replies', 'retweets', 'favourites', 'hashtags', 'URLs',
    # word statistics
    'unique_count', 'total_count', 'Word count', 'Max word length',
    'Min word length', 'Average word length',
    # part-of-speech counts
    'present_verbs', 'past_verbs', 'adjectives', 'adverbs', 'adpositions',
    'pronouns', 'TOs', 'determiners', 'conjunctions',
    # punctuation / character counts
    'dots', 'exclamation', 'questions', 'ampersand', 'capitals', 'digits',
    'long_word_freq', 'short_word_freq',
    # processed text, keywords and target encodings
    'tweet_y', 'keywords', 'target_binary', 'target_3', 'target_5',
]

df = df[selected_columns]
df

In [None]:
# A notebook cell only renders its final expression, so each class distribution
# is printed explicitly; otherwise the first two results would be discarded.
for target_column in ("target_binary", "target_3", "target_5"):
    print(df[target_column].value_counts(), end="\n\n")

In [None]:
# One pie per target encoding, laid out side by side
target_columns = ['target_binary', 'target_3', 'target_5']

fig = make_subplots(
    rows=1,
    cols=len(target_columns),
    specs=[[{'type': 'pie'} for _ in target_columns]],
)

for col_position, target in enumerate(target_columns, start=1):
    # Class frequencies: index holds the class labels, values hold the counts
    class_counts = df[target].value_counts()
    fig.add_trace(
        go.Pie(labels=class_counts.index, values=class_counts.values, name=target),
        row=1,
        col=col_position,
    )

# Figure-wide heading, drawn as a paper-anchored annotation above the pies
heading = dict(
    text='Distribution of Binary and Multiclass <i>(3 and 5 Label)</i> Target',
    showarrow=False,
    xref='paper',
    yref='paper',
    x=0.5,
    y=1.15,
    xanchor='center',
    yanchor='bottom',
    font=dict(size=16),
)

fig.update_layout(annotations=[heading], width=800, height=350)

fig.show()

In [None]:
# Grouping Aggregate Function
def create_grouped_df(df, group_column, columns_to_average, decimals=2):
    """Return the per-group mean of several columns as a single DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    group_column : str
        Column whose distinct values define the groups.
    columns_to_average : iterable of str
        Columns to average within each group. Repeated names are collapsed
        to their first occurrence.
    decimals : int, default 2
        Number of decimal places the means are rounded to.

    Returns
    -------
    pandas.DataFrame
        One row per group, ordered by group key: ``group_column`` first,
        followed by one rounded-mean column per requested column, in the
        order requested. An empty DataFrame is returned when no columns are
        requested.
    """
    # dict.fromkeys keeps first-seen order while dropping repeated names
    columns = list(dict.fromkeys(columns_to_average))
    if not columns:
        return pd.DataFrame()

    # A single groupby pass over all columns replaces one groupby per column
    # followed by a chain of merges on the group key.
    return (
        df.groupby(group_column)[columns]
        .mean()
        .round(decimals)
        .reset_index()
    )

In [None]:
# Mean engagement per class of the binary target (rounded to the default 2 decimals)
engagement_metrics = ['favourites', 'replies', 'retweets', 'quotes']
engagement_df = create_grouped_df(
    df,
    group_column='target_binary',
    columns_to_average=engagement_metrics,
)
engagement_df

In [None]:
# Engagement Metric Distributions
engagement_df = create_grouped_df(df, 'target_binary', ['favourites', 'replies', 'retweets', 'quotes'])

# (column in engagement_df, panel title); the title doubles as the trace name
engagement_panels = [
    ('favourites', 'Likes'),
    ('replies', 'Replies'),
    ('retweets', 'Retweets'),
    ('quotes', 'Quotes (Retweets with Comments)'),
]

fig = make_subplots(
    rows=1,
    cols=len(engagement_panels),
    specs=[[{'type': 'pie'} for _ in engagement_panels]],
    subplot_titles=tuple(panel_title for _, panel_title in engagement_panels),
)

# Each pie splits one averaged metric between the target_binary classes
for col_position, (metric, panel_title) in enumerate(engagement_panels, start=1):
    pie = go.Pie(
        labels=engagement_df['target_binary'],
        values=engagement_df[metric],
        name=panel_title,
        textinfo='label+percent',
        hoverinfo='label+value',
    )
    fig.add_trace(pie, row=1, col=col_position)

fig.update_layout(width=1000, height=400, showlegend=False)

fig.show()

In [None]:
# User Metric Distributions
# 'following' is averaged along with the others but has no panel below.
user_df = create_grouped_df(
    df,
    'target_binary',
    ['followers_count', 'friends_count', 'favourites_count', 'statuses_count', 'following'],
)

# (column in user_df, panel title); the title doubles as the trace name
user_panels = [
    ('followers_count', 'Followers'),
    ('friends_count', 'Friends'),
    ('favourites_count', 'Σ Likes'),
    ('statuses_count', 'Σ Tweets'),
]

fig = make_subplots(
    rows=1,
    cols=len(user_panels),
    specs=[[{'type': 'pie'} for _ in user_panels]],
    subplot_titles=tuple(panel_title for _, panel_title in user_panels),
)

# Each pie splits one averaged account metric between the target_binary classes
for col_position, (metric, panel_title) in enumerate(user_panels, start=1):
    pie = go.Pie(
        labels=user_df['target_binary'],
        values=user_df[metric],
        name=panel_title,
        textinfo='label+percent',
        hoverinfo='label+value',
    )
    fig.add_trace(pie, row=1, col=col_position)

fig.update_layout(width=1000, height=400, showlegend=False)

fig.show()

In [None]:
# Language Metric Distributions
# All eight word statistics are averaged here; four are plotted in this cell
# and the following cell plots three more from the same frame.
word_metrics = [
    'Word count', 'Max word length', 'Min word length', 'Average word length',
    'long_word_freq', 'short_word_freq', 'unique_count', 'total_count',
]
words_df = create_grouped_df(df, 'target_binary', word_metrics)

# (column in words_df, panel title); the title doubles as the trace name
word_panels = [
    ('Max word length', 'Max-Word Length'),
    ('Min word length', 'Min-Word Length'),
    ('long_word_freq', 'Long-Word Freq'),
    ('short_word_freq', 'Short-Word Freq'),
]

fig = make_subplots(
    rows=1,
    cols=len(word_panels),
    specs=[[{'type': 'pie'} for _ in word_panels]],
    subplot_titles=tuple(panel_title for _, panel_title in word_panels),
)

for col_position, (metric, panel_title) in enumerate(word_panels, start=1):
    pie = go.Pie(
        labels=words_df['target_binary'],
        values=words_df[metric],
        name=panel_title,
        textinfo='label+percent',
        hoverinfo='label+value',
    )
    fig.add_trace(pie, row=1, col=col_position)

fig.update_layout(width=1000, height=400, showlegend=False)

fig.show()

In [None]:
# Remaining word statistics from words_df.
# (column in words_df, panel title, trace name)
word_count_panels = [
    ('Average word length', 'Average Word Length', 'Avg-Word Length'),
    ('total_count', 'Total Word Count', 'Total Number of Words'),
    ('unique_count', 'Unique Words', 'Unique Words'),
]

fig = make_subplots(
    rows=1,
    cols=len(word_count_panels),
    specs=[[{'type': 'pie'} for _ in word_count_panels]],
    subplot_titles=tuple(panel_title for _, panel_title, _ in word_count_panels),
)

for col_position, (metric, _, trace_name) in enumerate(word_count_panels, start=1):
    pie = go.Pie(
        labels=words_df['target_binary'],
        values=words_df[metric],
        name=trace_name,
        textinfo='label+percent',
        hoverinfo='label+value',
    )
    fig.add_trace(pie, row=1, col=col_position)

fig.update_layout(width=800, height=400, showlegend=False)

fig.show()

In [None]:
# Grammar / punctuation metric distributions
# (column in df, panel title). The title is also used as the trace name so
# every trace is labelled with the metric it actually shows.
grammar_panels = [
    ('present_verbs', 'Present Verbs'),
    ('past_verbs', 'Past Verbs'),
    ('adjectives', 'Adjectives'),
    ('adverbs', 'Adverbs'),
    ('adpositions', 'Adpositions'),
    ('pronouns', 'Pronouns'),
    ('TOs', 'TOs'),
    ('determiners', 'Determiners'),
    ('conjunctions', 'Conjunctions'),
    ('dots', 'Dots (.)'),
    ('exclamation', 'Exclamations (!)'),
    ('questions', 'Questions (?)'),
    ('ampersand', 'Ampersands (&)'),
    ('capitals', 'Capitals'),
    ('digits', 'Digits'),
    ('URLs', 'URLs'),
]
GRID_ROWS, GRID_COLS = 2, 8

grammer_df = create_grouped_df(df, 'target_binary', [metric for metric, _ in grammar_panels])

fig = make_subplots(
    rows=GRID_ROWS,
    cols=GRID_COLS,
    specs=[[{'type': 'pie'} for _ in range(GRID_COLS)] for _ in range(GRID_ROWS)],
    subplot_titles=tuple(panel_title for _, panel_title in grammar_panels),
)

# Fill the grid row by row, in the same order as the subplot titles
for position, (metric, panel_title) in enumerate(grammar_panels):
    grid_row, grid_col = divmod(position, GRID_COLS)
    pie = go.Pie(
        labels=grammer_df['target_binary'],
        values=grammer_df[metric],
        name=panel_title,
        textinfo='label+percent',
        hoverinfo='label+value',
    )
    fig.add_trace(pie, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    width=1200,
    height=450,
    showlegend=False,
    # extra top margin leaves room for the first row of subplot titles
    margin=dict(l=20, r=20, t=85, b=20),
)

fig.show()


In [None]:
def count_specific_words_separate_columns(df, column_name):
    """Count placeholder-token occurrences in a text column, one column per token.

    Every value of ``df[column_name]`` is converted with ``str()`` and searched
    case-insensitively for the substrings ``MONEY``, ``URL``, ``PERCENT`` and
    ``NUM``. The number of matches per row is stored in ``money_count``,
    ``url_count``, ``percent_count`` and ``num_count`` respectively.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is not modified.
    column_name : str
        Name of the column whose values are searched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the four count columns added (or overwritten).
    """
    # Token to search for -> name of the column receiving its per-row count
    words_to_columns = {
        'MONEY': 'money_count',
        'URL': 'url_count',
        'PERCENT': 'percent_count',
        'NUM': 'num_count'
    }

    # Work on a copy so the caller's frame is not silently altered
    counted = df.copy()
    source_text = df[column_name]

    for word, column in words_to_columns.items():
        # NOTE(review): matching is substring-based and case-insensitive, so
        # e.g. "num" inside "number" is counted too; confirm this is intended.
        pattern = re.compile(word, re.IGNORECASE)
        counted[column] = source_text.apply(lambda value: len(pattern.findall(str(value))))

    return counted

In [None]:
# Add the MONEY / URL / PERCENT / NUM occurrence counts derived from the tweet text
count_df = count_specific_words_separate_columns(df, column_name='tweet_y')
count_df

In [None]:
# Placeholder-token count distributions (MONEY / PERCENT / NUM / URL)
# NOTE: this rebinds `words_df`, which earlier cells use for the word-length
# metrics; re-run the earlier create_grouped_df cell before re-plotting those.
words_df = create_grouped_df(count_df, 'target_binary', ['money_count', 'percent_count', 'num_count', 'url_count'])

# (column in words_df, panel title, trace name): one panel per aggregated
# column, so the grid has as many columns as there are titles.
count_panels = [
    ('money_count', 'MONEY Count', 'MONEY'),
    ('percent_count', 'PERCENT Count', 'PERCENT'),
    ('num_count', 'NUM Count (excluding years)', 'NUM'),
    ('url_count', 'URL Count', 'URL'),
]

fig = make_subplots(
    rows=1,
    cols=len(count_panels),
    specs=[[{'type': 'pie'} for _ in count_panels]],
    subplot_titles=tuple(panel_title for _, panel_title, _ in count_panels),
)

for col_position, (metric, _, trace_name) in enumerate(count_panels, start=1):
    pie = go.Pie(
        labels=words_df['target_binary'],
        values=words_df[metric],
        name=trace_name,
        textinfo='label+percent',
        hoverinfo='label+value',
    )
    fig.add_trace(pie, row=1, col=col_position)

fig.update_layout(
    width=1000,  # same width as the other four-panel figures
    height=400,
    showlegend=False,
)

fig.show()