In [108]:
import json
import ast

import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Use the "simple_white" template as the default for every Plotly figure.
pio.templates.default = "simple_white"

# NOTE(review): this silences every Python warning for the rest of the session,
# including pandas diagnostics such as SettingWithCopyWarning — consider
# narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [109]:
# Load the preprocessed tweet export and derive the calendar features used by
# the plots below. The frame is rebuilt from the CSV in a single chain, so
# re-running this cell always yields the same result.
df_tweet_processed = (
    pd.read_csv('./data/total_tweet_data_preprocessed.csv')
    # "Unnamed: 0" is a leftover unnamed column from the CSV export.
    .drop(columns="Unnamed: 0")
    .assign(
        # Parse the timestamp once and drop any timezone information so the
        # values are naive datetimes. (The original parsed this column a
        # second time, which had no effect.)
        created_at_dt_thtz=lambda d: pd.to_datetime(d['created_at_dt_thtz']).dt.tz_localize(None),
        created_at_date_thtz=lambda d: pd.to_datetime(d['created_at_date_thtz']),
        # "YYYY-MM" bucket used as the x-axis of the monthly trend plot.
        created_year_month=lambda d: d['created_at_date_thtz'].dt.strftime('%Y-%m'),
        # Date conversion: month, weekday and hour taken from the timestamp.
        created_at_dt_month=lambda d: d['created_at_dt_thtz'].dt.month,
        created_at_dt_month_name=lambda d: d['created_at_dt_thtz'].dt.month_name(),
        created_at_dateweek=lambda d: d['created_at_dt_thtz'].dt.weekday,
        created_at_dateweek_name=lambda d: d['created_at_dt_thtz'].dt.day_name(),
        created_at_dt_hour=lambda d: d['created_at_dt_thtz'].dt.hour,
    )
)

In [110]:
# Count tweets per spam-mask category, keeping one row per tweet id.
count_spam = (
    df_tweet_processed
    .drop_duplicates("id")
    .groupby("Spam_Masking_Province")
    .agg({'id': 'count'})
    .reset_index()
    .rename(columns={"id": "Count Tweet", "Spam_Masking_Province": "Spam Masking"})
)

# The counts are already aggregated to one row per category, so each bar
# shows that category's "Count Tweet" value.
spam_count_plot = px.histogram(
    count_spam,
    x="Spam Masking",
    y="Count Tweet",
    color="Spam Masking",
    width=700,
    height=500,
    title="Number of Spam vs Non-Spam in Tweet",
)

# Optional export, currently disabled:
#count_spam.to_csv(f"Number of Spam vs Non-Spam in Tweet.csv", encoding  = "utf-8-sig", index = 0)
#spam_count_plot.write_html(f"Number of Spam vs Non-Spam in Tweet.html")

spam_count_plot

In [111]:
def count_plot_by_group(df: pd.DataFrame, column_name: str, save: bool = True):
    """Bar-plot the number of non-spam tweets for each value of ``column_name``.

    Only rows whose ``Spam_Masking_Province`` equals ``'None-Spam'`` are used.
    Each (tweet id, value) pair is counted once, and rows missing either the
    id or the value are ignored. Bars are ordered from most to fewest tweets.

    Parameters
    ----------
    df : pd.DataFrame
        Tweet data containing the columns ``id``, ``Spam_Masking_Province``
        and ``column_name``.
    column_name : str
        Column whose distinct values become the bars of the chart.
    save : bool, default True
        When True (the original behaviour), write the counts to
        ``"Number of Tweets by {column_name}.csv"`` and the figure to
        ``"Number of Tweets by {column_name}.html"`` in the working directory.
        Pass False to build the figure without touching the file system.

    Returns
    -------
    plotly.graph_objects.Figure
        The bar chart; the caller decides whether to display it.
    """
    non_spam = df[df['Spam_Masking_Province'] == 'None-Spam']

    # Built as one chain of new frames: no in-place mutation of a frame
    # derived from a filtered slice of the caller's data.
    tweet_counts = (
        non_spam
        .drop_duplicates(subset=['id', column_name])
        .dropna(subset=['id', column_name])
        .groupby(column_name)
        .agg({"id": "count"})
        .reset_index()
        .sort_values(by='id', ascending=False)
        .rename(columns={"id": "Tweet Count"})
    )

    # The template is passed explicitly instead of reassigning the global
    # ``pio.templates.default`` on every call.
    count_plot_by_column = px.bar(
        tweet_counts,
        x=column_name,
        y="Tweet Count",
        title=f"Number of Tweets by BKK {column_name}",
        color=column_name,
        width=700,
        height=500,
        template="simple_white",
    )

    if save:
        # utf-8-sig prefixes the CSV with a byte-order mark.
        tweet_counts.to_csv(f"Number of Tweets by {column_name}.csv", encoding="utf-8-sig", index=False)
        count_plot_by_column.write_html(f"Number of Tweets by {column_name}.html")

    return count_plot_by_column

In [112]:
# Columns that each get their own tweet-count bar chart.
# The names (including the "Bankok_" spelling) are used as column keys on the
# tweet DataFrame, so they are kept exactly as written.
target_col = [
    'Gender_Term',
    'Multiple_Partner',
    'Sex_Work',
    'Age',
    'Institute',
    'Bangkok_District',
    'Bankok_ZoneName',
    'Bankok_UrbanTerm_Mapping',
    'Misc_Term',
]

In [113]:
# Non-spam tweets, one row per (tweet id, Multiple_Partner term), with rows
# missing either value removed.
multi_partner_feature = (
    df_tweet_processed[df_tweet_processed['Spam_Masking_Province'] == "None-Spam"]
    .drop_duplicates(subset=['id', 'Multiple_Partner'])
    .dropna(subset=['id', 'Multiple_Partner'])
)

# Monthly tweet count for every Multiple_Partner term.
multi_partner_tweet_count = (
    multi_partner_feature
    .groupby(['created_year_month', 'Multiple_Partner'])
    .agg({"id": "count"})
    .reset_index()
    .rename(columns={"id": "Tweet Count"})
)

pio.templates.default = "simple_white"

# One line per term, with the "YYYY-MM" bucket on the x-axis.
count_plot_by_column = px.line(
    multi_partner_tweet_count,
    x='created_year_month',
    y="Tweet Count",
    title="Number of Tweets by Multiple Partner Term",
    color='Multiple_Partner',
    width=700,
    height=500,
)

count_plot_by_column.show()

In [None]:
# Save the monthly Multiple_Partner counts and their line chart.
# NOTE(review): `count_plot_by_column` is reassigned by a later cell, so run
# this cell right after the one that builds the Multiple Partner figure.
multi_partner_tweet_count.to_csv(
    "Number of Tweet by Multiple Partner Term.csv",
    encoding="utf-8-sig",
    index=False,
)
count_plot_by_column.write_html("Number of Tweet by Multiple Partner Term.html")

In [None]:
# Non-spam tweets, one row per (tweet id, gender term, district), with rows
# missing any of the three values removed.
bkk_feature = (
    df_tweet_processed[df_tweet_processed['Spam_Masking_Province'] == "None-Spam"]
    .drop_duplicates(subset=['id', 'Gender_Term', 'Bangkok_District'])
    .dropna(subset=['id', 'Gender_Term', 'Bangkok_District'])
)

# Tweet count per (gender term, district) pair, largest first.
bkk_feature_tweet_count = (
    bkk_feature
    .groupby(['Gender_Term', 'Bangkok_District'])
    .agg({"id": "count"})
    .reset_index()
    .sort_values(by='id', ascending=False)
    .rename(columns={"id": "Tweet Count"})
)

pio.templates.default = "simple_white"

# One bar per district, coloured by gender term.
count_plot_by_column = px.bar(
    bkk_feature_tweet_count,
    x='Bangkok_District',
    y="Tweet Count",
    title="Number of Tweets by BKK District and  Gender_Term",
    color='Gender_Term',
    width=700,
    height=500,
)

count_plot_by_column.show()

In [None]:
# Save the district-by-gender counts and their bar chart.
# The double space in the file names is kept as in the original export.
bkk_feature_tweet_count.to_csv(
    "Number of Tweets by BKK District and  Gender_Term.csv",
    encoding="utf-8-sig",
    index=False,
)
count_plot_by_column.write_html("Number of Tweets by BKK District and  Gender_Term.html")

In [None]:
# Build and display the tweet-count bar chart for every target column.
# count_plot_by_group also writes a CSV and an HTML file for each column.
for colname in target_col:
    count_plot_by_group(df_tweet_processed, colname).show()