In [None]:
#@title Twitter Account Classification Using Account Metadata: Organization vs. Individual Paper Interactive Data Analysis
#@markdown The collected Humanizr dataset consists of 17,790 user accounts, of which 16,012 are labeled as individuals and 1,778 as organizations. The Demographer dataset contains 214,236 accounts, of which 185,224 are labeled as individuals and 29,012 as organizations. Since the Demographer dataset includes the Humanizr dataset, the figures in the following sections are based on the Demographer dataset.
#@markdown <br/> <br/> NOTE: Run this cell once to download and preprocess the dataset. Double-click this cell to view the code.
%%capture
import numpy as np
import pandas as pd
import plotly.express as px
import os
from google_drive_downloader import GoogleDriveDownloader as gdd

pd.set_option('display.max_columns', None)

def download_content(file_id, path, unzip=True, overwrite=False):
  """Download a Google Drive file to ``path`` unless it is already present.

  Args:
    file_id: Either a bare Google Drive file id, or a ``drive.google.com``
      URL. For a URL, the id is read from the ``id`` query parameter
      (``...?id=<id>&...``) or, failing that, from the path segment that
      follows ``/d/`` (``.../file/d/<id>/view``).
    path: Destination file path. A missing parent directory is created.
    unzip: Forwarded unchanged to ``gdd.download_file_from_google_drive``.
    overwrite: When False and ``path`` already exists, the download is
      skipped. Also forwarded to ``gdd.download_file_from_google_drive``.

  Raises:
    ValueError: If ``file_id`` is a Drive URL from which no id can be read.
  """
  # Function-scope import so this block does not depend on the import cell.
  from urllib.parse import parse_qs, urlparse

  if "drive.google.com" in file_id:
    url = file_id
    parsed = urlparse(url)
    # Preferred form: the id is an explicit query parameter. parse_qs splits
    # on "&", so trailing parameters such as "&export=download" are dropped.
    query_ids = parse_qs(parsed.query).get("id")
    if query_ids:
      file_id = query_ids[0]
    else:
      # Share-link form: the id is the path segment right after "d".
      segments = parsed.path.split("/")
      if "d" in segments[:-1]:
        file_id = segments[segments.index("d") + 1]
      else:
        file_id = ""
    if not file_id:
      raise ValueError("Could not find a file id in url: {}".format(url))
    print("file_id fetched from url: ", file_id)

  if os.path.exists(path) and not overwrite:
    print("Dataset is already downloaded. path: ", path)
    return

  # dirname() is "" for a bare filename; os.makedirs("") would raise, and the
  # current directory needs no creation anyway.
  folder = os.path.dirname(path)
  if folder:
    os.makedirs(folder, exist_ok=True)
  gdd.download_file_from_google_drive(file_id=file_id, dest_path=path,
                                      unzip=unzip, overwrite=overwrite)
# Fetch the Demographer CSV from Google Drive; download_content skips the
# transfer when the file is already on disk.
file_id = "1h8Fdh0ASsM3SBuw6p20bzXJTCUYHuBT1"
file_name = "demographer_all.csv"
path = "./" + file_name
download_content(file_id, path, overwrite=False)

# Load the Demographer accounts into a pandas DataFrame.
user_list_path = 'demographer_all.csv'
user_list = pd.read_csv(user_list_path)

# Cleaning, in one chain:
#   1. swap every NaN for an empty string;
#   2. keep only rows labeled 'organization' or 'individual', which drops
#      rows whose label is corrupted.
data = (
    user_list
    .replace(np.nan, '', regex=True)
    .loc[lambda frame: frame['label'].isin(['organization', 'individual'])]
)


In [None]:
#@markdown Select numeric features for x_axis and y_axis to visualize the plot and run this cell. You can toggle labels by clicking on the legend.
x_axis = 'followers' #@param ["followers", "following", "tweets", "likes", "media"]
y_axis = 'following' #@param ["followers", "following", "tweets", "likes", "media"]

# Scatter the two chosen columns against each other, one color per label;
# hovering over a point also shows the account's username.
fig = px.scatter(
    data,
    x=x_axis,
    y=y_axis,
    color="label",
    color_discrete_map={'individual': 'red', 'organization': 'royalblue'},
    hover_data=['username'],
)
# Use a larger font for the whole figure.
fig.update_layout(font={"size": 18})
fig.show()