In [89]:
import pandas as pd
import numpy as np
import yaml

def load_data(file_path, config):
    """Load the hashed-visuals CSV and attach placeholder party labels and colours.

    Rows without a ``hash`` are dropped, ``timestamp`` is parsed from Unix epoch
    seconds, and two columns are added: ``party`` and ``color``.

    NOTE: ``party`` is drawn uniformly at random from the party keys below using
    NumPy's global random state; it is placeholder data, not a real affiliation.
    Call ``np.random.seed(...)`` beforehand for reproducible labels.

    Args:
        file_path: Path (or file-like object) of the CSV to read. It must
            contain ``hash`` and ``timestamp`` columns.
        config: Mapping with a ``"party_color_map"`` entry that provides a
            colour for every party key listed in ``parties``.

    Returns:
        pandas.DataFrame with a fresh integer index. The row labels from before
        the NaN filter are kept in an ``index`` column (``reset_index`` is called
        without ``drop``), followed by the CSV columns, ``party`` and ``color``.

    Raises:
        KeyError: If the CSV lacks a ``hash``/``timestamp`` column or the
            configured colour map lacks one of the party keys.
    """
    df = pd.read_csv(file_path)
    df = df.dropna(subset=["hash"]).reset_index()
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")

    # Single source of truth for the party keys: the same list feeds the random
    # sampler and the colour lookup. Each party appears exactly once, so every
    # party is drawn with equal probability.
    parties = ["afd", "spd", "gruene", "linke", "cdu", "fdp", "unknown"]
    configured_colors = config["party_color_map"]
    party_color_map = {party: configured_colors[party] for party in parties}

    df["party"] = np.random.choice(parties, size=len(df))
    df["color"] = df["party"].map(party_color_map)
    return df

# Read the project configuration; load_data takes the party colours from it.
with open("../config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Build the working frame that the cells below operate on.
df = load_data("../data/multiplatform_hashed_visuals.csv", config)

print("Length:", df.shape[0])
df.head()

Length: 84113


Unnamed: 0,index,id_user_post,user_id,name,timestamp,platform,img_id,hash,party,color
0,0,fb_100021691363_10157840358841364,100021691363,Sören Link,2021-08-16 16:05:50,fb,fb_100021691363_10157840358841364,87d5b928361ca35b,spd,#EB001F
1,1,fb_100024665176055_794033051428860,100024665176055,Wissenschaftspark,2021-09-10 10:59:43,fb,fb_100024665176055_794033051428860,93776d8061df2361,fdp,#FFED00
2,2,fb_100028342704825_704743473813736,100028342704825,Ahrweinshop und ahrland Dankos Weinladen,2021-09-09 00:02:50,fb,fb_100028342704825_704743473813736,37624db3e258f01b,linke,#BE3075
3,3,fb_100038686975829_507394040560157,100038686975829,Saturday's Heroes - Der Fußball-Blog,2021-09-23 13:01:41,fb,fb_100038686975829_507394040560157,a36f5c6dc94944cc,spd,#EB001F
4,4,fb_100038899098192_502667104373255,100038899098192,Landrat Elmar Stegmann,2021-09-07 18:09:03,fb,fb_100038899098192_502667104373255,5d0ff57e018ce548,linke,#BE3075


In [90]:
import plotly.express as px

# Number of posts per platform as a two-column frame: platform, count.
platform_counts = (
    df["platform"]
    .value_counts()
    .rename_axis("platform")
    .reset_index(name="count")
)
print(platform_counts)

fig = px.bar(
    platform_counts,
    x="platform",
    y="count",
    title="Posts per Platform",
)
fig.show()

  platform  count
0       ig  34894
1       fb  33816
2       tw  15403


In [91]:
# Compare the number of distinct image ids with the number of distinct hashes.
unique_imgs = df["img_id"].unique().size
unique_hashes = df["hash"].unique().size
print(f"Unique images posted: {unique_imgs} \nUnique hashes: {unique_hashes}")

# Posts per hash, and how many hashes share each post count.
value_counts = df['hash'].value_counts()
frequency_counts = value_counts.value_counts()  # not used again within this cell

# Two-column frame: the hash ('Value') and the number of posts carrying it ('Frequency').
value_counts_df = value_counts.rename_axis('Value').reset_index(name='Frequency')

# NOTE(review): the labels mapping targets a column named 'col', which this
# frame does not have, so it appears to have no effect - confirm the intended key.
fig = px.histogram(
    value_counts_df,
    x='Frequency',
    title='Number of image hash frequencies',
    labels={'col': 'Value'},
    log_y=True,
)
fig.show()

Unique images posted: 84113 
Unique hashes: 70269


Filter out image hashes that appear in fewer than k posts

In [92]:
def min_k_shares(df, k):
    """Keep only the posts whose image hash occurs in at least ``k`` posts.

    Args:
        df: Frame with a ``hash`` column. It may or may not carry the helper
            ``index`` / ``level_0`` columns left behind by earlier
            ``reset_index()`` calls.
        k: Minimum number of rows that must share a hash for its rows to be kept.

    Returns:
        A new frame containing the surviving rows in their original order,
        renumbered 0..n-1, with any ``index`` / ``level_0`` helper column removed.
    """
    hash_counts = df['hash'].value_counts()
    frequent_hashes = hash_counts[hash_counts >= k].index
    filtered_df = df[df['hash'].isin(frequent_hashes)]

    # drop=True renumbers the rows without turning the old labels into a new
    # column. errors="ignore" lets the function also accept frames that never
    # had the helper columns (for example its own output), where an
    # unconditional drop would raise KeyError.
    return (
        filtered_df
        .reset_index(drop=True)
        .drop(columns=["level_0", "index"], errors="ignore")
    )

# Number of posts whose image hash is shared by at least four posts.
print(min_k_shares(df, k=4).shape[0])

7039


Filter out image hashes that do not appear on at least k different platforms

In [93]:
def min_k_platforms(df, k):
    """Keep only the posts whose image hash was posted on at least ``k`` platforms.

    Args:
        df: Frame with ``hash`` and ``platform`` columns.
        k: Number of different platforms an image has to be posted on.

    Returns:
        The matching rows of ``df`` with their original row labels (the index
        is not renumbered).
    """
    platforms_per_hash = df.groupby('hash')['platform'].nunique()
    widespread = platforms_per_hash.loc[platforms_per_hash.ge(k)].index
    keep_row = df['hash'].isin(widespread)
    return df.loc[keep_row]

# Posts (first number) and distinct hashes (second number) that appear on at least three platforms.
filtered_df = min_k_platforms(df, k=3)
print(len(filtered_df))
print(filtered_df["hash"].unique().size)

1035
130


In [94]:
# Chain both filters: hashes shared in at least 10 posts, then on at least 3 platforms.
filtered_df = min_k_platforms(min_k_shares(df, k=10), k=3)
print(filtered_df.shape[0])

# Posts per hash over the whole data set, most frequent first.
hash_counts = df['hash'].value_counts()
print(hash_counts)

# Show every post that carries one specific hash.
print(df.loc[df['hash'].eq('4303f4766d8e9d32')])

630
hash
4303f4766d8e9d32    108
2551a5b85add964b     54
e9691c869ce7161e     53
2996cc4b965956ec     53
6d46f8a687890bf8     47
                   ... 
7d2fa170618e17cc      1
217d60f5670f2e2a      1
e3770d01c45ebb89      1
01afaaf80ef0dc74      1
7d7e978886508fb0      1
Name: count, Length: 70269, dtype: int64
       index                         id_user_post          user_id  \
134      134   fb_100044624100412_402057477958386  100044624100412   
150      150   fb_100044966265501_394876042021268  100044966265501   
152      152   fb_100045139321790_389998672514774     198081108519   
267      267   fb_100058163614308_267505951864857  100058163614308   
712      712  fb_100539700021110_6139030859505267  100539700021110   
...      ...                                  ...              ...   
33404  33420     fb_93711056469_10157962434301470      93711056469   
37176  37192        ig_belab_official_CTcci-Now-G   belab_official   
77817  77918    tw_2849808231_1441871192256827393       

Timeline of one image across platforms

In [95]:
# Hash whose posts are plotted over time.
selected_hash = '2551a5b85add964b'
df_one_hash = df[df['hash'] == selected_hash]

# Define the order of platforms on the y axis
platform_order = ['ig', 'fb', 'tw']  # Change to your specific platform codes if different

# Sort by timestamp so the connecting line follows posting order, and store the
# platform as an ordered categorical. .assign() builds a new frame, so nothing
# is written into a frame derived from a slice of df.
df_filtered = df_one_hash.sort_values(by='timestamp').assign(
    platform=lambda posts: pd.Categorical(
        posts['platform'], categories=platform_order, ordered=True
    )
)

# Line trace in dark grey connecting chronologically consecutive posts (drawn first)
fig = px.line(
    df_filtered,
    x='timestamp',
    y='platform',  # Use platform directly (no jitter)
    category_orders={'platform': platform_order},
    line_shape='linear'
).update_traces(line_color='darkgrey')

# Scatter traces drawn on top: bigger dots, one colour per platform.
# Only the traces of this figure are reused below, so figure-level settings
# (title, size) are applied to `fig` in update_layout, not here.
scatter_fig = px.scatter(
    df_filtered,
    x='timestamp',             # Time of the post
    y='platform',              # Use platform directly (no jitter)
    color='platform',          # Color each point by platform
    category_orders={'platform': platform_order},  # Stable trace/legend order
    hover_data=['img_id', 'party', 'name'],  # Additional hover data
    labels={'platform': 'Platform'},
).update_traces(marker=dict(size=12))  # Increase marker size here

# Combine the two plots: Line first, dots on top
fig.add_traces(scatter_fig.data)

# Update layout for better readability
fig.update_layout(
    title=f'Timeline of Reposts for Image Hash: {selected_hash}',
    height=600,
    width=1000,
    yaxis_title='Platform',
    # Fix the category order itself instead of overriding the tick labels:
    # relabelling positions 0..2 would mislabel the rows whenever the axis
    # places the categories in a different order than platform_order.
    yaxis=dict(categoryorder='array', categoryarray=platform_order),
    xaxis_title='Time',
    legend_title='Platform',
    hovermode='closest',
)

# Show the plot
fig.show()