In [15]:
import pandas as pd
import numpy as np
import yaml

def load_data(file_path, config):
    """Read the posts CSV and prepare it for analysis.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.
        config: Parsed configuration. Currently not read by this function;
            the party -> colour mapping that used ``config["party_color_map"]``
            is disabled.

    Returns:
        DataFrame in which rows without a ``hash`` are removed, the former row
        labels are kept in an ``index`` column, and ``timestamp`` is converted
        from epoch seconds to datetimes.
    """
    return (
        pd.read_csv(file_path)
        .dropna(subset=["hash"])
        # No drop=True: the old row labels are kept as an "index" column.
        .reset_index()
        .assign(timestamp=lambda posts: pd.to_datetime(posts["timestamp"], unit="s"))
    )

# Read the notebook configuration (path is relative to the working directory).
with open("../config.yaml", "r") as file:
    config = yaml.safe_load(file)

df = load_data("../data/posts.csv", config)

# Quick sanity checks on the loaded posts.
print(df['party'].unique())
print("Length:", len(df.index))
print(len(df['user_id'].unique()))  # distinct user_id values (a NaN, if present, counts as one)

# The preview goes last: Jupyter only renders a bare expression when it is the
# final statement of the cell; anywhere else it is evaluated and thrown away.
df.head()

[nan 'DIE LINKE' 'FDP' 'DIE GRÜNEN' 'SPD' 'AFD' 'CDU/CSU']
Length: 84113
23282


In [7]:
import plotly.express as px

# Tally the posts per platform as a two-column frame: platform, count.
platform_counts = (
    df["platform"]
    .value_counts()
    .rename_axis("platform")
    .reset_index(name="count")
)
print(platform_counts)

fig = px.bar(
    platform_counts,
    x="platform",
    y="count",
    title="Posts per Platform",
)
fig.show()

  platform  count
0       ig  34894
1       fb  33816
2       tw  15403


In [8]:
# Compare the number of distinct image ids with the number of distinct hashes.
unique_imgs = len(df["img_id"].unique())
unique_hashes = len(df["hash"].unique())
print(f"Unique images posted: {unique_imgs} \nUnique hashes: {unique_hashes}")

# Number of posts per hash.
value_counts = df['hash'].value_counts()
# For every share count, how many hashes have exactly that count
# (computed here but not plotted in this cell).
frequency_counts = value_counts.value_counts()

# One row per hash: 'Value' is the hash, 'Frequency' its number of posts.
value_counts_df = value_counts.reset_index()
value_counts_df.columns = ['Value', 'Frequency']

fig = px.histogram(
    value_counts_df,
    x='Frequency',
    title='Number of image hash frequencies',
    # `labels` is keyed by column name, so the plotted column 'Frequency'
    # is the key that actually relabels the x axis.
    labels={'Frequency': 'Posts per image hash'},
    log_y=True,  # most hashes occur once; a log scale keeps the tail visible
)
fig.show()

Unique images posted: 84113 
Unique hashes: 70269


Filter out image hashes that appear in fewer than k posts

In [9]:
def min_k_shares(df, k):
    """Keep only the posts whose image hash occurs in at least ``k`` posts.

    Args:
        df: DataFrame with a ``hash`` column. It may or may not carry the
            bookkeeping columns ``index`` / ``level_0`` left behind by an
            earlier ``reset_index()``.
        k: Minimum number of rows a hash must appear in to be kept.

    Returns:
        A new DataFrame with the surviving rows, a fresh 0..n-1 index and
        without the ``index`` / ``level_0`` columns.
    """
    hash_counts = df['hash'].value_counts()
    mask = df['hash'].isin(hash_counts[hash_counts >= k].index)
    # drop=True discards the old row labels instead of inserting them as a
    # column, so the result does not depend on whether "index" already exists.
    filtered_df = df[mask].reset_index(drop=True)
    # errors="ignore" lets the function run on frames that lack these columns,
    # e.g. on its own output.
    return filtered_df.drop(columns=["level_0", "index"], errors="ignore")

# Number of posts whose image hash occurs in at least 4 posts.
print(min_k_shares(df, k=4).shape[0])

7039


Filter out image hashes that appear on fewer than k different platforms

In [10]:
# k ... number of different platforms an image has to be posted on
def min_k_platforms(df, k):
    """Keep only the rows whose hash was posted on at least ``k`` platforms.

    Args:
        df: DataFrame with ``hash`` and ``platform`` columns.
        k: Minimum number of distinct ``platform`` values required per hash.

    Returns:
        The matching rows of ``df``; the row labels are left as they were.
    """
    platforms_per_hash = df.groupby('hash')['platform'].nunique()
    widely_shared = platforms_per_hash[platforms_per_hash.ge(k)].index
    keep_row = df['hash'].isin(widely_shared)
    return df[keep_row]

# Posts whose image hash shows up on at least 3 distinct platforms.
filtered_df = min_k_platforms(df, k=3)
print(len(filtered_df.index))               # number of such posts
print(len(pd.unique(filtered_df["hash"])))  # number of distinct hashes among them

1035
130


In [11]:
# Apply both filters in sequence: hashes with at least 10 posts, and of those
# only the hashes seen on at least 3 distinct platforms.
filtered_df = min_k_platforms(min_k_shares(df, k=10), k=3)
print(filtered_df.shape[0])

# Posts per hash, most frequent first.
hash_counts = df['hash'].value_counts()
print(hash_counts)
# All posts of one specific hash.
print(df.loc[df['hash'] == '4303f4766d8e9d32'])

630
hash
4303f4766d8e9d32    108
2551a5b85add964b     54
2996cc4b965956ec     53
e9691c869ce7161e     53
a95394ad16ed1396     47
                   ... 
c15ccea6768aa596      1
3371d385a12d4f47      1
8fe8dc2226932fa6      1
bb4d32a9164ba956      1
f7c319788bc91d06      1
Name: count, Length: 70269, dtype: int64
       index           user_id                          id_user_post  \
380      380   100044624100412    fb_100044624100412_402057477958386   
396      396   100044966265501    fb_100044966265501_394876042021268   
513      513   100058163614308    fb_100058163614308_267505951864857   
1020    1021   100539700021110   fb_100539700021110_6139030859505267   
1090    1091  1005969872851982  fb_1005969872851982_4237135969735340   
...      ...               ...                                   ...   
46215  46384   805370516512304   fb_805370516512304_1428490264200323   
46254  46423       80635590663      fb_80635590663_10159304583575664   
46258  46427   806529486112063   fb_80

Timeline of one image across platforms

In [12]:
selected_hash = '2551a5b85add964b'
df_one_hash = df[df['hash'] == selected_hash]

# Fixed bottom-to-top order of the platforms on the y axis.
platform_order = ['ig', 'fb', 'tw']  # Change to your specific platform codes if different

# Sort chronologically so the line connects posts in temporal order, and make
# `platform` an ordered categorical. `.assign` builds a new frame instead of
# writing a column into a frame derived from a slice of `df`.
df_filtered = (
    df_one_hash
    .sort_values(by='timestamp')
    .assign(
        platform=lambda posts: pd.Categorical(
            posts['platform'], categories=platform_order, ordered=True
        )
    )
)

# Base figure: a dark grey line through temporally succeeding posts.
fig = px.line(
    df_filtered,
    x='timestamp',
    y='platform',  # Use platform directly (no jitter)
    line_shape='linear'
).update_traces(line_color='darkgrey')

# Scatter layer: bigger dots, coloured by platform. Only its traces are merged
# into `fig` below, so figure-level options (title, size) are set on `fig`.
scatter_fig = px.scatter(
    df_filtered,
    x='timestamp',             # Time of the post
    y='platform',              # Use platform directly (no jitter)
    color='platform',          # One colour per platform
    hover_data=['img_id', 'party', 'name'],  # Additional hover data
    labels={'platform': 'Platform'},
).update_traces(marker=dict(size=12))  # Increase marker size here

# Combine the two plots: line first, dots on top.
fig.add_traces(scatter_fig.data)

fig.update_layout(
    # Title and size live here because the scatter figure's layout is not
    # carried over by add_traces.
    title=f'Timeline of Reposts for Image Hash: {selected_hash}',
    height=600,
    width=1000,
    yaxis_title='Platform',
    # Pin the category order itself rather than relabelling tick positions:
    # overriding tick text by position shows wrong platform names whenever the
    # axis orders its categories differently from `platform_order`.
    yaxis=dict(categoryorder='array', categoryarray=platform_order),
    xaxis_title='Time',
    legend_title='Platform',
    hovermode='closest',
)

# Show the plot
fig.show()

In [13]:
import plotly.graph_objects as go

# Rank hashes by number of posts and keep the posts of the ten most frequent ones.
hash_counts = df['hash'].value_counts()
top_10_hashes = hash_counts.head(10)
most_shared_images = df[df['hash'].isin(top_10_hashes.index)]

# Posts per (hash, platform) pair ...
counts = most_shared_images.groupby(['hash', 'platform'])['platform'].count()

# ... pivoted to one row per hash and one column per platform (0 where a pair never occurs).
df_temp = counts.unstack(fill_value=0)

# Bar colour for each platform code.
platform_colors = dict(
    fb='#4267B2',    # Facebook's blue
    ig='#E4405F',    # Instagram's reddish-pink
    tw='#1DA1F2',    # Twitter's light blue
)

# One bar trace per platform column; stacking is configured in the layout below.
fig = go.Figure()
for platform in df_temp.columns:
    fig.add_trace(
        go.Bar(
            name=platform,                            # legend entry
            x=df_temp.index,                          # hashes
            y=df_temp[platform],                      # posts on this platform
            marker_color=platform_colors[platform],
        )
    )

fig.update_layout(
    title="Stacked Bar Chart of Platform Counts by Hash",
    barmode='stack',
    xaxis={'categoryorder': 'total descending'},  # tallest stack first
    xaxis_title="Hash",
    yaxis_title="Count",
)

# Show the plot
fig.show()