In [1]:
import pandas as pd
import numpy as np
import yaml

def load_data(file_path, config, seed=None):
    """Load the hashed-visuals CSV and attach a party label and color to every post.

    Rows without a ``hash`` value are dropped. The surviving rows are
    re-indexed with ``reset_index()``, which keeps the original row labels in
    an ``index`` column (a later cell drops that column explicitly, so it is
    preserved here). ``timestamp`` is parsed as seconds since the Unix epoch.

    The ``party`` column is drawn uniformly at random from the known party
    keys; it is not read from the data. NOTE(review): this looks like a
    placeholder until real party labels are available -- confirm.

    Parameters
    ----------
    file_path : str or path-like
        CSV file with at least the columns ``hash`` and ``timestamp``.
    config : dict
        Must contain ``config["party_color_map"]`` with one entry per party
        key (``afd``, ``spd``, ``gruene``, ``linke``, ``cdu``, ``fdp``,
        ``unknown``).
    seed : int, optional
        When given, the party labels are drawn from a dedicated generator
        seeded with this value, so repeated calls give identical labels.
        When ``None`` (default) NumPy's global random state is used, exactly
        as before.

    Returns
    -------
    pandas.DataFrame
        The cleaned posts with the added columns ``index``, ``party`` and
        ``color``.

    Raises
    ------
    KeyError
        If ``config["party_color_map"]`` lacks one of the party keys.
    """
    # Each party appears exactly once so every label is equally likely; the
    # previous list contained "linke" twice, doubling its sampling weight.
    parties = ["afd", "spd", "gruene", "linke", "cdu", "fdp", "unknown"]

    df = pd.read_csv(file_path)
    df = df.dropna(subset=["hash"]).reset_index()
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")

    if seed is None:
        df["party"] = np.random.choice(parties, size=len(df))
    else:
        df["party"] = np.random.default_rng(seed).choice(parties, size=len(df))

    # Look the colors up eagerly so a missing config entry fails here with a
    # KeyError instead of silently producing NaN colors.
    party_color_map = {party: config["party_color_map"][party] for party in parties}
    df["color"] = df["party"].map(party_color_map)
    return df

# load_data assigns party labels with NumPy's global random state; seeding it
# here makes the labels (and everything derived from them) identical on every
# Restart & Run All.
np.random.seed(42)

# The config supplies the "party_color_map" section that load_data reads.
# Explicit UTF-8 avoids depending on the platform's default text encoding.
with open("../config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

df = load_data("../data/multiplatform_hashed_visuals.csv", config)

print("Length:", len(df.index))
df.head()

Length: 84113


Unnamed: 0,index,id_user_post,user_id,name,timestamp,platform,img_id,hash,party,color
0,0,fb_100021691363_10157840358841364,100021691363,Sören Link,2021-08-16 16:05:50,fb,fb_100021691363_10157840358841364,87d5b928361ca35b,cdu,#000000
1,1,fb_100024665176055_794033051428860,100024665176055,Wissenschaftspark,2021-09-10 10:59:43,fb,fb_100024665176055_794033051428860,93776d8061df2361,spd,#EB001F
2,2,fb_100028342704825_704743473813736,100028342704825,Ahrweinshop und ahrland Dankos Weinladen,2021-09-09 00:02:50,fb,fb_100028342704825_704743473813736,37624db3e258f01b,gruene,#64A12D
3,3,fb_100038686975829_507394040560157,100038686975829,Saturday's Heroes - Der Fußball-Blog,2021-09-23 13:01:41,fb,fb_100038686975829_507394040560157,a36f5c6dc94944cc,gruene,#64A12D
4,4,fb_100038899098192_502667104373255,100038899098192,Landrat Elmar Stegmann,2021-09-07 18:09:03,fb,fb_100038899098192_502667104373255,5d0ff57e018ce548,gruene,#64A12D


In [2]:
import plotly.express as px

# Tally posts per platform as a two-column frame (platform, count),
# most frequent platform first.
platform_counts = (
    df["platform"]
    .value_counts()
    .rename_axis("platform")
    .reset_index(name="count")
)
print(platform_counts)

fig = px.bar(
    platform_counts,
    x="platform",
    y="count",
    title="Posts per Platform",
)
fig.show()

  platform  count
0       ig  34894
1       fb  33816
2       tw  15403


In [3]:
# Distinct image ids vs. distinct hashes: fewer hashes than images means
# several posted images share the same hash.
unique_imgs = len(df["img_id"].unique())
unique_hashes = len(df["hash"].unique())
print(f"Unique images posted: {unique_imgs} \nUnique hashes: {unique_hashes}")

# value_counts: number of posts per hash.
# frequency_counts: number of hashes that have each post count.
value_counts = df['hash'].value_counts()
frequency_counts = value_counts.value_counts()

value_counts_df = value_counts.reset_index()
value_counts_df.columns = ['Value', 'Frequency']

fig = px.histogram(
    value_counts_df,
    x='Frequency',
    title='Number of image hash frequencies',
    # `labels` must be keyed by a column that is actually plotted. The
    # previous key 'col' matched no column, so the mapping had no effect.
    labels={'Frequency': 'Posts per hash'},
    # Log scale: most hashes occur once, so a linear axis would hide the tail.
    log_y=True,
)
fig.show()

Unique images posted: 84113 
Unique hashes: 70269


Filter out image hashes that appear in fewer than n posts (n = 4 in the cell below)

In [4]:
# Number of posts that carry each image hash.
hash_counts = df['hash'].value_counts()

# True for every post whose hash occurs in at least 4 posts.
mask = df['hash'].map(hash_counts) >= 4

# Keep the matching posts, discard the helper "index" column that load_data
# left behind, and renumber the rows from 0.
filtered_df = (
    df.loc[mask]
    .drop(columns=["index"])
    .reset_index(drop=True)
)

# Show the filtered posts.
display(filtered_df)

Unnamed: 0,id_user_post,user_id,name,timestamp,platform,img_id,hash,party,color
0,fb_100044140134141_408439260637408,100044140134141,ruthe.de,2021-09-22 09:34:55,fb,fb_100044140134141_408439260637408,9396a4337373632a,spd,#EB001F
1,fb_100044219059922_389270122556953,100044219059922,Martin Dulig,2021-08-30 08:55:52,fb,fb_100044219059922_389270122556953,13fa2543bc05faac,spd,#EB001F
2,fb_100044268707373_417092063109747,100044268707373,Paul Ziemiak,2021-08-30 18:49:36,fb,fb_100044268707373_417092063109747,0bf15e20afa127f8,fdp,#FFED00
3,fb_100044281126723_408710297281671,100044281126723,Claudia Jung,2021-09-05 19:04:26,fb,fb_100044281126723_408710297281671,8d0670f4c3f921db,spd,#EB001F
4,fb_100044290216747_399357158217288,100044290216747,Raul Krauthausen,2021-09-02 11:00:14,fb,fb_100044290216747_399357158217288,fd47298396ec09f0,afd,#009EE0
...,...,...,...,...,...,...,...,...,...
7034,tw_994702020_1432255572838129671,994702020,,2021-08-30 00:00:00,tw,tw_994702020_1432255572838129671,2bafc1433c2a2dd6,gruene,#64A12D
7035,tw_995519538_1437172259731484672,995519538,anaudretsch,2021-09-12 00:00:00,tw,tw_995519538_1437172259731484672,dd05a5d64f21fa81,gruene,#64A12D
7036,tw_996758138752765952_1433755159381827601,996758138752765952,,2021-09-03 00:00:00,tw,tw_996758138752765952_1433755159381827601,492dcee526c99636,cdu,#000000
7037,tw_996758138752765952_1436654266706538505,996758138752765952,,2021-09-11 00:00:00,tw,tw_996758138752765952_1436654266706538505,e333c3199f321b38,spd,#EB001F


In [5]:
# Same uniqueness summary as for the full data, now on the filtered posts.
# dropna=False counts a missing value as its own entry, like len(.unique()).
unique_imgs = filtered_df["img_id"].nunique(dropna=False)
unique_hashes = filtered_df["hash"].nunique(dropna=False)
print(f"Unique images posted: {unique_imgs} \nUnique hashes: {unique_hashes}")

# Posts per hash, and how many hashes share each post count.
value_counts = filtered_df['hash'].value_counts()
frequency_counts = value_counts.value_counts()

# Two-column frame: the hash ('Value') and its number of posts ('Frequency').
value_counts_df = (
    value_counts
    .rename_axis('Value')
    .reset_index(name='Frequency')
)


Unique images posted: 7039 
Unique hashes: 995


Keep only image hashes that appear on at least n different platforms (n = 3 in the cell below)

In [10]:
# Number of distinct platforms on which each image hash was posted.
hash_platforms_counts = df.groupby('hash')['platform'].nunique()

# Keep hashes seen on at least 3 platforms. The platform tally earlier in the
# notebook lists exactly three (ig, fb, tw), so this means "on every platform".
valid_hashes = hash_platforms_counts[hash_platforms_counts >= 3].index

# NOTE: this starts again from the full `df` and overwrites the `filtered_df`
# built by the post-count filter; it keeps df's row labels and "index" column.
filtered_df = df[df['hash'].isin(valid_hashes)]

# Report the size before AND after filtering; len(df) alone never changes and
# says nothing about what the filter did.
print(f"Posts before filtering: {len(df)}, after filtering: {len(filtered_df)}")
display(filtered_df)

84113


Unnamed: 0,index,id_user_post,user_id,name,timestamp,platform,img_id,hash,party,color
48,48,fb_100044290216747_399357158217288,100044290216747,Raul Krauthausen,2021-09-02 11:00:14,fb,fb_100044290216747_399357158217288,fd47298396ec09f0,afd,#009EE0
99,99,fb_100044489377115_423837059109270,100044489377115,Katarina Barley,2021-09-17 12:54:58,fb,fb_100044489377115_423837059109270,2551a5b85add964b,cdu,#000000
134,134,fb_100044624100412_402057477958386,100044624100412,Bela B,2021-09-05 16:59:33,fb,fb_100044624100412_402057477958386,4303f4766d8e9d32,spd,#EB001F
150,150,fb_100044966265501_394876042021268,100044966265501,Wolf Maahn,2021-09-06 20:57:44,fb,fb_100044966265501_394876042021268,4303f4766d8e9d32,unknown,#404040
152,152,fb_100045139321790_389998672514774,198081108519,Max Prosa,2021-09-06 18:01:40,fb,fb_100045139321790_389998672514774,4303f4766d8e9d32,linke,#BE3075
...,...,...,...,...,...,...,...,...,...,...
83563,83748,tw_935917067689168898_1436218483260411904,935917067689168898,,2021-09-10 00:00:00,tw,tw_935917067689168898_1436218483260411904,6d46f8a687890bf8,unknown,#404040
83585,83770,tw_937678874095300609_1439677370978480128,937678874095300609,katjamast,2021-09-19 00:00:00,tw,tw_937678874095300609_1439677370978480128,9925cef626d4b464,cdu,#000000
83973,84164,tw_98327012_1438875421920092160,98327012,,2021-09-17 00:00:00,tw,tw_98327012_1438875421920092160,2551a5b85add964b,afd,#009EE0
84042,84233,tw_991397404887969792_1428289825896079366,991397404887969792,,2021-08-19 00:00:00,tw,tw_991397404887969792_1428289825896079366,2fef87927c82914a,cdu,#000000
