Filter out generally irelevant posts for this applications

In [7]:
import pandas as pd

def create_base_dataframe(file_path):
    df = pd.read_csv(file_path)
    df = df.dropna(subset=["hash"]).reset_index(drop=True)
    
    # Drop rows where the image only appears once in the entire dataset
    hash_counts = df['hash'].value_counts()
    df = df[df['hash'].isin(hash_counts[hash_counts > 1].index)]
    
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")

    party_mapping = {
        'DIE LINKE': 'die_linke',
        'FDP': 'fdp',
        'DIE GRÜNEN': 'die_gruenen',
        'SPD': 'spd',
        'AFD': 'afd',
        'CDU/CSU': 'cdu_csu'
    }
    df['party'] = df['party'].map(party_mapping)
    df['party'] = df['party'].fillna('unknown')
    
    platform_mapping = {
        'fb': 'Facebook',
        'ig': 'Instagram',
        'tw': 'Twitter'
    }

    df['platform'] = df['platform'].map(platform_mapping)
    
    # Filter out accounts that have not shared at least one same image as another account
    hash_counts = df.groupby('hash')['user_id'].nunique()
    valid_hashes = hash_counts[hash_counts >= 2].index
    filtered_df = df[df['hash'].isin(valid_hashes)]

    print(filtered_df.shape[0])
    print(filtered_df)
    
    return df

df = create_base_dataframe('../data/original_posts.csv')
df.to_csv('../data/outputs/base_posts.csv', index=False)
print(df.shape[0])
df.head()

20081
                    user_id                         id_user_post  \
27          _andreaschwarz_       ig__andreaschwarz__CUE9BI8K46c   
79              _germanzero           ig__germanzero_CT35PD_qrBs   
90              _germanzero           ig__germanzero_CTFXm93KHDx   
96              _germanzero           ig__germanzero_CTObUrhq6yX   
104             _germanzero           ig__germanzero_CTh4QP8q8G6   
...                     ...                                  ...   
84025          zerocovid_de          ig_zerocovid_de_CUN7cIUsY3J   
84036  zerowaste_fuerberlin  ig_zerowaste_fuerberlin_CUMYyXUKM9D   
84041            zoe.gruene            ig_zoe.gruene_CUNWMI4M9q8   
84094    zurich_deutschland    ig_zurich_deutschland_CSoZPNBqIGj   
84096    zurich_deutschland    ig_zurich_deutschland_CUHxj6_Klez   

                     name           timestamp   platform  \
27         Andrea Schwarz 2021-09-21 10:43:30  Instagram   
79             GermanZero 2021-09-16 09:00:23  Instagram 

Unnamed: 0,user_id,id_user_post,name,timestamp,platform,img_id,hash,party
27,_andreaschwarz_,ig__andreaschwarz__CUE9BI8K46c,Andrea Schwarz,2021-09-21 10:43:30,Instagram,ig__andreaschwarz__CUE9BI8K46c,33430bfc74ab9c8c,unknown
79,_germanzero,ig__germanzero_CT35PD_qrBs,GermanZero,2021-09-16 09:00:23,Instagram,ig__germanzero_CT35PD_qrBs,074e38788687fc7c,unknown
90,_germanzero,ig__germanzero_CTFXm93KHDx,GermanZero,2021-08-27 18:04:27,Instagram,ig__germanzero_CTFXm93KHDx_1,edb10b0f9570f449,unknown
96,_germanzero,ig__germanzero_CTObUrhq6yX,GermanZero,2021-08-31 06:30:05,Instagram,ig__germanzero_CTObUrhq6yX_1,230ddcf31c1a73b4,unknown
104,_germanzero,ig__germanzero_CTh4QP8q8G6,GermanZero,2021-09-07 19:48:27,Instagram,ig__germanzero_CTh4QP8q8G6_3,937c6c83e4131ee6,unknown


Filters out posts where no party to the account is provided.

In [8]:
def df_with_parties(file_path):
    df = pd.read_csv(file_path)
    df = df[df['party'] != 'unknown']
    df = df.reset_index(drop=True)
    return df

df = df_with_parties('../data/outputs/base_posts.csv')
df.to_csv('../data/outputs/posts_with_party.csv', index=False)
print(df.shape[0])
display(df.head())

print(df['user_id'].value_counts(ascending=True))



5756


Unnamed: 0,user_id,id_user_post,name,timestamp,platform,img_id,hash,party
0,100021691363,fb_100021691363_10157840358841364,Sören Link,2021-08-16 16:05:50,Facebook,fb_100021691363_10157840358841364,87d5b928361ca35b,die_linke
1,100058135027750,fb_100058135027750_260335295914303,Wiebke Knell - FDP,2021-08-20 06:33:11,Facebook,fb_100058135027750_260335295914303,d79303dc92d6c994,fdp
2,100058135027750,fb_100058135027750_263081198973046,Wiebke Knell - FDP,2021-08-24 19:04:43,Facebook,fb_100058135027750_263081198973046,0f931b955567d4c4,fdp
3,100063616410667,fb_100063616410667_229064975890715,Das Grüne Netzwerk,2021-08-20 07:43:47,Facebook,fb_100063616410667_229064975890715,bfba838c25cc634c,die_gruenen
4,100063616410667,fb_100063616410667_236140568516489,Das Grüne Netzwerk,2021-08-30 09:03:33,Facebook,fb_100063616410667_236140568516489,3ded252303e6e3a4,die_gruenen


user_id
100440141955125       1
100453088976918       1
10150103571725193     1
1015232528672214      1
101620631633653       1
                     ..
120133508052257      32
636864479726453      33
876888638993185      46
488550411249641      46
369941321068587      48
Name: count, Length: 2316, dtype: int64


Drop rows with hashes that appear only for one platform in the entire dataset

In [9]:
def df_for_cross_platform_images(file_path):
    df = pd.read_csv(file_path)
    hash_platform_counts = df.groupby('hash')['platform'].nunique()
    valid_hashes = hash_platform_counts[hash_platform_counts > 1].index
    df = df[df['hash'].isin(valid_hashes)]
    return df
    
df = df_for_cross_platform_images('../data/outputs/base_posts.csv')
df.to_csv('../data/outputs/cross_platform_posts.csv', index=False)
print(df.shape[0])
df.head()

9310


Unnamed: 0,user_id,id_user_post,name,timestamp,platform,img_id,hash,party
0,_andreaschwarz_,ig__andreaschwarz__CUE9BI8K46c,Andrea Schwarz,2021-09-21 10:43:30,Instagram,ig__andreaschwarz__CUE9BI8K46c,33430bfc74ab9c8c,unknown
1,_germanzero,ig__germanzero_CT35PD_qrBs,GermanZero,2021-09-16 09:00:23,Instagram,ig__germanzero_CT35PD_qrBs,074e38788687fc7c,unknown
2,_germanzero,ig__germanzero_CTFXm93KHDx,GermanZero,2021-08-27 18:04:27,Instagram,ig__germanzero_CTFXm93KHDx_1,edb10b0f9570f449,unknown
3,_germanzero,ig__germanzero_CTObUrhq6yX,GermanZero,2021-08-31 06:30:05,Instagram,ig__germanzero_CTObUrhq6yX_1,230ddcf31c1a73b4,unknown
4,_germanzero,ig__germanzero_CTh4QP8q8G6,GermanZero,2021-09-07 19:48:27,Instagram,ig__germanzero_CTh4QP8q8G6_3,937c6c83e4131ee6,unknown
