Filter out posts that are generally irrelevant for this application

In [5]:
import pandas as pd

def create_base_dataframe(file_path):
    """Load the raw posts CSV and build the cleaned base DataFrame.

    Processing steps, in order:

    1. Drop rows whose ``hash`` is missing.
    2. Keep only rows whose ``hash`` occurs more than once in the file.
    3. Parse ``timestamp`` as seconds since the Unix epoch.
    4. Normalise ``party`` labels to snake_case identifiers; any label that
       is missing or not in the mapping becomes ``'unknown'``.
    5. Expand ``platform`` short codes to display names; codes that are not
       in the mapping become NaN.

    Parameters
    ----------
    file_path : str or path-like
        CSV file that contains at least the columns ``hash``, ``timestamp``,
        ``party`` and ``platform``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. It carries an extra ``index`` column holding the
        row labels the rows had before step 1, and its own row labels are the
        (non-contiguous) positions after step 1.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the CSV.
    """
    df = pd.read_csv(file_path)
    # reset_index() without drop=True keeps the pre-dropna row labels as an
    # 'index' column; that column is part of the returned schema.
    df = df.dropna(subset=["hash"]).reset_index()

    # Drop rows where the image only appears once in the entire dataset
    hash_counts = df['hash'].value_counts()
    repeated_hashes = hash_counts[hash_counts > 1].index
    # Take an explicit copy: the column assignments below would otherwise be
    # made on a slice of the unfiltered frame (SettingWithCopyWarning).
    df = df[df['hash'].isin(repeated_hashes)].copy()

    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")

    party_mapping = {
        'DIE LINKE': 'die_linke',
        'FDP': 'fdp',
        'DIE GRÜNEN': 'die_gruenen',
        'SPD': 'spd',
        'AFD': 'afd',
        'CDU/CSU': 'cdu_csu'
    }
    # Unmapped or missing party labels come out of map() as NaN.
    df['party'] = df['party'].map(party_mapping).fillna('unknown')

    platform_mapping = {
        'fb': 'Facebook',
        'ig': 'Instagram',
        'tw': 'Twitter'
    }
    # No fallback here: platform codes outside the mapping stay NaN.
    df['platform'] = df['platform'].map(platform_mapping)
    return df

# Build the base dataset from the raw export and persist it for the later cells.
df = create_base_dataframe('../data/original_posts.csv')
# The row index is written too; it comes back as an 'Unnamed: 0' column
# when this file is read again with pd.read_csv.
df.to_csv('../data/base_posts.csv')
print(len(df))  # number of rows kept
df.head()

21546


Unnamed: 0,index,user_id,id_user_post,name,timestamp,platform,img_id,hash,party
27,27,_andreaschwarz_,ig__andreaschwarz__CUE9BI8K46c,Andrea Schwarz,2021-09-21 10:43:30,Instagram,ig__andreaschwarz__CUE9BI8K46c,33430bfc74ab9c8c,unknown
79,79,_germanzero,ig__germanzero_CT35PD_qrBs,GermanZero,2021-09-16 09:00:23,Instagram,ig__germanzero_CT35PD_qrBs,074e38788687fc7c,unknown
90,90,_germanzero,ig__germanzero_CTFXm93KHDx,GermanZero,2021-08-27 18:04:27,Instagram,ig__germanzero_CTFXm93KHDx_1,edb10b0f9570f449,unknown
96,96,_germanzero,ig__germanzero_CTObUrhq6yX,GermanZero,2021-08-31 06:30:05,Instagram,ig__germanzero_CTObUrhq6yX_1,230ddcf31c1a73b4,unknown
104,104,_germanzero,ig__germanzero_CTh4QP8q8G6,GermanZero,2021-09-07 19:48:27,Instagram,ig__germanzero_CTh4QP8q8G6_3,937c6c83e4131ee6,unknown


Filter out posts from accounts that have no party assigned.

In [6]:
def df_with_parties(file_path):
    """Read a posts CSV and keep only the rows with a known party.

    Rows whose ``party`` equals ``'unknown'`` are removed. The surviving rows
    are renumbered from 0, and their previous row labels are kept as an
    additional leading column (``reset_index`` without ``drop``).

    Parameters
    ----------
    file_path : str or path-like
        CSV file with a ``party`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered, re-indexed frame.
    """
    posts = pd.read_csv(file_path)
    has_party = posts['party'].ne('unknown')
    return posts.loc[has_party].reset_index()

# Restrict the base dataset to posts with a known party and persist the result.
df = df_with_parties('../data/base_posts.csv')
df.to_csv('../data/posts_with_party.csv')
print(len(df))  # number of rows kept
df.head()

5756


Unnamed: 0.1,level_0,Unnamed: 0,index,user_id,id_user_post,name,timestamp,platform,img_id,hash,party
0,14,243,243,100021691363,fb_100021691363_10157840358841364,Sören Link,2021-08-16 16:05:50,Facebook,fb_100021691363_10157840358841364,87d5b928361ca35b,die_linke
1,136,507,507,100058135027750,fb_100058135027750_260335295914303,Wiebke Knell - FDP,2021-08-20 06:33:11,Facebook,fb_100058135027750_260335295914303,d79303dc92d6c994,fdp
2,137,508,508,100058135027750,fb_100058135027750_263081198973046,Wiebke Knell - FDP,2021-08-24 19:04:43,Facebook,fb_100058135027750_263081198973046,0f931b955567d4c4,fdp
3,164,569,569,100063616410667,fb_100063616410667_229064975890715,Das Grüne Netzwerk,2021-08-20 07:43:47,Facebook,fb_100063616410667_229064975890715,bfba838c25cc634c,die_gruenen
4,165,570,570,100063616410667,fb_100063616410667_236140568516489,Das Grüne Netzwerk,2021-08-30 09:03:33,Facebook,fb_100063616410667_236140568516489,3ded252303e6e3a4,die_gruenen


Drop rows whose hash appears on only one platform in the entire dataset.

In [7]:
def df_for_cross_platform_images(file_path):
    """Read a posts CSV and keep only images seen on more than one platform.

    For every ``hash`` the number of distinct ``platform`` values is counted;
    rows whose hash has a count of one or less are removed. Row labels of the
    surviving rows are left as read from the file.

    Parameters
    ----------
    file_path : str or path-like
        CSV file with ``hash`` and ``platform`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows whose hash occurs on at least two distinct platforms.
    """
    posts = pd.read_csv(file_path)
    platforms_per_hash = posts.groupby('hash')['platform'].nunique()
    cross_platform_hashes = platforms_per_hash.index[platforms_per_hash.gt(1)]
    return posts.loc[posts['hash'].isin(cross_platform_hashes)]
    
# Restrict the base dataset to images shared across platforms and persist the result.
df = df_for_cross_platform_images('../data/base_posts.csv')
df.to_csv('../data/cross_platform_posts.csv')
print(len(df))  # number of rows kept
df.head()

9310


Unnamed: 0.1,Unnamed: 0,index,user_id,id_user_post,name,timestamp,platform,img_id,hash,party
0,27,27,_andreaschwarz_,ig__andreaschwarz__CUE9BI8K46c,Andrea Schwarz,2021-09-21 10:43:30,Instagram,ig__andreaschwarz__CUE9BI8K46c,33430bfc74ab9c8c,unknown
1,79,79,_germanzero,ig__germanzero_CT35PD_qrBs,GermanZero,2021-09-16 09:00:23,Instagram,ig__germanzero_CT35PD_qrBs,074e38788687fc7c,unknown
2,90,90,_germanzero,ig__germanzero_CTFXm93KHDx,GermanZero,2021-08-27 18:04:27,Instagram,ig__germanzero_CTFXm93KHDx_1,edb10b0f9570f449,unknown
3,96,96,_germanzero,ig__germanzero_CTObUrhq6yX,GermanZero,2021-08-31 06:30:05,Instagram,ig__germanzero_CTObUrhq6yX_1,230ddcf31c1a73b4,unknown
4,104,104,_germanzero,ig__germanzero_CTh4QP8q8G6,GermanZero,2021-09-07 19:48:27,Instagram,ig__germanzero_CTh4QP8q8G6_3,937c6c83e4131ee6,unknown
