# Creating vignettes (no names)

This code is used to create all vignettes used in the study. It contains two main components. First, the information used in the vignettes is collated into a table with information on each tweet (and reply as required). Second, a script is used to create PNG image files based on this information, one for each vignette.

In [None]:
from collections import defaultdict, Counter
from random import sample, seed
import random
import glob
import pandas as pd
import numpy as np
import json

In [None]:
# Seed the global `random` generator so that every later random draw in this
# notebook is repeatable when the cells are run in order from a fresh kernel.
seed(854014850)

Hate speech will be oversampled anyway since there are many kinds of hate but only one kind for B and C. 

In [None]:
# Levels of each vignette attribute
interaction_type = ["B", "H"]  # Post type: benign or hate
cursing = ["T", "F"]  # Whether the post contains a curse word
identity = ["WM", "WF", "BM", "BF", "A"]  # Identity of the original poster
contexts = ["sport", "politics", "entertainment", "everyday", "work"]  # Topic of discussion
engagement = ["L", "H"]  # Low or high number of likes
reply = ["None", "Agree", "Disagree"]  # Reply (always posted by an anonymous user)
slurs = ['asshole', 'bitch', 'nigga', 'nigger', 'faggot', 'cracker']
# Slur categories, in order: generic, sexist, racist/reclaimed, racist, homophobic, white

# V maps a running integer id to one combination of attribute levels
V = defaultdict(dict)

counter = 0
for i in interaction_type:
    # Hateful posts are crossed with every slur; all other posts get a single
    # row whose slur field is the placeholder "NA".
    slur_options = slurs if i == "H" else ["NA"]
    for cu in cursing:
        for j in identity:
            for c in contexts:
                for e in engagement:
                    for r in reply:
                        for s in slur_options:
                            V[counter] = {
                                'interaction': i,
                                "curse": cu,
                                "identity": j,
                                "context": c,
                                "engagement": e,
                                "reply": r,
                                "slur": s,
                            }
                            counter += 1

In [None]:
# Display the number of attribute combinations generated above
len(V)

In [None]:
# One row per attribute combination; the integer keys of V become the index
df = pd.DataFrame.from_dict(V, orient = 'index')

## Identities

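Racialized names from Gaddis 2017. In this "no names" version of the notebook, every identity group is assigned the same list of two-letter initials in place of those names.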

In [None]:
# Every identity group shares the same ten initials in this nameless variant.
# Each group gets its own list object so the lists can be edited independently.
_initials = ("AJ", "JR", "DJ", "EV", "PK", "GT", "SV", "JP", "KC", "RK")
names = {group: list(_initials) for group in ("WM", "WF", "BM", "BF", "A")}

Creating usernames using a few different patterns (function created with assistance from GPT-4: https://chat.openai.com/share/361573fe-adef-454f-95a9-9e599263da02).

In [None]:
import random

def generate_username(name):
    """Return a username built from `name` using one randomly chosen pattern.

    One pattern is picked with `random.choice`; patterns that include a number
    then draw it with `random.randint`. The honorific pattern looks `name` up
    in the module-level `names` dictionary.
    """

    def with_honorific(n):
        # "Mr_" if the name is listed under WM or BM, "M_" if it is listed
        # under A (and not under WM/BM), otherwise "Ms_".
        if name in names["WM"] or name in names["BM"]:
            return "Mr_" + n
        if name not in names["A"]:
            return "Ms_" + n
        return "M_" + n

    # Order matters: random.choice indexes into this sequence.
    builders = (
        lambda n: n + str(random.randint(1, 999)),  # name followed by a number
        lambda n: n + "_" + str(random.randint(1, 999)),  # name, underscore, number
        lambda n: "TheReal" + n,
        lambda n: n + "Official",
        with_honorific,
        lambda n: n.lower() + "_" + str(random.randint(1, 99)),  # lowercase name, underscore, number
        lambda n: n + "_" + random.choice(["Love", "World", "Tweets", "Zone", "Life"]),  # name plus popular word
    )
    build = random.choice(builders)
    return build(name)

# One generated username per name, stored under the same group key and at the
# same list position as the name it was built from.
usernames = {
    group: [generate_username(member) for member in group_names]
    for group, group_names in names.items()
}

print(usernames)

Now getting images. Using same indices, each user can now be associated with a name, face, and username. These images are the final versions selected using a random sample from a larger corpus of images provided by https://generated.photos for academic research.

In [None]:
# Image paths for each identity group. Users are matched to faces by list
# index, and glob.glob returns paths in arbitrary (filesystem-dependent) order,
# so the results are sorted to make the index -> face pairing reproducible.
wf_faces = sorted(glob.glob("faces/wf/*"))
bm_faces = sorted(glob.glob("faces/bm/*"))
bf_faces = sorted(glob.glob("faces/bf/*"))
wm_faces = sorted(glob.glob("faces/wm/*"))
anon_faces = sorted(glob.glob("faces/anon/*"))

In [None]:
def getUserInfo(group, idx):
    """Return (name, username, image path) for user `idx` of `group`.

    `group` is one of "BF", "BM", "WF" or "WM"; any other value is treated as
    the anonymous group "A". The three values are read at position `idx` from
    the module-level `names`, `usernames` and face-path lists.
    """
    faces_by_group = {
        "BF": bf_faces,
        "BM": bm_faces,
        "WF": wf_faces,
        "WM": wm_faces,
    }
    if group in faces_by_group:
        key, faces = group, faces_by_group[group]
    else:
        # Fallback for "A" and for any unrecognised group label
        key, faces = "A", anon_faces
    return (names[key][idx], usernames[key][idx], faces[idx])

## Loading text templates

In [None]:
def _load_json(path):
    """Parse and return the JSON document stored at `path`.

    The file is decoded explicitly as UTF-8 so that non-ASCII characters in
    the templates are read the same way on every platform, instead of
    depending on the locale's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# One template file per discussion context. Note that the "sport" context is
# read from sports.json and the "everyday" context from interpersonal.json.
politics = _load_json('politics.json')
sports = _load_json('sports.json')
ent = _load_json('entertainment.json')
work = _load_json('work.json')
everyday = _load_json('interpersonal.json')

## Expanding dataset

In [None]:
# Expand every attribute combination in V into 10 users x 10 scenarios.
V2 = defaultdict(dict)

counter = 0
anon_ids = list(range(0, 10))  # Indices of the anonymous accounts that can reply

for v in V.values():  # For each attribute combination
    has_reply = v['reply'] != "None"
    op_is_anon = v['identity'] == "A"
    for i in range(0, 10):
        for j in range(0, 10):
            results = dict(v)
            results['user_id'] = i
            results['scenario_id'] = j

            if has_reply:
                if op_is_anon:
                    # The original poster is also anonymous, so the replier is
                    # drawn from the other nine anonymous accounts.
                    candidates = [uid for uid in anon_ids if uid != i]
                else:
                    # Otherwise any anonymous account may reply
                    candidates = anon_ids
                results['reply_id'] = random.choice(candidates)

            V2[counter] = results
            counter += 1

In [None]:
# Display the number of vignettes after expansion (100 per entry of V)
len(V2)

In [None]:
# One row per vignette; rows without a reply have no 'reply_id' value
df2 = pd.DataFrame.from_dict(V2, orient = 'index')

## Compiling final texts

In [None]:
def insertCurse(convo, curse):
    """Replace every 'ADJ' placeholder in convo['statement'] with `curse`.

    The dictionary is modified in place and also returned.
    """
    with_curse = convo['statement'].replace('ADJ', curse)
    convo['statement'] = with_curse
    return convo

def removeCurse(convo):
    """Delete every 'ADJ ' placeholder from convo['statement'].

    The space following the placeholder is removed with it so no double space
    is left behind. The dictionary is modified in place and also returned.
    """
    without_curse = convo['statement'].replace('ADJ ', "")
    convo['statement'] = without_curse
    return convo

def insertSlur(convo, slur):
    """Replace the 'BAD' placeholder in convo['add_negative'] with `slur`.

    Two adjustments are made, both based on the first occurrence of 'BAD':
    an 'e' is appended to a slur ending in 'h' when the placeholder is
    directly followed by 's' (so the plural reads "...hes"), and the slur is
    capitalized when the placeholder is the very first thing in the text.
    The dictionary is modified in place and also returned.
    """
    template = convo['add_negative']

    position = template.find("BAD")
    if position >= 0:  # Placeholder is present
        # Character directly after the placeholder ('' if it ends the text)
        following = template[position + 3:position + 4]

        # Plural of a slur ending in 'h' needs "es" rather than "s"
        if following == 's' and slur[-1] == 'h':
            slur += 'e'

        # Capitalize when the placeholder opens the text
        if position == 0:
            slur = slur.capitalize()

    # Every occurrence of the placeholder receives the same adjusted slur
    convo['add_negative'] = template.replace("BAD", slur)

    return convo

def processTexts(interaction, context, context_id, reply, slur, curse):
    """Build the post text and reply text for one vignette.

    Parameters:
    interaction (str): "H" for a hateful post; any other value uses the neutral ending
    context (str): one of "politics", "sport", "entertainment", "everyday", "work"
    context_id (int): index of the conversation template within the chosen context
    reply (str): "None" for no reply, "Agree" for the endorsing reply;
        any other value selects the disagreeing reply
    slur (str): slur inserted into the negative ending when interaction is "H"
    curse (str): "T" to insert the curse word; any other value removes the placeholder

    Returns:
    tuple: (statement, reply_text), where reply_text is "" when reply is "None"

    Raises:
    ValueError: if `context` is not one of the known contexts
    """
    # Step 1: Pick the conversation template for the requested context
    templates_by_context = {
        "politics": politics,
        "sport": sports,
        "entertainment": ent,
        "everyday": everyday,
        "work": work,
    }
    if context not in templates_by_context:
        # Fail with a clear message instead of an unbound-variable error later on
        raise ValueError(
            "Unknown context %r; expected one of %s"
            % (context, sorted(templates_by_context)))
    # Shallow copy so the placeholder substitutions below leave the loaded
    # template untouched for the next vignette.
    text = templates_by_context[context]['conversations'][context_id].copy()

    # Step 2: Add curse or remove placeholder
    if curse == "T":
        text = insertCurse(text, "fucking")
    else:
        text = removeCurse(text)

    # Step 3: Add slur if hateful and combine the two parts of the statement
    if interaction == "H":
        text = insertSlur(text, slur)
        statement = text['statement'] + " " + text['add_negative']
    else:
        statement = text['statement'] + " " + text['add_neutral']

    # Step 4: Return with the reply text that matches the reply type
    if reply == "None":
        return (statement, "")
    if reply == "Agree":
        return (statement, text['endorse'])
    return (statement, text['disagree'])

# Constructing posts

Using modified code from https://github.com/TiagoVentura/conjoints_tweets/tree/main

The code has been adapted to generate posts in the required format and to include color emojis.

In [None]:
from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
from textwrap import wrap
import os
import re
import datetime
from numpy import asarray
import warnings
warnings.filterwarnings('ignore')
from pilmoji import Pilmoji
import pilmoji
import emoji

# %%
# font

font_regular_file = '/Library/Fonts/Arial.ttf'
font_bold_file = '/Library/Fonts/Arial.ttf'
# NOTE(review): both paths point at the same file, so "bold" text is rendered
# with the regular face — confirm whether a bold font file was intended.

# TODO: Still imperfect. Need to figure out how to render emojis AND other text

# For a .ttc file an explicit face index is passed: 0 when loading from the
# regular file and 1 when loading from the bold file. For any other font file
# no index is passed.
if font_regular_file.endswith('.ttc'):
    _regular_face = {'index': 0}
    _bold_face = {'index': 1}
else:
    _regular_face = {}
    _bold_face = {}

# Main tweet
font_author_name = ImageFont.truetype(font_bold_file, size=35, **_bold_face)
font_author_tag = ImageFont.truetype(font_regular_file, size=25, **_regular_face)
font_text = ImageFont.truetype(font_regular_file, size=35, **_regular_face)
font_time_date = ImageFont.truetype(font_regular_file, size=24, **_regular_face)
font_reaction_regular = ImageFont.truetype(font_regular_file, size=30, **_regular_face)
font_reaction_bold = ImageFont.truetype(font_bold_file, size=30, **_bold_face)
# Quoted tweet
font_quote_author_name = ImageFont.truetype(font_bold_file, size=27, **_bold_face)
font_quote_author_tag = ImageFont.truetype(font_regular_file, size=27, **_regular_face)
font_quote_text = ImageFont.truetype(font_regular_file, size=25, **_regular_face)
# Reply
font_reply_author_name = ImageFont.truetype(font_bold_file, size=30, **_bold_face)
font_reply_author_tag = ImageFont.truetype(font_regular_file, size=30, **_regular_face)
font_reply_text = ImageFont.truetype(font_regular_file, size=30, **_regular_face)

# %%
# Global layout parameters used by CreateTweet.
#
# Positions are (x, y) offsets in pixels. The author positions are used as-is
# (the header is pasted at the top of the canvas); all other positions have
# the running vertical offset of their section added to y before drawing.
author_avatar_position = (35, 40)
author_name_position = (170, 40)
author_tag_position = (170, 95)
quote_author_avatar_position = (50, 40)
quote_author_name_position = (120, 45)
reply_author_avatar_position = (25, 30)
reply_author_name_position = (130, 30)
time_date_position = (35, 10)
reaction_retweet_position = (40, 80)  # Named for retweets; CreateTweet draws this count with the label "Likes"
reaction_quote_position = (245, 80)  # Quote Tweets (not referenced by CreateTweet)
reaction_like_position = (525, 80)  # Likes (not referenced by CreateTweet)
text_position = (35, 0)
quote_text_position = (55, 0)
# Added to y after the reply header, moving the reply text up by 30 px
reply_text_position_y_adjust = -30
reply_text_position = (130, 0)
# Measurements: section heights and per-line heights in pixels. They are used
# both to size the canvas and to advance the running vertical offset.
header_height = 170
footer_height = 220
text_line_height = 45
quote_header_height = 100
quote_footer_height = 35
quote_text_line_height = 35
reply_header_height = 100
# reply_footer_height = 69
reply_text_line_height = 35
# Background images, loaded once at import time and converted to RGB so they
# can be pasted onto the RGB canvas.
header = Image.open('input/twitter_module/header.png').convert('RGB')
footer = Image.open('input/twitter_module/footer.png').convert('RGB')
quote_header = Image.open('input/twitter_module/quote_header.png').convert('RGB')
quote_background = Image.open('input/twitter_module/quote_background.png').convert('RGB')
quote_footer = Image.open('input/twitter_module/quote_footer.png').convert('RGB')
reply_header = Image.open('input/twitter_module/reply_header.png').convert('RGB')
# reply_footer = Image.open('input/twitter_module/reply_footer.png').convert('RGB')


# %%
# function to create tweet

def CreateTweet(
        author_avatar: str = "input/avatar/woman_clean.png",
        author_name: str = "User1",
        author_tag: str = "@user1",
        text: str = "Content of tweet.",
        reactions_retweet: str = "100",
        reactions_quote: str = "200",
        reactions_like: str = "20K",
        time: str = None,
        quote: bool = False,
        quote_author_avatar: str = "input/avatar/woman_clean.png",
        quote_author_name: str = "User2",
        quote_author_tag: str = "@user2",
        quote_text: str = "Content of quoted tweet.",
        reply: bool = False,
        reply_author_avatar: str = "input/avatar/woman_clean.png",
        reply_author_name: str = "User3",
        reply_author_tag: str = "@user3",
        reply_text: str = "Content of reply."
        ):
    """
    Render a tweet (optionally with a quoted tweet and/or a reply) as an image.

    The image is assembled top to bottom: header with avatar, name and handle,
    the wrapped tweet text, the optional quoted tweet, the footer with the
    timestamp and a single reaction count, and finally the optional reply.
    Layout constants, fonts and background images come from module-level
    globals.

    Parameters:
    author_avatar (str): path to the avatar image of the author
    author_name (str): display name of the author
    author_tag (str): handle of the author, drawn exactly as given
    text (str): main text of the tweet
    reactions_retweet (str): count drawn in the footer, followed by the label "Likes"
    reactions_quote (str): accepted for compatibility; not drawn
    reactions_like (str): accepted for compatibility; not drawn
    time (str/None): time of tweet in format "2022-07-05 14:34"; if None use current time
    quote (True/False): whether or not to draw the quoted tweet
    quote_author_avatar (str): path to the avatar image of the quoted author
    quote_author_name (str): display name of the quoted author
    quote_author_tag (str): handle of the quoted author, drawn exactly as given
    quote_text (str): text of the quoted tweet
    reply (True/False): whether or not to draw the reply
    reply_author_avatar (str): path to the avatar image of the reply author
    reply_author_name (str): display name of the reply author
    reply_author_tag (str): handle of the reply author, drawn exactly as given
    reply_text (str): text of the reply

    Returns:
    image: tweet image in PIL format
    """

    def shifted(position, dx, dy):
        # Translate an (x, y) layout position by (dx, dy)
        return (position[0] + dx, position[1] + dy)

    def load_avatar(path, size):
        # Resize inside a context manager so the image file is closed promptly;
        # this function may be called many thousands of times in one run.
        with Image.open(path) as avatar_file:
            return avatar_file.resize(size)

    # Wrap each text to its column width; the line counts set the canvas height
    text_string_lines = wrap(text, 54)
    quote_text_string_lines = wrap(quote_text, 77)
    reply_text_string_lines = wrap(reply_text, 70)
    height = header_height + footer_height + text_line_height * len(text_string_lines)
    if quote:
        height += quote_header_height + quote_footer_height + quote_text_line_height * len(quote_text_string_lines)
    if reply:
        height += reply_header_height + reply_text_line_height * len(reply_text_string_lines)
    # White canvas (255 is the maximum value of an 8-bit channel)
    img = Image.new(mode="RGB", size=(1050, height), color=(255, 255, 255))

    # Header: background, avatar, name and handle
    img.paste(im=header, box=(0, 0))
    img.paste(im=load_avatar(author_avatar, (100, 100)), box=author_avatar_position)
    draw = ImageDraw.Draw(img)
    draw.text(xy=author_name_position, text=author_name,
              font=font_author_name, fill=(0, 0, 0))
    draw.text(xy=author_tag_position, text=author_tag,
              font=font_author_tag, fill="#667786")

    # Main text, one wrapped line at a time. y tracks the top of the next
    # element. A single emoji-aware renderer is shared by all lines.
    y = header_height
    with Pilmoji(img) as emoji_renderer:
        for line in text_string_lines:
            emoji_renderer.text(
                xy=shifted(text_position, 0, y),
                text=line, font=font_text, fill=(0, 0, 0))
            y += text_line_height

    if quote:
        # Quote header: background, avatar, name and handle
        img.paste(im=quote_header, box=(0, y))
        img.paste(
            im=load_avatar(quote_author_avatar, (50, 50)),
            box=shifted(quote_author_avatar_position, 0, y))
        draw.text(
            xy=shifted(quote_author_name_position, 0, y),
            text=quote_author_name, font=font_quote_author_name, fill=(0, 0, 0))
        # The handle starts 10 px to the right of the rendered name
        quote_author_name_width = draw.textlength(
            text=quote_author_name,
            font=font_quote_author_name)
        draw.text(
            xy=shifted(quote_author_name_position, 10 + quote_author_name_width, y),
            text=quote_author_tag, font=font_quote_author_tag, fill="#667786")
        # Quote text: paste a background strip behind each line, then draw it
        y += quote_header_height
        for line in quote_text_string_lines:
            img.paste(im=quote_background, box=(0, y-2))
            draw.text(
                xy=shifted(quote_text_position, 0, y),
                text=line, font=font_quote_text, fill=(0, 0, 0))
            y += quote_text_line_height
        img.paste(im=quote_footer, box=(0, y-2))
        y += quote_footer_height

    # Footer: timestamp and reaction count
    img.paste(im=footer, box=(0, y))
    if time is None:
        # Use the current time if no time is given
        time = datetime.datetime.now()
    else:
        # Parse time according to "2022-07-05 14:34" format
        time = datetime.datetime.strptime(time, "%Y-%m-%d %H:%M")
    # Format as e.g. "2:34 PM" and "Jul 5, 2022". The leading zero is stripped
    # manually because the "%-I" / "%-d" codes are not available on every platform.
    tweet_time = time.strftime("%I:%M %p").lstrip("0")
    tweet_date = f"{time.strftime('%b')} {time.day}, {time.year}"
    time_date_text = tweet_time + " · " + tweet_date + " " + " "
    draw.text(xy=shifted(time_date_position, 0, y),
              text=time_date_text, font=font_time_date, fill="#667786")
    # Reaction count, with the "Likes" label 10 px to the right of the number
    draw.text(xy=shifted(reaction_retweet_position, 0, y),
              text=reactions_retweet, font=font_reaction_bold, fill=(0, 0, 0))
    x_reactions = draw.textlength(text=reactions_retweet,
                                  font=font_reaction_bold) + 10
    draw.text(xy=shifted(reaction_retweet_position, x_reactions, y),
              text="Likes", font=font_reaction_regular, fill="#667786")

    if reply:
        y += footer_height
        # Reply header: background, avatar, name and handle
        img.paste(im=reply_header, box=(0, y))
        img.paste(
            im=load_avatar(reply_author_avatar, (90, 90)),
            box=shifted(reply_author_avatar_position, 0, y))
        draw.text(
            xy=shifted(reply_author_name_position, 0, y),
            text=reply_author_name, font=font_reply_author_name, fill=(0, 0, 0))
        # The handle starts 10 px to the right of the rendered name
        reply_author_name_width = draw.textlength(
            text=reply_author_name,
            font=font_reply_author_name)
        draw.text(
            xy=shifted(reply_author_name_position, 10 + reply_author_name_width, y),
            text=reply_author_tag, font=font_reply_author_tag, fill="#667786")
        # Reply text, one wrapped line at a time; the adjustment shifts the
        # first line relative to the bottom of the reply header
        y += reply_header_height
        y += reply_text_position_y_adjust
        with Pilmoji(img) as emoji_renderer:
            for line in reply_text_string_lines:
                emoji_renderer.text(
                    xy=shifted(reply_text_position, 0, y),
                    text=line, font=font_reply_text, fill=(0, 0, 0))
                y += reply_text_line_height
        # img.paste(im=reply_footer, box=(0, y-2))
        # y += reply_footer_height

    return img

# %%
# function to save tweet image (easier to call in R)

def SaveTweet(image, path, quality=95):
    """
    Write a tweet image to disk.

    Parameters:
    image (PIL Image): image of tweet
    path (str): destination path, should end with .png, .jpg, or .pdf
    quality (int): forwarded to the image's save method as `quality`,
        on a scale from 0 (worst) to 95 (best)

    Returns:
    None
    """
    save_options = {"quality": quality}
    image.save(path, **save_options)

# %%
# function to show image in R

def Convert(image_PIL):
    """
    Turn a PIL image into a float32 array with every value divided by 255.

    Parameters:
    image_PIL (PIL Image): image of tweet

    Returns:
    numpy float32 array; 8-bit channel values 0-255 map to 0.0-1.0
    """
    as_float = asarray(image_PIL).astype('float32')
    return as_float / 255.0

In [None]:
# Helper function to create random date strings for the tweets

import random
from datetime import timedelta

def random_date_string(start_date_str, end_date_str):
    """Return a random timestamp string between two dates (both inclusive).

    Both arguments use the format 'YYYY-MM-DD'. The day is drawn uniformly
    from the range, and the time of day is drawn between 08:00 and 23:59.
    The result is formatted as 'YYYY-MM-DD HH:MM'.
    """
    day_format = '%Y-%m-%d'
    first_day = datetime.datetime.strptime(start_date_str, day_format)
    last_day = datetime.datetime.strptime(end_date_str, day_format)

    # Three draws, in this order: day offset, hour, minute
    day_offset = random.randint(0, (last_day - first_day).days)
    hour = random.randint(8, 23)
    minute = random.randint(0, 59)

    # first_day is at midnight, so only hour and minute need to be filled in
    chosen_day = first_day + datetime.timedelta(days=day_offset)
    stamp = chosen_day.replace(hour=hour, minute=minute)

    return stamp.strftime('%Y-%m-%d %H:%M')

# Date window for the generated timestamps. These two names are also read
# later by the vignette-generation loop, so they must stay defined.
start_date_str = '2023-11-01'
end_date_str = '2024-05-01'

# Example call. Note that it draws from the global random generator, so running
# or skipping this cell changes the random values produced by later cells.
random_date = random_date_string(start_date_str, end_date_str)
print(random_date)

In [None]:
%%time

# Making files and storing additional info in the dictionary

for i, v in V2.items():

    if i % 1000 == 0:
        print(i)  # Progress report every 1k rows

    # Look up the original poster and build the post (and reply) text
    name, username, img = getUserInfo(v['identity'], v['user_id'])
    texts = processTexts(v['interaction'], v['context'], v['scenario_id'],
                         v['reply'], v['slur'], v['curse'])

    # `v` is the dictionary stored at V2[i], so these writes update V2 in place
    v['op_name'] = name
    v['op_username'] = username
    v['op_img'] = img
    v['op_text'] = texts[0]

    # High engagement: 25-49 likes; low engagement: 0-5 likes
    if v['engagement'] == "H":
        likes = random.sample(range(25, 50), 1)[0]
    else:
        likes = random.sample([0, 1, 2, 3, 4, 5], 1)[0]
    v['engagement_val'] = likes

    dt = random_date_string(start_date_str, end_date_str)
    v['date'] = dt

    # Arguments shared by posts with and without a reply
    tweet_args = dict(
        author_avatar=img,
        author_name=name,
        author_tag="@" + username,
        text=texts[0],
        reactions_retweet=str(likes),
        time=dt,
    )

    if v['reply'] != "None":
        # Replies always come from an anonymous account
        rname, rusername, rimg = getUserInfo('A', int(v['reply_id']))

        v['r_name'] = rname
        v['r_username'] = rusername
        v['r_img'] = rimg
        v['r_text'] = texts[1]

        tweet_args.update(
            reply=True,
            reply_author_avatar=rimg,
            reply_author_name=rname,
            # Handle is prefixed with "@" like the original poster's handle
            reply_author_tag="@" + rusername,
            reply_text=texts[1],
        )

    tweet = CreateTweet(**tweet_args)

    # File numbering starts at 1 rather than 0. A separate variable is used so
    # that `i` keeps pointing at the current row: V2 is a defaultdict, and
    # indexing it with an incremented key would write the filename to the wrong
    # row and add a new key while the dictionary is being iterated.
    file_no = i + 1

    SaveTweet(tweet, "../synthetic_posts/nameless/tweet" + str(file_no) + ".png", quality = 95)
    v['filename'] = "output/tweet" + str(file_no) + ".png" # Note: This was directory during analysis but it has been modified to above for replication

In [None]:
# Display the value left in the loop variable `i` by the generation loop
i

In [None]:
# One row per vignette, now including the fields added by the generation loop
df3 = pd.DataFrame.from_dict(V2, orient = 'index')

In [None]:
# Saving a table with info on each post.
# NOTE(review): no `sep` is passed, so the file is written comma-separated
# despite the .tsv extension — confirm how downstream code reads it before
# changing either the separator or the filename.
df3.to_csv("post-information-nameless.tsv") # Saving a table with info on each post

In [None]:
# Verifying all files are present in case any errors during post creation.
# The posts are saved as ../synthetic_posts/nameless/tweet<N>.png, so the
# directory to list is the folder itself ("tweet" is only the filename prefix).
directory_path = '../synthetic_posts/nameless'

# Generate a set of expected filenames: one per vignette in V2, numbered from 1
expected_files = {f"tweet{n}.png" for n in range(1, len(V2) + 1)}

# Get a set of actual filenames in the directory
actual_files = set(os.listdir(directory_path))

# Identify any missing files
missing_files = expected_files - actual_files

# Check if there are any missing files and print them
if missing_files:
    print(f"Missing files: {sorted(missing_files)}")
else:
    print("No missing files")

In [None]:
# Display the number of entries found in the output directory
len(actual_files)