## __Image Analysis__ 

In [2]:
import pandas as pd
from pyaesthetics import analysis_1 as analysis
from tqdm import tqdm
import os
from concurrent.futures import ProcessPoolExecutor, as_completed

Thank you for using pyaesthetics. If you use it in your work, please cite:
Gabrieli, G., Bornstein, M. H., Setoh, P., & Esposito, G. (2023). Machine learning estimation of users’ implicit and explicit aesthetic judgments of web-pages. Behaviour & Information Technology, 42(4), 392-402.


In [None]:
# Define the function to perform analysis on a batch of images and update the results
def analyze_batch(batch):
    result_list = []
    for index, row in batch.iterrows():
        file_location = row['file_location']
        
        # Perform the image analysis
        analysis_result = analysis.analyze_image(file_location)  # Assuming this returns a dictionary
        
        # Flatten the dictionary if there are nested keys (subkeys)
        flattened_result = flatten_dict(analysis_result)
        # Add the index to track the row for later assignment
        flattened_result['index'] = index
        
        result_list.append(flattened_result)
    
    return result_list

# Function to flatten nested dictionary
def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# Function to process the data in batches of 100 and append the results to a single CSV file
def process_images_in_batches(csv_file, output_file="FinalImageAnalysisResults.csv"):
    # Load the CSV file
    df = pd.read_csv(csv_file)
    
    home_path = "C:/Users/txtbn"
    df["file_location"] = home_path + df["file_location"].astype(str)
    
    # Split the DataFrame into batches of 100 rows
    batch_size = 100
    batches = [df[i:i+batch_size] for i in range(0, len(df), batch_size)]
    
    # Initialize the process pool and the list to hold final results
    with concurrent.futures.ProcessPoolExecutor() as executor:
        future_to_batch = {executor.submit(analyze_batch, batch): batch for batch in batches}
        
        # For each batch, collect results and append to the CSV file
        for future in concurrent.futures.as_completed(future_to_batch):
            result_list = future.result()
            result_df = pd.DataFrame(result_list)
            
            # Drop the 'index' column (if it exists) before saving to the CSV file
            if 'index' in result_df.columns:
                result_df = result_df.drop(columns=['index'])
            
            # Append the batch result to the CSV file
            result_df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

# Call the function to process the images and append results to the same CSV file
process_images_in_batches("FinalImageData.csv")

In [2]:
df = pd.read_csv("FinalImageDataOutputTotal.csv")
df.dropna(how = "all", axis = 1, inplace = True)
df.columns
df.to_csv("FinalImageDataOutput.csv", encoding = "utf-8")

  df = pd.read_csv("FinalImageDataOutputTotal.csv")


## __Control Variables__

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm

# Load sentiment model and tokenizer (Example: 'cardiffnlp/twitter-roberta-base-sentiment')
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Ensure model is on GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

In [8]:
# Load dataset
df = pd.read_csv("FinalImageDataOutputTotal.csv")

df["caption"].fillna(" ", inplace = True)

# Sentiment analysis function
def sentiment(text):
    if not isinstance(text, str) or text.strip() == "":
        return pd.Series([0, 0, 0], index=["negative", "neutral", "positive"])
    
    encoded_input = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    
    with torch.no_grad():
        output = model(**encoded_input)
    scores = output.logits.detach().cpu().numpy()[0]
    scores = softmax(scores)

    return pd.Series(scores, index=["negative", "neutral", "positive"])

# Apply sentiment analysis with progress bar
tqdm.pandas()
df[["negative", "neutral", "positive"]] = df["caption"].progress_apply(sentiment)

  df = pd.read_csv("FinalImageDataOutputTotal.csv")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["caption"].fillna(" ", inplace = True)
100%|██████████| 119822/119822 [21:48<00:00, 91.60it/s] 


In [None]:
import numpy as np
import emoji
import string
import re

count_emojis = lambda text: emoji.emoji_count(text) if isinstance(text, str) else 0
count_words = lambda text: sum(w.strip(string.punctuation).isalpha() for w in text.split()) if isinstance(text, str) else 0
count_hashtags = lambda text: len(re.findall(r"#", text)) if isinstance(text, str) else 0
count_mentions = lambda text: len(re.findall(r"@", text)) if isinstance(text, str) else 0

df["hashtag_count"] = df["caption"].apply(count_hashtags)
df["mention_count"] = df["caption"].apply(count_mentions)
df["emoji"] = df["caption"].apply(count_emojis)
df["length"] = df["caption"].apply(count_words)
df.drop("FacesCv2", axis=1, inplace=True)
df.dropna(how = "all", axis = 1, inplace = True)
df1 = df.dropna(subset = ['Colorfulness_HSV_Colorfulness_HSV', 'Colorfulness_HSV_Mean_H',
       'Colorfulness_HSV_Mean_S', 'Colorfulness_HSV_Mean_V',
       'Colorfulness_HSV_circular_mean_hue',
       'Colorfulness_HSV_circular_std_hue', 'Colorfulness_HSV_color_variety',
       'Colorfulness_HSV_std_H', 'Colorfulness_HSV_std_S',
       'Colorfulness_HSV_std_V', 'Colorfulness_RGB_Colorfulness_RGB',
       'Colorfulness_RGB_Mean_B', 'Colorfulness_RGB_Mean_G',
       'Colorfulness_RGB_Mean_R', 'Colorfulness_RGB_std_B',
       'Colorfulness_RGB_std_G', 'Colorfulness_RGB_std_R', 'Colors_Aqua',
       'Colors_Black', 'Colors_Blue', 'Colors_Fuchsia', 'Colors_Gray',
       'Colors_Green', 'Colors_Lime', 'Colors_Maroon', 'Colors_Navy',
       'Colors_Olive', 'Colors_Purple', 'Colors_Red', 'Colors_Silver',
       'Colors_Teal', 'Colors_White', 'Colors_Yellow'])

In [None]:
# List of columns to sort alphabetically
columns_to_sort = [
    'Symmetry_QTD', 'shape_n_line_hor', 'sharpness_sharp_laplacian', 'Colors_Lime',
    'Colorfulness_RGB_Mean_R', 'Colorfulness_HSV_Mean_V', 'Colorfulness_HSV_std_V', 'Colors_White',
    'Colors_Gray', 'VC_weight', 'Colorfulness_HSV_circular_std_hue', 'Colorfulness_RGB_Mean_G',
    'contrast_rms', 'Colors_Red', 'brightness_BT601', 'Colors_Yellow', 'texture_directionality',
    'Colorfulness_RGB_std_G', 'Colorfulness_HSV_circular_mean_hue', 'Colorfulness_RGB_Colorfulness_RGB',
    'height', 'selfSimilarity_ground', 'Colors_Teal', 'Colors_Silver', 'Colorfulness_HSV_std_S',
    'shape_n_line_slant', 'Colors_Blue', 'Colors_Black', 'Colorfulness_HSV_Colorfulness_HSV',
    'Colorfulness_HSV_Mean_H', 'Colors_Green', 'Colors_Purple', 'Colorfulness_RGB_std_R',
    'Colorfulness_HSV_Mean_S', 'VC_quadTree', 'VC_gradient', 'selfSimilarity_anisotropy',
    'Colors_Aqua', 'linesRatio', 'selfSimilarity_parent', 'shape_n_line', 'Colors_Navy',
    'selfSimilarity_neighbors', 'contrast_michelson', 'Colors_Olive', 'saturation',
    'texture_coarseness', 'Colorfulness_RGB_std_B', 'brightness_BT709', 'Colorfulness_RGB_Mean_B',
    'Colors_Fuchsia', 'width', 'Colorfulness_HSV_std_H', 'texture_contrast',
    'Colorfulness_HSV_color_variety', 'shape_n_line_ver', 'Colors_Maroon',
    'object_count', 'Number_of_Faces_Cv2'
]

# Sort the specified columns alphabetically
df[columns_to_sort] = df[columns_to_sort].reindex(sorted(columns_to_sort), axis=1)


In [4]:
df.to_csv("FinalImageDataAnalysis.csv", encoding = "utf-8", index = False)

## Getting Follower Counts

In [None]:
import tqdm
import json
import pandas as pd

df = pd.read_csv("FinalImageDataAnalysis.csv")

def reshape_company_profiles(df):
    return df.pivot_table(index='company', columns='platform', values='profile', aggfunc='first').reset_index()

df1 = reshape_company_profiles(df)

df1["facebook_link"] = df1["facebook"].apply(lambda profile: f"https://www.facebook.com/{profile}/")
df1["twitter_link"] = df1["twitter"].apply(lambda profile: f"https://twitter.com/{profile}")

# Load the JSON file
print("Loading JSON File")
with open("twitter_scrape_output.json", "r", encoding="utf-8") as file:
    tweets_data = json.load(file)

# Example DataFrame with Twitter usernames (assuming df1 is already defined)
# df1 = pd.DataFrame({'profile': ['UnitedHealthGrp', 'AnotherUser']})

# Create a dictionary to store the follower counts
follower_counts = {}

# Create a set to keep track of processed usernames
processed_usernames = set()

print("Tweets")
# Loop through each tweet in the JSON data with tqdm for progress visualization
for tweet in tqdm.tqdm(tweets_data, desc="Processing tweets", unit="tweet"):
    # Extract author information with error handling in case some data is missing
    author = tweet.get('author', {})
    username = author.get('userName')
    
    # Skip if username is not present or if it has already been processed
    if not username or username in processed_usernames:
        continue
    
    # Check if the author's username is in the DataFrame
    if username in df1['twitter'].values:
        # Get the follower count for this author, with a default if the key is missing
        follower_count = author.get('followers', None)
        if follower_count is not None:
            # Add to dictionary and mark username as processed
            follower_counts[username] = follower_count
            processed_usernames.add(username)

# Add the follower count to the DataFrame, matching by the username
df1['follower_count'] = df1['twitter'].map(follower_counts)

Loading JSON File


In [None]:
df1 = pd.read_csv("company_profiles.csv")

facebook_followers = pd.read_csv("facebook_followers.csv")
instagram_followers = pd.read_csv("instagram_followers.csv")

facebook_followers.rename(columns = {"pageName": "facebook", "followers": "facebook_followers"}, inplace = True)
instagram_followers.rename(columns = {"username": "instagram","followersCount":"instagram_followers"}, inplace = True)
df1.rename(columns = {"follower_count": "twitter_followers"}, inplace = True)

df1 = df1.merge(facebook_followers, how="left", on="facebook")
df1 = df1.merge(instagram_followers, how="left", on="instagram")

df1.to_csv("company_profiles.csv", encoding = "utf=8", index = False)

In [24]:
df1.columns

Index(['Unnamed: 0', 'company', 'facebook', 'instagram', 'twitter',
       'facebook_link', 'twitter_link', 'twitter_followers',
       'facebook_followers_x', 'instagram_followers_x', 'facebook_followers_y',
       'instagram_followers_y'],
      dtype='object')

In [None]:
df = pd.read_csv("FinalImageDataAnalysis.csv")
df1 = pd.read_csv("company_profiles.csv")
df2 = df1[["company", "facebook_followers", "instagram_followers", "twitter_followers"]]
df3 = df.merge(df2, how = "left", on = "company")

  df = pd.read_csv("FinalImageDataAnalysis.csv")


In [3]:
import pandas as pd
df = pd.read_csv("FinalImageDataAnalysis.csv")
df["num_comments"].dtype

  df = pd.read_csv("FinalImageDataAnalysis.csv")


dtype('int64')

10 Images with resizing: 100 seconds
10 Images with retaining aspect ratio: 100 seconds

__OLD CODE__

I was originally using Athec to code for image features but it was incredibly inefficient. Each function reads in the image one at a time, whereas pyaesthetics reads it in once and then conducts the analyses as needed. I have kept the code in case something goes wrong.

Colourfulness

In [9]:
# Initialize lists to hold parsed results
colorful_hs_values = []
colorful_d_values = []
contrast_values = []


# Iterate over each image path
for image in df1["file_path"]:
    try:
        # Process each image with the color attributes
        colorful_hs = color.attr_colorful(image)
        colorful_d = color.attr_colorful_emd(image)
        contrast = color.attr_contrast_peak(image)

        # Append the results for colorful attributes
        colorful_hs_values.append(colorful_hs)
        colorful_d_values.append(colorful_d)

        # Parse the contrast dictionary and collect parsed values
        contrast_parsed = parse_measures(contrast)
        contrast_values.append(contrast_parsed)

    except Exception as e:
        # Handle any error gracefully (e.g., invalid image path)
        print(f"Error processing {image}: {e}")
        colorful_hs_values.append(None)
        colorful_d_values.append(None)
        contrast_values.append(None)

# Assign the collected colorful values to new columns
df1["colorful_hs"] = colorful_hs_values
df1["colorful_d"] = colorful_d_values
add_measure_columns(contrast_values)



In [6]:
df1.columns

Index(['index', 'company', 'platform', 'post_id', 'post_date', 'type',
       'caption', 'num_likes', 'num_comments', 'num_shares', 'file_path',
       'colorful_hs', 'colorful_d', 'contrast_n_peak',
       'contrast_peak_distance', 'contrast_peak_list'],
      dtype='object')

Complexity

In [10]:
emap_values = []
box_values = []
normal_values = []

for image in df1["file_path"]:
    edges = edge.tf_edge_canny(image)
    complexity_emap = edge.attr_complexity_edge(edges)
    complexity_box = edge.attr_complexity_edge(edges)
    #segment_nc = segment.tf_segment_normalized_cut(image)
    #complexity_normal = segment.attr_complexity_segment(segment_nc)

    complexity_emap_parsed = parse_measures(complexity_emap)
    complexity_box_parsed = parse_measures(complexity_box)
    #complexity_normal_parsed = parse_measures(complexity_normal)
    
    emap_values.append(complexity_emap_parsed)
    box_values.append(complexity_box_parsed)
    #normal_values.append(complexity_normal_parsed)

add_measure_columns(emap_values, method_naming= True)
add_measure_columns(box_values, method_naming= True)
#add_measure_columns(normal_values, method_naming= True)

In [11]:
df1.columns

Index(['index', 'company', 'platform', 'post_id', 'post_date', 'type',
       'caption', 'num_likes', 'num_comments', 'num_shares', 'file_path',
       'colorful_hs', 'colorful_d', 'contrast_n_peak',
       'contrast_peak_distance', 'contrast_peak_list', 'emap_edge_density',
       'emap_edge_distance', 'box_edge_density', 'box_edge_distance'],
      dtype='object')

Balance

In [None]:
balance_values = []

for image in df1["file_path"]:
    saliency_spiral1 = saliency.tf_saliency_spectral_residual(image)
    balance_measure = saliency.attr_ruleofthirds_centroid(saliency_spiral1)
    balance_measure_parsed = parse_measures(balance_measure)
    balance_values.append(balance_measure_parsed)

add_measure_columns(balance_values, method_naming= True)
    
df1.columns

In [None]:
'''
Calculate measures of rule of thirds based on saliency values that fall within thirds bands and intersections.
Return:
(1) saliency weights in the two vertical thirds bands and the maximal of the two.
(2) saliency weights in the two horizontal thirds bands and the maximal of the two.
(3) saliency weights in the four intersection rectangles and the maximal of the four.
save_path (optional, default None): str. If provided, a visualization will be saved to this location.
'''

result = saliency.attr_ruleofthirds_band(saliency_spectral, 
                                         save_path = os.path.join(tf_folder, "ruleofthirds band saliency spectral", imgname) )

misc.printd(result)

## __Test Code__

### Sentiment Analysis

In [13]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)
text = "I am so happy, I love everything, yay"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
# text = "Covid cases are increasing fast!"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)
# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) positive 0.9827
2) neutral 0.0087
3) negative 0.0086
