## Task 2
We are looking at Private individuals vs Business actors again

In [5]:
import os
import json
import numpy as np
import pandas as pd
import imagehash
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
from image_analysis.data import get_images_dataframe, get_images, get_image
from sdm.config import get_db_connection

In [25]:
conn = get_db_connection(db_type="sqlite", db_path="../../../../data/twitter.db")
df = get_images_dataframe(db=conn)
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38980 entries, 0 to 38979
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   account_id    38980 non-null  object
 1   image_id      38980 non-null  object
 2   account_type  38980 non-null  object
 3   lang          38729 non-null  object
 4   stance        38441 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB
None


Unnamed: 0,account_id,image_id,account_type,lang,stance
0,8508262,,Private individuals,fr,For
1,8508262,,Private individuals,fr,For
2,8508262,,Private individuals,fr,For
3,8508262,,Private individuals,fr,For
4,8508262,,Private individuals,fr,For


In [26]:
df = df[df["image_id"] != "nan"]
df = df.dropna(subset=["image_id"])
df["media_keys"] = df["image_id"].apply(
    lambda x: json.loads(x.replace("'", '"')).get("media_keys") if "media_keys" in x else None
)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,account_id,image_id,account_type,lang,stance,media_keys
0,8508262,{'media_keys': ['3_669872845984546817']},Private individuals,fr,For,[3_669872845984546817]
1,8508262,{'media_keys': ['3_671208626439213056']},Private individuals,fr,For,[3_671208626439213056]
2,8508262,{'media_keys': ['3_671223094661545984']},Private individuals,fr,For,[3_671223094661545984]
3,8508262,{'media_keys': ['3_671231051176517632']},Private individuals,fr,For,[3_671231051176517632]
4,8508262,{'media_keys': ['3_671231164783398913']},Private individuals,fr,For,[3_671231164783398913]


## Use perceptual hashing as analysis method

In [1]:
df["images"] = df["media_keys"].apply(lambda keys: get_images(media_keys=keys, media_dir="../../../../data/media"))
df.head()

NameError: name 'df' is not defined

In [2]:
df = df[df["images"].apply(lambda x: len(x) > 0 & isinstance(x, list))]
df.info()

NameError: name 'df' is not defined

In [3]:
def get_image_hash(media_keys: list, media_dir="../../../../data/media"):
    if not os.path.exists(media_dir):
        raise FileNotFoundError(f"The directory '{media_dir}' does not exist.")
    possible_extensions = [".jpg", ".png"]

    key = media_keys[0]
    found = False
    for ext in possible_extensions:
        image_path = os.path.join(media_dir, f"{key}{ext}")
        if os.path.exists(image_path):
            try:
                img = Image.open(image_path)
                phash = imagehash.phash_simple(img)
                found = True
                return phash
            except Exception as e:
                print(f"[!] Failed to open {image_path}: {e}")
                return None
    if not found:
        print(f"[*] No image file found for key '{key}' in directory '{media_dir}'.")
        return None

df["phash"] = df["media_keys"].apply(get_image_hash)  # only use first image (more simple)
df.info()

NameError: name 'df' is not defined

In [33]:
df = df.dropna(subset=["phash"])
df.to_csv("../../../../data/phash_imgs.csv")

8576

In [6]:
df = pd.read_csv("../../../../data/phash_imgs.csv")
len(df["phash"].unique())

8576

In [7]:
# inspect images under same ID
# e0e0e0e0f2f4fcec, 8080808080c08e88, 0a0b0f0f1f1f0f7f, af2727a7a7a7a727, 9999999999999999
max = 5
count = 0
for _, r in df[df["phash"] == "8080808080c08e88"].iterrows():
    imgs = get_images(eval(r["media_keys"]), media_dir="../../../../data/media")
    imgs[0].show()
    count += 1
    if count >= max:
        break

In [11]:
df[["phash", "image_id"]].groupby("phash").count().sort_values("image_id", ascending=False)
df[df["phash"] == "8080808080c08e88"]

Unnamed: 0.1,Unnamed: 0,account_id,image_id,account_type,lang,stance,media_keys,images,phash
6505,6670,3911870127,{'media_keys': ['3_674254206774153217']},Private individuals,en,Unclear,['3_674254206774153217'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88
6506,6671,3911870127,{'media_keys': ['3_674254206774153217']},Private individuals,en,Unclear,['3_674254206774153217'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88
6507,6672,3911870127,{'media_keys': ['3_674254206774153217']},Private individuals,en,Unclear,['3_674254206774153217'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88
6508,6673,3911870127,{'media_keys': ['3_674254206774153217']},Private individuals,en,Unclear,['3_674254206774153217'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88
6509,6674,3911870127,{'media_keys': ['3_674254206774153217']},Private individuals,en,Unclear,['3_674254206774153217'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88
...,...,...,...,...,...,...,...,...,...
11355,11868,3881648534,{'media_keys': ['3_675351147100676096']},Private individuals,nl,Unclear,['3_675351147100676096'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88
11356,11869,3881648534,{'media_keys': ['3_675351147100676096']},Private individuals,nl,Unclear,['3_675351147100676096'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88
11357,11870,3881648534,{'media_keys': ['3_675351147100676096']},Private individuals,nl,Unclear,['3_675351147100676096'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88
11358,11871,3881648534,{'media_keys': ['3_675351147100676096']},Private individuals,nl,Unclear,['3_675351147100676096'],[<PIL.JpegImagePlugin.JpegImageFile image mode...,8080808080c08e88


### Now use average hash instead

In [8]:
def get_image_avg_hash(media_keys: list, media_dir="../../../../data/media"):
    if not os.path.exists(media_dir):
        raise FileNotFoundError(f"The directory '{media_dir}' does not exist.")
    if not isinstance(media_keys, list):
        media_keys = eval(media_keys)
    possible_extensions = [".jpg", ".png"]

    key = media_keys[0]
    found = False
    for ext in possible_extensions:
        image_path = os.path.join(media_dir, f"{key}{ext}")
        if os.path.exists(image_path):
            try:
                img = Image.open(image_path)
                phash = imagehash.average_hash(img)
                found = True
                return phash
            except Exception as e:
                print(f"[!] Failed to open {image_path}: {e}")
                return None
    if not found:
        print(f"[*] No image file found for key '{key}' in directory '{media_dir}'.")
        return None

df["avghash"] = df["media_keys"].apply(get_image_avg_hash)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15534 entries, 0 to 15533
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    15534 non-null  int64 
 1   account_id    15534 non-null  int64 
 2   image_id      15534 non-null  object
 3   account_type  15534 non-null  object
 4   lang          15408 non-null  object
 5   stance        15337 non-null  object
 6   media_keys    15534 non-null  object
 7   images        15534 non-null  object
 8   phash         15534 non-null  object
 9   avghash       15534 non-null  object
dtypes: int64(2), object(8)
memory usage: 1.2+ MB


In [34]:
# temp = df["avghash"].sample(1000)
# temp = temp.apply(lambda x: x - df.sample(1)["avghash"].item())
