In [None]:
!pip install mahotas
!pip install pycountry
!pip install countryinfo
!pip install pingouin
#pip install openpyxl

In [None]:

import pandas as pd
import mahotas as mt
import glob
import os
import PIL
from PIL import ImageOps
import numpy as np
import pycountry
from countryinfo import CountryInfo
import pingouin as pg
from IPython.display import Image, display
import seaborn as sns
sns.set()
import pingouin as pg
import plotly.express as px


In [None]:
from PIL import Image 

In [None]:
# Load every PNG in the working directory as a grayscale pixel array.
# glob returns files in arbitrary, filesystem-dependent order; sorting keeps
# the row order of every table built from these lists reproducible.
filenames = sorted(glob.glob('*.png'))

names = []   # file name without directory and extension (looked up later as ISO alpha-2 codes)
images = []  # grayscale arrays, aligned index-for-index with `names`

for path in filenames:
    names.append(os.path.splitext(os.path.basename(path))[0])
    # The context manager releases the file handle as soon as the pixels have
    # been converted, instead of leaving that to garbage collection.
    with Image.open(path) as img:
        gray = ImageOps.grayscale(img.convert("RGBA"))
    images.append(np.asanyarray(gray))

In [None]:
# Sanity check: length of the first image array's leading axis
# (for a 2-D grayscale array this is the number of pixel rows).
len(images[0])

# What are Haralick features?

Haralick features are a set of texture descriptors derived from the Gray Level Co-occurrence Matrix (GLCM), which is a method of examining the texture of an image by considering the spatial relationship of pixels. The GLCM is a matrix that represents how often different combinations of pixel brightness values (gray levels) occur in an image. From the GLCM, various statistical measures (Haralick features) can be calculated to describe the texture of the image. There are 14 commonly used Haralick features, each capturing a different aspect of texture (the list below has 13 entries because the two Information Measures of Correlation share entry 12):

1. Angular Second Moment (ASM): Measures image homogeneity. Higher values indicate more homogeneity or uniformity in the image texture.

2. Contrast: Measures the local variations in the gray-level co-occurrence matrix. Higher contrast values indicate greater disparities in pixel intensities.

3. Correlation: Measures how linearly dependent the gray level of a pixel is on the gray level of its neighbor. High correlation indicates a predictable relationship between neighboring pixel values.

4. Sum of Squares (Variance): Reflects the variance of the image intensities. It's a measure of the spread or dispersion of pixel values.

5. Inverse Difference Moment (IDM): Also known as Homogeneity. It's high when the image has less contrast, indicating more homogeneity.

6. Sum Average: The average value of the sum of gray levels of pixel pairs. It's a measure of the overall brightness.

7. Sum Variance: Measures the variance of the sums of gray levels of pixel pairs, i.e. how widely those sums spread around the Sum Average.

8. Sum Entropy: Measures the randomness or complexity in the sum of gray levels. Higher values indicate more complexity.

9. Entropy: Quantifies the disorder or complexity of the image. Higher entropy values imply more complex texture patterns.

10. Difference Variance: Measures the variance in the difference between the gray levels of the pixel pairs.

11. Difference Entropy: Measures the complexity or randomness of the differences between the gray levels of the pixel pairs.

12. Information Measures of Correlation I & II: These two features provide information about the complexity of the image texture as seen in the GLCM. They measure how correlated a pixel is to its neighbor over the whole image.

13. Maximal Correlation Coefficient (MCC): This measures the correlation between the probabilities of the pixel pairs. It requires eigenvalue calculations and is often more computationally intensive.

# Compute Haralick features and create a DataFrame

In [None]:
# One feature vector per image: return_mean=True asks mahotas for the mean
# across directions, and compute_14th_feature=True adds the optional 14th
# feature (maximal correlation coefficient).
haralick = [
    mt.features.haralick(image, return_mean=True, compute_14th_feature=True)
    for image in images
]

# Column labels are positional; they are assumed to follow the order in which
# mahotas emits the features.
features = [
    'angular_2nd_momentum',
    'contrast',
    'correlation',
    'SS_variance',
    'Inverse_diff_moment',
    'sum_average',
    'sum_variance',
    'sum_entropy',
    'entropy',
    'difference_variation',
    'difference_entropy',
    'info_corr_1',
    'info_corr_2',
    'max_corr_coeff',
]

h_df = pd.DataFrame(haralick, columns=features)
# `names` is aligned index-for-index with `images`, so each row gets its file stem.
h_df['short_names'] = names

In [None]:
# Inspect the Haralick feature table (one row per image, plus 'short_names').
h_df

# Get full country names

In [None]:
# Resolve each image's short name (treated as an ISO 3166-1 alpha-2 code) to
# the full country name; names that cannot be resolved become NaN.
full_name = []

for code in h_df['short_names']:
    try:
        country = pycountry.countries.get(alpha_2=code)
    except LookupError:
        # Depending on the pycountry version, an unknown code either raises a
        # LookupError/KeyError or returns None; normalise both to None.
        country = None
    full_name.append(country.name if country is not None else np.nan)

h_df['full_name'] = full_name

In [None]:
# Inspect the table again, now including the 'full_name' column.
h_df

# Get number of borders

In [None]:
# Count the land borders of every country; NaN where the count is unavailable.
borders = []

for country_name in h_df['full_name']:
    # Rows whose code could not be resolved carry NaN instead of a name string.
    if not isinstance(country_name, str):
        borders.append(np.nan)
        continue
    try:
        borders.append(len(CountryInfo(country_name).borders()))
    except Exception:
        # Best effort: countryinfo may not know this spelling of the name.
        # Catching Exception (not a bare except) still lets KeyboardInterrupt
        # and SystemExit propagate.
        borders.append(np.nan)

h_df['borders'] = borders

In [None]:
# Inspect the table again, now including the 'borders' column.
h_df

## Get data on ethnic diversity

In [None]:
# Load the ethnic-diversity table from the working directory.
# NOTE(review): later cells expect it to provide 'full_name' (the merge key)
# and 'ethnic fractionalization' columns — confirm against the CSV.
ethnic = pd.read_csv('ethnic_fractions.csv')

In [None]:
# Attach the ethnic-diversity columns to the feature table (left join, so
# every image row is kept even without a match).
# Columns that an earlier run of this cell already merged in are dropped
# first; otherwise re-running the cell would produce '_x'/'_y' suffixed
# duplicates and break later column lookups.
already_merged = [
    col for col in ethnic.columns
    if col != 'full_name' and col in h_df.columns
]
h_df = pd.merge(
    h_df.drop(columns=already_merged),
    ethnic,
    on='full_name',
    how='left',
)

In [None]:
# Working subset for the final plot. The explicit copy makes `data` an
# independent frame, so later column assignments neither raise
# SettingWithCopyWarning nor risk touching `h_df`.
data = h_df[['ethnic fractionalization', 'borders', 'entropy', 'contrast', 'full_name']].copy()

In [None]:
# Entropy against ethnic fractionalization, with an ordinary-least-squares
# trend line (plotly needs statsmodels installed for trendline='ols').
px.scatter(
    h_df,
    x='ethnic fractionalization',
    y='entropy',
    hover_data=['full_name'],
    trendline='ols',
)

In [None]:
# Linear regression of entropy (response) on ethnic fractionalization
# (predictor); rows with missing values are dropped by pingouin.
lm = pg.linear_regression(
    h_df['ethnic fractionalization'],
    h_df['entropy'],
    remove_na=True,
)

In [None]:
# Display the regression table computed in the previous cell.
lm

In [None]:
# Contrast against the number of borders, with an ordinary-least-squares
# trend line (plotly needs statsmodels installed for trendline='ols').
px.scatter(
    h_df,
    x='borders',
    y='contrast',
    hover_data=['full_name', 'borders'],
    trendline='ols',
)

In [None]:
# Linear regression of contrast (response) on the number of borders
# (predictor); rows with missing values are dropped by pingouin.
# Note: this rebinds `lm`, replacing the previous regression result.
lm = pg.linear_regression(
    h_df['borders'],
    h_df['contrast'],
    remove_na=True,
)

In [None]:
# Display the regression table computed in the previous cell.
lm

In [None]:
# Centre entropy and contrast on the midpoint of their observed range, so the
# final scatter plot is centred on the origin.
mid_e = (data['entropy'].min() + data['entropy'].max()) / 2
mid_c = (data['contrast'].min() + data['contrast'].max()) / 2

# `assign` builds a new frame instead of writing into a frame derived from
# `h_df`, which avoids SettingWithCopyWarning. Re-running the cell is harmless:
# once centred, both midpoints are 0.
data = data.assign(
    entropy=data['entropy'] - mid_e,
    contrast=data['contrast'] - mid_c,
)

In [None]:
# Scatter of the centred entropy and contrast values; hovering a point shows
# the country name and its border count.
fig = px.scatter(
    data,
    x="entropy",
    y="contrast",
    hover_data=['full_name', 'borders'],
)
fig.show()