In [2]:
import ast

import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from shapely.geometry import LineString
import geopandas as gpd
import geopy.distance
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import shapely.wkt
import matplotlib as mpl
from matplotlib.ticker import ScalarFormatter
import matplotlib.colors as mplc
from mpl_toolkits.axes_grid1 import make_axes_locatable

#import shap
from matplotlib import gridspec
import matplotlib.lines as mlines

import scipy.stats
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table
from adjustText import adjust_text

from pathlib import Path

STEPS:

1) collect 30000 tweets per country (twitter_api.py)
2) put tweets into a dataset (tweets_dataset.py)
3) find locations from the text using Nominatim (tweets_flux.py)
4) Find self-declared locations using Nominatim and compare them with Place locations (main geocoding)d
5) create matrices of fluxes - total number and percentage of mentions from one country to another

In [3]:
cwd = Path.cwd()
parent_dir = cwd.parent.parent

parent_dir

PosixPath('/home/veror/Desktop/Tweet2Geo/NEW Tweet2Geo')

## Read data

In [6]:
# merge iso2 - iso3

AREA = 'EU'
# geodata dataset
gdf = gpd.read_file(parent_dir / 'Data' / 'ne_50m_admin_0_countries.shp')
# Molloweide projection
gdf = gdf.to_crs("ESRI:54009")
# iso2 - iso3 dataset
iso_2_3_df = pd.read_csv(parent_dir / 'Data' / 'iso2-3 europe.csv')
# merge
df_country = pd.merge(gdf, iso_2_3_df, on='ADM0_A3', how='right')
#df_country.to_csv('Nations tweets/info_country.csv', index=False)


### Out-flows

In [None]:
import os
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.lines as mlines

# Read data frame of flows and convert it into GeoDataFrame
flows_dir = parent_dir / 'Data' / 'Europe' / '0 fluxes 0.5'
flows_files = os.listdir(flows_dir)

legend_fontsize = 18

# Save directory
save_dir = parent_dir / 'Data analysis' / 'Choroplets' / 'EU' / 'out_flows_0.5'
colorsource = 'firebrick'
n = 0
for file in flows_files:
    country = file[:2]

    if len(df_country[df_country['ISO_A2'] ==country]) > 0:
        df_flow = pd.read_csv(flows_dir / file)
        df_flow = df_flow[~(df_flow['ISO 2'] == country)]

        print(country)

        # Prepare data
        df_source = df_country[df_country['ISO_A2'] == country]
        df_flow = pd.merge(df_country, df_flow, on='ISO 2', how='left')

        # Convert DataFrame to GeoDataFrame
        df_flow = gpd.GeoDataFrame(df_flow, geometry='geometry', crs='EPSG:3857')
        df_source = gpd.GeoDataFrame(df_source, geometry='geometry', crs='EPSG:3857')

        # Create figure with custom GridSpec
        fig = plt.figure(figsize=(7, 6), dpi=250)
        gs = gridspec.GridSpec(1, 2, width_ratios=[0.95, 0.05], wspace=0.04)

        # Axes for main map and colorbar
        ax = fig.add_subplot(gs[0])
        cax = fig.add_subplot(gs[1])

        # Plot map
        df_flow.plot(column='flux_percentage',
                ax=ax,
                cmap='Blues',
                legend=True,
                missing_kwds={'color': 'lightgrey'},
                legend_kwds={'label': ''},  # Leave empty; we format cbar manually
                cax=cax,
                rasterized=True)

        df_flow.boundary.plot(color='black', ax=ax, linewidth=0.5)

        # Plot source country
        df_source.plot(color=colorsource, alpha=0.5, ax=ax)
        df_source.boundary.plot(color=colorsource, linewidth=0.5, ax=ax)

        # Add manual legend for source
        legend_handles = [mlines.Line2D([0], [0], color=colorsource, lw=4,
                                    label=f'Source: {df_source.ADMIN.values[0]}')]
        ax.legend(handles=legend_handles, loc='lower left', fontsize=12, frameon=False)

        # Axis settings
        ax.set(xlim=(-1500000, 3500000), ylim=(4000000, 9000000))
        ax.axis('off')

        # Format colorbar
        cax.set_ylabel('Tweets out-flows percentage [%]', rotation=90, fontsize=18)
        cax.tick_params(labelsize=16)
        cax.yaxis.set_ticks_position('right')
        cax.yaxis.set_label_position('right')
        cax.set_xticks([])  # Optional: remove x ticks if unwanted

        # Save
        file = country + '_choroplet.pdf'
        fig.savefig(save_dir / file, dpi=250, bbox_inches='tight')
        plt.close()

    n += 1


CZ
AD
DE
LT
LI
IE
ME
CH
DK
SE
EE
RS
IT
BG
AL
BY
SI
TR
UA
AT
FI
HU
BE
MD
PL
IS
NL
LV
BA
LU
VA
HR
RO
MK
RU
GR
SM
MT
SK
GB
MC
ES
PT


### In-flows

In [15]:
import os
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.colors as mplc
from matplotlib import gridspec

# Read data frame of flows and convert it into GeoDataFrame
flows_dir = parent_dir / 'Data' / 'Europe' / '0 fluxes 0.5'
flows_files = os.listdir(flows_dir)

# Save directory
save_dir = parent_dir / 'Data analysis' / 'Choroplets' / 'EU' / 'in_flows_0.5'
colorsource = 'firebrick'
legend_fontsize = 18


for file in flows_files:
    df_in_flows = pd.DataFrame(columns=[])
    df_flow = pd.read_csv(flows_dir / file)
    country = file[0:2]
    print(country)

    if country in ['NO','FR']:
        country = 'BE'

    df_source = df_country[df_country['ISO_A2'] == country]

    for file1 in flows_files:
        country1 = file1[0:2]
        if country1 != country:
            df_flow1 = pd.read_csv(flows_dir / file1)
            df_flow1 = df_flow1[df_flow1['ISO 2'] == country].reset_index()
            df_flow1.at[0, 'ISO 2'] = country1
            df_in_flows = df_in_flows._append(df_flow1, ignore_index=True)

    df_in_flow = pd.merge(df_country, df_in_flows, on='ISO 2', how='left')
    df_in_flow = gpd.GeoDataFrame(df_in_flow, geometry='geometry', crs='EPSG:3857')
    df_source = gpd.GeoDataFrame(df_source, geometry='geometry', crs='EPSG:3857')

    # Plot using gridspec
    fig = plt.figure(figsize=(7, 6), dpi=250)
    gs = gridspec.GridSpec(1, 2, width_ratios=[0.95, 0.05], wspace=0.04)

    ax = fig.add_subplot(gs[0])
    cax = fig.add_subplot(gs[1])

    # Plot flow data
    df_in_flow.plot(column='flux_percentage',
                    ax=ax,
                    cmap='Blues',
                    legend=True,
                    missing_kwds={'color': 'lightgrey'},
                    legend_kwds={'label': ''},  # leave empty, we format cbar manually
                    cax=cax,
                    rasterized=True)
    df_in_flow.boundary.plot(color='black', ax=ax, linewidth=0.5)

    # Plot source country
    df_source.plot(color=colorsource, alpha=0.5, ax=ax)
    df_source.boundary.plot(color=colorsource, linewidth=0.5, ax=ax)

    # Manual legend for source
    legend_handles = [mlines.Line2D([0], [0], color=colorsource, lw=4,
                                     label=f'Target: {df_source.ADMIN.values[0]}')]
    ax.legend(handles=legend_handles, loc='lower left', fontsize=12, frameon=False)

    # Axis & colorbar formatting
    ax.set(xlim=(-1500000, 3500000), ylim=(4000000, 9000000))
    ax.axis('off')

    cax.set_ylabel('Tweets in-flows percentage [%]', rotation=90, fontsize=18)
    cax.tick_params(labelsize=16)

    #cax.yaxis.set_ticks_position('right')
    #cax.yaxis.set_label_position('right')
    #cax.set_xticks([])

    # Save
    file = country + '_choroplet.pdf'
    fig.savefig(save_dir / file, dpi=250, bbox_inches='tight')
    plt.close()


CZ
AD
DE
LT
LI
IE
ME
CH
DK
SE
EE
RS
IT
BG
AL
BY
SI
TR
UA
AT
FI
HU
BE
MD
PL
IS
NL
LV
BA
LU
NO
VA
HR
RO
FR
MK
RU
GR
SM
MT
SK
GB
MC
ES
PT


### Self-loops

In [17]:
import os
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as mplc

# Read data frame of flows and convert it into geodataframe
flows_dir = parent_dir / 'Data' / 'Europe' / '0 fluxes 0.5'
flows_files = os.listdir(flows_dir)

# Save directory
save_dir = parent_dir / 'Data analysis' / 'Choroplets' / 'EU'

# Base country GeoDataFrame (assumed to be previously loaded)
# df_country must be defined beforehand and must include 'ISO 2', 'ISO_A2', 'geometry', 'ADMIN' columns

df_self_loops = pd.DataFrame(columns=[])

for file in flows_files:
    df_flow = pd.read_csv(flows_dir / file)
    country = file[0:2]
    country_row = df_flow[df_flow['ISO 2'] == country]
    df_flow = pd.merge(df_country, country_row, on='ISO 2')
    df_self_loops = pd.concat([df_self_loops, df_flow], ignore_index=True)

# Convert to GeoDataFrame if not already
df_self_loops = gpd.GeoDataFrame(df_self_loops, geometry='geometry', crs='EPSG:3857')

# Plotting settings
legend_fontsize = 18
colorsource = 'firebrick'

vmin = df_self_loops['flux_percentage'].min()
vmax = df_self_loops['flux_percentage'].max()

# Plot using gridspec
fig = plt.figure(figsize=(7, 6), dpi=250)
gs = gridspec.GridSpec(1, 2, width_ratios=[0.95, 0.05], wspace=0.04)

ax = fig.add_subplot(gs[0])
cax = fig.add_subplot(gs[1])

# Adjust cax to be 60% the height of the figure manually
#cax = fig.add_axes([0.85, 0.2, 0.02, 0.6]) # [left, bottom, width, height] in figure coordinates

# Main choropleth map
df_self_loops.plot(
    column='flux_percentage',
    legend=True,
    missing_kwds={'color': 'lightgrey'},
    legend_kwds={'label': 'Tweets self-loop percentage [%]'},
    ax=ax,
    cmap='GnBu',
    vmin=vmin,
    vmax=vmax,
    cax=cax,
    rasterized=True
)

# Add boundaries
df_self_loops.boundary.plot(color='black', ax=ax, linewidth=0.5)

# Set view limits (adjust if necessary)
ax.set(xlim=(-1500000, 3500000), ylim=(4000000, 9000000))

# Format colorbar
cax.set_ylabel('Percentage of self-loops [%]', rotation=90, fontsize=18)
cax.tick_params(labelsize=16)

# Remove axes
ax.axis('off')

# Save figure
fig.savefig(save_dir / 'self_loops_0.5_choroplet.pdf', dpi=250, bbox_inches='tight')
plt.close()
