# モジュール読み込み

In [77]:
import pandas as pd
import networkx as nx
from tqdm import tqdm
import numpy as np

# データ読み込み

## 読み込み

In [3]:
# Read the four input tables; the CSV paths are relative to the working directory.
df_presentations, df_attendees, df_connections, df_author_relations = (
    pd.read_csv(csv_name)
    for csv_name in (
        'presentations_vectors.csv',  # list of presentations
        'attendees.csv',              # list of attendees
        'connections.csv',            # list of connections made on the app
        'author_relations.csv',       # relationships connecting attendees to presentations
    )
)

## 国コードの定義・言語と国の対応関係・分野・参加登録様式

In [5]:
# Lookup table: country name -> two-letter country code.
# It is applied to the attendee `country` column with Series.map, so any name
# that is missing from this table becomes NaN after the conversion.
# NOTE(review): the codes look like ISO 3166-1 alpha-2 — confirm.
# NOTE(review): keys such as 'Argentine', 'Korea' and 'UK' presumably mirror the
# spelling used in attendees.csv; verify against the data before "correcting" them.
country_code_dict = {
    'Nepal': 'NP',
    'Pakistan': 'PK',
    'Canada': 'CA',
    'Netherlands': 'NL',
    'Sweden': 'SE',
    'Serbia': 'RS',
    'Czech Republic': 'CZ',
    'New Zealand': 'NZ',
    'Laos': 'LA',
    'Finland': 'FI',
    'Taiwan': 'TW',
    'Poland': 'PL',
    'Turkey': 'TR',
    'Colombia': 'CO',
    'Estonia': 'EE',
    'Korea': 'KR',
    'Bangladesh': 'BD',
    'Thailand': 'TH',
    'Romania': 'RO',
    'Germany': 'DE',
    'Japan': 'JP',
    'Singapore': 'SG',
    'Sri Lanka': 'LK',
    'India': 'IN',
    'Israel': 'IL',
    'Indonesia': 'ID',
    'Palestine': 'PS',
    'South Africa': 'ZA',
    'Hungary': 'HU',
    'Argentine': 'AR',
    'Ireland': 'IE',
    'Ghana': 'GH',
    'Latvia': 'LV',
    'Slovenia': 'SI',
    'Myanmar': 'MM',
    'France': 'FR',
    'Belgium': 'BE',
    'Switzerland': 'CH',
    'Slovakia': 'SK',
    'Brazil': 'BR',
    'Russia': 'RU',
    'Norway': 'NO',
    'UK': 'GB',
    'Malaysia': 'MY',
    'Philippines': 'PH',
    'China': 'CN',
    'Portugal': 'PT',
    'Denmark': 'DK',
    'Chile': 'CL',
    'Mexico': 'MX',
    'Uruguay': 'UY',
    'Australia': 'AU',
    'Italy': 'IT',
    'Greece': 'GR',
    'Hong Kong': 'HK',
    'Zambia': 'ZM',
    'Iran': 'IR',
    'Kenya': 'KE',
    'Croatia': 'HR',
    'United Arab Emirates': 'AE',
    'Puerto Rico': 'PR',
    'Spain': 'ES',
    'Austria': 'AT',
    'United States of America': 'US',
    'Réunion': 'RE'
}

# Lookup table: two-letter country code -> list of that country's languages.
# The keys use the same code format as the values of country_code_dict.
# NOTE(review): the lists are described as "official languages"; the source of
# these lists is not shown here, so treat them as a project convention — confirm.
# NOTE(review): this table is not referenced by the cells visible in this part
# of the notebook; presumably it is used further down — verify.
country_language_dict = {
    'NP': ['Nepali'],
    'PK': ['Urdu', 'English'],
    'CA': ['English', 'French'],
    'NL': ['Dutch'],
    'SE': ['Swedish'],
    'RS': ['Serbian'],
    'CZ': ['Czech'],
    'NZ': ['English', 'Maori'],
    'LA': ['Lao'],
    'FI': ['Finnish', 'Swedish'],
    'TW': ['Mandarin Chinese'],
    'PL': ['Polish'],
    'TR': ['Turkish'],
    'CO': ['Spanish'],
    'EE': ['Estonian'],
    'KR': ['Korean'],
    'BD': ['Bengali'],
    'TH': ['Thai'],
    'RO': ['Romanian'],
    'DE': ['German'],
    'JP': ['Japanese'],
    'SG': ['English', 'Malay', 'Mandarin Chinese', 'Tamil'],
    'LK': ['Sinhala', 'Tamil'],
    'IN': ['Hindi', 'English'],
    'IL': ['Hebrew', 'Arabic'],
    'ID': ['Indonesian'],
    'PS': ['Arabic'],
    'ZA': ['English', 'Afrikaans', 'Zulu', 'Xhosa', 'Southern Sotho'],
    'HU': ['Hungarian'],
    'AR': ['Spanish'],
    'IE': ['English', 'Irish'],
    'GH': ['English'],
    'LV': ['Latvian'],
    'SI': ['Slovene'],
    'MM': ['Burmese'],
    'FR': ['French'],
    'BE': ['Dutch', 'French', 'German'],
    'CH': ['German', 'French', 'Italian', 'Romansh'],
    'SK': ['Slovak'],
    'BR': ['Portuguese'],
    'RU': ['Russian'],
    'NO': ['Norwegian'],
    'GB': ['English'],
    'MY': ['Malay', 'English', 'Mandarin Chinese', 'Tamil'],
    'PH': ['Filipino', 'English'],
    'CN': ['Mandarin Chinese'],
    'PT': ['Portuguese'],
    'DK': ['Danish'],
    'CL': ['Spanish'],
    'MX': ['Spanish'],
    'UY': ['Spanish'],
    'AU': ['English'],
    'IT': ['Italian'],
    'GR': ['Greek'],
    'HK': ['Cantonese', 'English'],
    'ZM': ['English'],
    'IR': ['Persian'],
    'KE': ['English', 'Swahili'],
    'HR': ['Croatian'],
    'AE': ['Arabic'],
    'PR': ['Spanish', 'English'],
    'ES': ['Spanish'],
    'AT': ['German'],
    'US': ['English', 'Spanish'],
    'RE': ['French']
}

# Research field names keyed by their numeric field code.
# The list position (counting from 1) is the code, so the order below must not change.
field_name_dict = dict(enumerate(
    [
        "Acarology and Arachnology",
        "Apiculture and Sericulture",
        "Biological Control",
        "Chemical Ecology",
        "Conservation, Biodiversity and Biogeography",
        "Development and Reproduction",
        "Ecology and Evolution",
        "Genetics and Genomics",
        "Immunology and Pathology",
        "Insect-Microbe Interactions",
        "Insects as Food, Feed and Pollinators",
        "Alien insects",
        "Medical and Veterinary Entomology",
        "Pest Management",
        "Pesticides, GM Crops, Resistance and Toxicology",
        "Physiology, Neurobiology and Molecular Biology",
        "Social Insects",
        "Systematics, Phylogeny and Morphology",
        "Special Issue: Biomimetics and Robotics",
        "Others",
    ],
    start=1,
))

# Lookup table: raw `attend_as` answer from the registration data -> short category label.
# Pre-registration and "(On-Site)" variants of an answer collapse to the same category.
# The keys are matched verbatim (including the Japanese form annotations and the
# "Accompaying" spelling), so they must not be edited without checking the data.
# Applied with Series.map, so answers missing from this table become NaN.
attend_as_dict = {"General": "General",
                  "Student": "Student",
                  # Japanese annotation in the key: "(*please refer to the note above)"
                  "Developing country (*上記注釈をご参照ください)": "Developing country",
                  "Accompanying guest": "Accompanying guest",
                  # Japanese annotation in the key: "(*please do not select this category)"
                  "Guest (*このカテゴリは選択しないでください)": "Guest",
                  "(On-Site)General": "General",
                  "(On-Site)Students": "Student",
                  "(On-Site)Accompaying guest": "Accompanying guest",
                  # Japanese annotation in the key: "*if you are buying a one-day pass, tick this
                  # item and then choose your attendance date in the next question."
                  "One Day Pass *1日券を購入する場合、この項目にチェックを入れたうえで次の質問で来場日を選択してください。": "One_day"
                  }

## データの結合

In [6]:
# Normalize the attendee table in one pass:
#   country    -> two-letter country code
#   attend_as  -> short registration category
#   field_name -> readable name for the numeric `field` code
# and keep only the columns used downstream.
# NOTE: this cell overwrites df_attendees and is therefore not idempotent. Running it a
# second time maps the already-converted values again; values that are not keys of the
# lookup tables (e.g. the country codes) then turn into NaN. Reload the CSV before re-running.
df_attendees = (
    df_attendees
    .assign(
        country=lambda d: d["country"].map(country_code_dict),
        attend_as=lambda d: d["attend_as"].map(attend_as_dict),
        field_name=lambda d: d["field"].map(field_name_dict),
    )
    .loc[:, ["attendee_id", "country", "attend_as", "field", "field_name"]]
)

In [8]:
# Clean the raw connection log and attach attendee attributes to both endpoints.
#
# The whole transformation is written as one chain that always produces new frames:
# the previous version assigned a column onto a column-subset of the frame (the pattern
# that triggers pandas' SettingWithCopyWarning) and dropped columns with inplace=True.
#
# Steps:
#   1. There are several duplicates of the same connection due to an error on the app
#      server; keep one row per connection_id.
#   2. Keep only the columns needed downstream.
#   3. Add `timestamp_datetime`, a datetime64[ns] version of `timestamp`
#      (the original `timestamp` column is left unchanged).
#   4. Left-join the attendee attributes twice, prefixed with from_/to_, so connections
#      whose endpoint is missing from df_attendees are kept with NaN attributes.
#   5. Drop the join keys, which duplicate from_person / to_person.
df_connections = (
    df_connections
    .drop_duplicates("connection_id")
    .loc[:, ["connection_id", "from_person", "to_person", "timestamp", "forward"]]
    .assign(timestamp_datetime=lambda d: d["timestamp"].astype("datetime64[ns]"))
    .merge(
        df_attendees.rename(columns=lambda x: f"from_{x}"),
        left_on="from_person",
        right_on="from_attendee_id",
        how="left",
    )
    .merge(
        df_attendees.rename(columns=lambda x: f"to_{x}"),
        left_on="to_person",
        right_on="to_attendee_id",
        how="left",
    )
    .drop(columns=["from_attendee_id", "to_attendee_id"])
)

df_connections.head()

Unnamed: 0,connection_id,from_person,to_person,timestamp,forward,timestamp_datetime,from_country,from_attend_as,from_field,from_field_name,to_country,to_attend_as,to_field,to_field_name
0,A01313_A01451,A01313,A01451,2024-08-09 18:04:17.499,True,2024-08-09 18:04:17.499,JP,General,7.0,Ecology and Evolution,JP,Student,7.0,Ecology and Evolution
1,A01451_A01313,A01451,A01313,2024-08-09 18:04:17.499,False,2024-08-09 18:04:17.499,JP,Student,7.0,Ecology and Evolution,JP,General,7.0,Ecology and Evolution
2,A00777_A01306,A00777,A01306,2024-08-16 15:20:09.809,True,2024-08-16 15:20:09.809,JP,General,14.0,Pest Management,JP,General,14.0,Pest Management
3,A01306_A00777,A01306,A00777,2024-08-16 15:20:09.809,False,2024-08-16 15:20:09.809,JP,General,14.0,Pest Management,JP,General,14.0,Pest Management
4,A01385_A00867,A01385,A00867,2024-08-16 18:37:31.096,True,2024-08-16 18:37:31.096,TW,Student,18.0,"Systematics, Phylogeny and Morphology",TW,General,18.0,"Systematics, Phylogeny and Morphology"


## networkxグラフを作成

In [45]:
# Use only the rows flagged `forward`, then add the `start` / `end` strings that are
# stored on each edge to make the exported graph dynamic: `start` is the connection
# time, `end` is the same fixed timestamp for every edge.
df_connections_for_graph = df_connections.loc[df_connections["forward"]].assign(
    start=lambda d: d["timestamp_datetime"].dt.strftime('%Y-%m-%dT%H:%M:%S'),
    end="2024-09-05T20:00:00",
)

# Build the graph with one edge per person pair.
# NOTE(review): per the networkx docs `edge_key` is only used for multigraphs; G is
# created as a plain Graph here, so the argument appears to have no effect — confirm.
G = nx.from_pandas_edgelist(
    df_connections_for_graph,
    source="from_person",
    target="to_person",
    edge_attr=["start", "end"],
    edge_key="connection_id",
)

# Copy attendee attributes onto the nodes. Attendees that are not in the graph
# are skipped.
for attendee in df_attendees.to_dict("records"):
  person = attendee["attendee_id"]
  if person not in G.nodes:
    continue
  G.nodes[person].update({
    "country": attendee["country"],
    "field": attendee["field"],
    "field_name": attendee["field_name"],
    "attend_as": attendee["attend_as"],
  })

nx.write_gexf(G, "whole_graph.gexf")

In [40]:
# Number of people (nodes) in the connection graph.
len(G)

2216

# Maslov–Sneppenを実行する

In [57]:
def extract_edges(G: nx.Graph) -> pd.DataFrame:
  """Flatten the edges of ``G`` into a table with both endpoints' attributes.

  Each edge ``(u, v)`` becomes one row holding ``source`` / ``target`` and the
  ``country``, ``field``, ``field_name`` and ``attend_as`` node attributes of
  ``u`` (``from_*`` columns) and ``v`` (``to_*`` columns). A node attribute that
  is not set is recorded as ``None``. The edge's own attributes are merged in
  last, so an edge attribute with the same name as one of these columns wins.

  Args:
    G: graph whose edges are exported.

  Returns:
    A DataFrame with one row per edge of ``G``.
  """
  rows = []
  for src, dst, edge_attrs in G.edges(data=True):
    src_attrs = G.nodes[src]
    dst_attrs = G.nodes[dst]
    rows.append({
      "source": src,
      "target": dst,
      "from_country": src_attrs.get("country"),
      "from_field": src_attrs.get("field"),
      "from_field_name": src_attrs.get("field_name"),
      "from_attend_as": src_attrs.get("attend_as"),
      "to_country": dst_attrs.get("country"),
      "to_field": dst_attrs.get("field"),
      "to_field_name": dst_attrs.get("field_name"),
      "to_attend_as": dst_attrs.get("attend_as"),
      # Edge attributes go last so they extend (or override) the columns above.
      **edge_attrs,
    })
  return pd.DataFrame(rows)

def add_reversed_edges(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` followed by a mirrored copy of every edge.

    In the mirrored rows ``source``/``target`` and each ``from_*``/``to_*``
    attribute pair are exchanged; all other columns are carried over unchanged.
    The result therefore lists every edge in both directions.

    Args:
        df: edge table with ``source``, ``target`` and the ``from_*`` / ``to_*``
            attribute columns.

    Returns:
        A new DataFrame with twice as many rows as ``df`` and a fresh 0..n-1 index.
    """
    paired_columns = [
        ("source", "target"),
        ("from_country", "to_country"),
        ("from_field", "to_field"),
        ("from_field_name", "to_field_name"),
        ("from_attend_as", "to_attend_as"),
    ]
    # Build a symmetric rename map so each column trades names with its partner.
    swap = {}
    for left, right in paired_columns:
        swap[left] = right
        swap[right] = left

    mirrored = df.rename(columns=swap)
    # concat aligns on column names, so the mirrored values land in the swapped columns.
    return pd.concat([df, mirrored], ignore_index=True)

In [123]:
# Maslov-Sneppen null model: repeatedly rewire the observed graph with degree-preserving
# double-edge swaps and count, for every randomized graph, how many edges join each
# pair of fields / countries. The mean and standard deviation over all samples are the
# baseline against which the observed counts can be compared.

# --- Parameters -------------------------------------------------------------------
N_SAMPLES = 10000         # number of randomized graphs
N_SWAPS = 75000           # successful double-edge swaps per randomized graph
MAX_SWAP_TRIES = 750000   # attempt budget per graph (networkx raises if it is exhausted)
BASE_SEED = 0             # sample i is rewired with seed BASE_SEED + i -> reproducible run
UNSELECTED = "Unselected" # label for people without a field


def _count_matrix(df_edges, from_col, to_col, labels):
  """Count edges per (from, to) label pair on a fixed label grid.

  pivot_table only returns the labels present in ``df_edges`` (rows with a missing
  label are dropped), so the result is reindexed onto ``labels`` to guarantee that
  every sample has the same shape and the same row/column order. Pairs that never
  occur get a count of 0.
  """
  return (
      df_edges
      .pivot_table(index=from_col, columns=to_col, values="source", aggfunc="count")
      .reindex(index=pd.Index(labels, name=from_col), columns=pd.Index(labels, name=to_col))
      .fillna(0)
      .astype(float)
  )


df_connections.loc[df_connections.to_field_name.isna(), "to_field_name"] = UNSELECTED
df_connections.loc[df_connections.from_field_name.isna(), "from_field_name"] = UNSELECTED

# Fixed label sets, taken from the node attributes (rewiring never changes them).
# Sorted so the order matches what pivot_table produces.
field_labels = sorted({
    attrs["field_name"] if pd.notna(attrs.get("field_name")) else UNSELECTED
    for _, attrs in G.nodes(data=True)
})
country_labels = sorted({
    attrs["country"]
    for _, attrs in G.nodes(data=True)
    if pd.notna(attrs.get("country"))
})

field_matrix_samples = []
country_matrix_samples = []
for i in tqdm(range(N_SAMPLES)):
  # Rewire a copy so the observed graph G stays untouched.
  G_shuffled = nx.double_edge_swap(G.copy(), nswap=N_SWAPS, max_tries=MAX_SWAP_TRIES, seed=BASE_SEED + i)
  # List every undirected edge in both directions so the count matrices are symmetric.
  df_edges = add_reversed_edges(extract_edges(G_shuffled))
  df_edges["to_field_name"] = df_edges["to_field_name"].fillna(UNSELECTED)
  df_edges["from_field_name"] = df_edges["from_field_name"].fillna(UNSELECTED)
  field_matrix = _count_matrix(df_edges, "from_field_name", "to_field_name", field_labels)
  country_matrix = _count_matrix(df_edges, "from_country", "to_country", country_labels)
  field_matrix_samples.append(field_matrix.to_numpy())
  country_matrix_samples.append(country_matrix.to_numpy())

# Shape: (N_SAMPLES, n_labels, n_labels)
field_matrix_samples = np.stack(field_matrix_samples)
country_matrix_samples = np.stack(country_matrix_samples)

# Save the raw samples
np.save("field_matrix_samples.npy", field_matrix_samples)
np.save("country_matrix_samples.npy", country_matrix_samples)

# Mean and standard deviation over the samples, labelled with the shared grid
field_matrix_mean = np.mean(field_matrix_samples, axis=0)
field_matrix_std = np.std(field_matrix_samples, axis=0)
field_matrix_mean = pd.DataFrame(field_matrix_mean, index=field_matrix.index, columns=field_matrix.columns)
field_matrix_std = pd.DataFrame(field_matrix_std, index=field_matrix.index, columns=field_matrix.columns)

country_matrix_mean = np.mean(country_matrix_samples, axis=0)
country_matrix_std = np.std(country_matrix_samples, axis=0)
country_matrix_mean = pd.DataFrame(country_matrix_mean, index=country_matrix.index, columns=country_matrix.columns)
country_matrix_std = pd.DataFrame(country_matrix_std, index=country_matrix.index, columns=country_matrix.columns)

100%|██████████| 10000/10000 [4:49:50<00:00,  1.74s/it]


In [125]:
# Download the saved .npy sample arrays (Google Colab only)
from google.colab import files

for npy_name in ('field_matrix_samples.npy', 'country_matrix_samples.npy'):
    files.download(npy_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [130]:
# Write the null-model summary tables to CSV, then download them.
# All files are written first and downloaded afterwards, in the same order.
summary_tables = {
    "sample_mean_field.csv": field_matrix_mean,
    "sample_std_field.csv": field_matrix_std,
    "sample_mean_country.csv": country_matrix_mean,
    "sample_std_country.csv": country_matrix_std,
}

for csv_name, table in summary_tables.items():
    table.to_csv(csv_name)

for csv_name in summary_tables:
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>