# Aufbereitung der Flüchtlingsdaten und Berechnung normierter Kennzahlen

Ziel dieses Notebooks ist es, aus den verfügbaren Flüchtlingszeitreihen eine vergleichbare Kennzahl für die Analyse von Flüchtlingsbewegungen zu konstruieren. Konkret wird der Flüchtlingsanteil pro 1.000 Einwohner berechnet, getrennt nach Herkunfts und Zielländern sowie für jedes Land und jedes Jahr.

Die ursprünglichen Zeitreihendaten enthalten ausschliesslich absolute Flüchtlingszahlen. Diese sind jedoch nur eingeschränkt vergleichbar, da Länder sich stark in ihrer Bevölkerungsgrösse unterscheiden. Um belastbare Vergleiche zwischen Ländern sowie über die Zeit hinweg zu ermöglichen, werden die absoluten Flüchtlingszahlen daher mit den jeweiligen Bevölkerungszahlen ins Verhältnis gesetzt.

Hierfür werden zusätzlich Bevölkerungsdaten der Weltbank verwendet. Da die Zeitreihendaten und die Bevölkerungsdaten unterschiedliche Namenskonventionen für Länder verwenden, erfolgt zunächst ein systematisches Mapping der Ländernamen zwischen beiden Datensätzen. Dieses Mapping stellt sicher, dass die Daten korrekt zusammengeführt werden können.

Anschliessend werden die Flüchtlingszeitreihen getrennt nach Herkunfts und Zielländern aggregiert. Für jedes Land und jedes Jahr werden die relevanten Flüchtlingszahlen zusammengefasst. In einem nächsten Schritt werden diese aggregierten Flüchtlingszahlen mit den entsprechenden Bevölkerungsdaten verknüpft, um den Flüchtlingsanteil pro 1.000 Einwohner zu berechnen.

Dieses Notebook dient somit der sauberen Datenaufbereitung und der Konstruktion der zentralen Kennzahl, auf der weitere Analysen aufbauen.

In [91]:
import pandas as pd

# extrahiert alle Ländernamen aus Time_Series und API_SP, dem Bevölkerungsdatensatz 
def get_country_names(ts_path, pop_path):


    ts = pd.read_csv(ts_path)
    pop = pd.read_csv(pop_path, skiprows=3)
    
    origins = ts["Origin"].dropna().tolist()
    destinations = ts["Country / territory of asylum/residence"].dropna().tolist()
    
    ts_countries = sorted(set(origins) | set(destinations))
    pop_countries = sorted(set(pop["Country Name"].dropna().tolist()))
    
    return ts_countries, pop_countries


ts_countries, pop_countries = get_country_names("archive/time_series.csv", "additional_data/API_SP.POP.TOTL_DS2_en_csv_v2_280659.csv")

print("Time-series countries:", len(ts_countries))
print(ts_countries)
print()
print("Population countries:", len(pop_countries))
print(pop_countries)

Time-series countries: 226
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia (Plurinational State of)', 'Bonaire', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands', 'Central African Rep.', 'Chad', 'Chile', 'China', 'China, Hong Kong SAR', 'China, Macao SAR', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Rep.', "Côte d'Ivoire", "Dem. People's Rep. of Korea", 'Dem. Rep. of the Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Rep.', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji'

  ts = pd.read_csv(ts_path)


In [92]:
def build_country_mapping(ts_countries, pop_countries):
    mapping = {}

    pop_set = set(pop_countries)

    # direkte matches
    for c in ts_countries:
        if c in pop_set:
            mapping[c] = c
        else:
            mapping[c] = None  

    # manuelle matches
    manual = {
        "Bolivia (Plurinational State of)": "Bolivia",
        "Central African Rep.": "Central African Republic",
        "Congo": "Congo, Rep.",
        "Dem. Rep. of the Congo": "Congo, Dem. Rep.",
        "Côte d'Ivoire": "Cote d'Ivoire",
        "Czech Rep.": "Czechia",
        "China, Hong Kong SAR": "Hong Kong SAR, China",
        "China, Macao SAR": "Macao SAR, China",
        "Dominican Rep.": "Dominican Republic",
        "Egypt": "Egypt, Arab Rep.",
        "Gambia": "Gambia, The",
        "Holy See (the)": None,
        "Iran (Islamic Rep. of)": "Iran, Islamic Rep.",
        "Korea, Dem. People's Rep.": None,
        "Dem. People's Rep. of Korea": "Korea, Dem. People's Rep.",
        "Rep. of Korea": "Korea, Rep.",
        "State of Palestine": "West Bank and Gaza",
        "Serbia and Kosovo (S/RES/1244 (1999))": "Serbia",
        "Syrian Arab Rep.": "Syrian Arab Republic",
        "United Rep. of Tanzania": "Tanzania",
        "United States of America": "United States",
        "Venezuela (Bolivarian Republic of)": "Venezuela, RB",
        "Viet Nam": "Viet Nam",
        "The former Yugoslav Republic of Macedonia": "North Macedonia",
        "Swaziland": "Eswatini",
        "Sao Tome and Principe": "Sao Tome and Principe",
        "Timor-Leste": "Timor-Leste",
        "Turkey": "Turkiye",
        "Lao People's Dem. Rep.": "Lao PDR",
        "Micronesia (Federated States of)": "Micronesia, Fed. Sts.",
        "Curacao": "Curacao",
        "Curaçao": "Curacao",
        "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)",
        "Saint Kitts and Nevis": "St. Kitts and Nevis",
        "Saint Lucia": "St. Lucia",
        "Somalia":"Somalia, Fed. Rep.",
        "Saint Vincent and the Grenadines": "St. Vincent and the Grenadines",

        # Länder ohne passende Bevölkerungsdaten
        "Svalbard and Jan Mayen": None,
        "Various/Unknown": None,
        "Stateless": None,
        "Tibetan": None,
        "Western Sahara": None,
    }

   # Manuelle Zuordnung anwenden
    for ts_name, pop_name in manual.items():
        if ts_name in mapping:
            mapping[ts_name] = pop_name

    return mapping

In [93]:
mapping = build_country_mapping(ts_countries, pop_countries)
matched = {k: v for k, v in mapping.items() if v is not None}
print("matched",len(matched))
for k, v in matched.items():
    print(k, v)



missing = {k: v for k, v in mapping.items() if v is None}
print("Unmatched:", len(missing))
missing

matched 202
Afghanistan Afghanistan
Albania Albania
Algeria Algeria
American Samoa American Samoa
Andorra Andorra
Angola Angola
Antigua and Barbuda Antigua and Barbuda
Argentina Argentina
Armenia Armenia
Aruba Aruba
Australia Australia
Austria Austria
Azerbaijan Azerbaijan
Bahrain Bahrain
Bangladesh Bangladesh
Barbados Barbados
Belarus Belarus
Belgium Belgium
Belize Belize
Benin Benin
Bermuda Bermuda
Bhutan Bhutan
Bolivia (Plurinational State of) Bolivia
Bosnia and Herzegovina Bosnia and Herzegovina
Botswana Botswana
Brazil Brazil
British Virgin Islands British Virgin Islands
Brunei Darussalam Brunei Darussalam
Bulgaria Bulgaria
Burkina Faso Burkina Faso
Burundi Burundi
Cabo Verde Cabo Verde
Cambodia Cambodia
Cameroon Cameroon
Canada Canada
Cayman Islands Cayman Islands
Central African Rep. Central African Republic
Chad Chad
Chile Chile
China China
China, Hong Kong SAR Hong Kong SAR, China
China, Macao SAR Macao SAR, China
Colombia Colombia
Comoros Comoros
Congo Congo, Rep.
Costa Rica 

{'Anguilla': None,
 'Bahamas': None,
 'Bonaire': None,
 'Cook Islands': None,
 'French Guiana': None,
 'Guadeloupe': None,
 'Holy See (the)': None,
 'Kyrgyzstan': None,
 'Martinique': None,
 'Montserrat': None,
 'Niue': None,
 'Norfolk Island': None,
 'Palestinian': None,
 'Puerto Rico': None,
 'Rep. of Moldova': None,
 'Saint-Pierre-et-Miquelon': None,
 'Slovakia': None,
 'Stateless': None,
 'Svalbard and Jan Mayen': None,
 'Tibetan': None,
 'Various/Unknown': None,
 'Wallis and Futuna Islands ': None,
 'Western Sahara': None,
 'Yemen': None}

In [94]:

# zählt die Flüchtllingszahlen für jedes Land getrennt nach Herkunft und Destination Land für jedes jahr zusammen und speichert das Ergebnis
def process_refugee_flows(ts_path):
   

    ts = pd.read_csv(ts_path)

    # Relevante Populationstypen (haben uns auf Refugees und Asylum-seekers beschränkt)
    types_to_keep = [
        "Refugees (incl. refugee-like situations)",
        "Asylum-seekers"
    ]

    ts = ts[ts["Population type"].isin(types_to_keep)].copy()

    # Werte in numerisches Format überführen
    ts["Value"] = pd.to_numeric(ts["Value"], errors="coerce")
    ts = ts.dropna(subset=["Value"])

  
    # Zielländer (Destination)
    dest_flow = (
        ts.groupby(["Country / territory of asylum/residence", "Year"], as_index=False)["Value"]
        .sum()
        .rename(columns={"Country / territory of asylum/residence": "Destination"})
    )

    dest_flow.to_csv("output_csv_files/Destination_country_refugee_flow.csv", index=False)

  
    # Herkunftsländer (Origin)
    origin_flow = (
        ts.groupby(["Origin", "Year"], as_index=False)["Value"]
        .sum()
    )

    origin_flow.to_csv("output_csv_files/Origin_country_refugee_flow.csv", index=False)

    return dest_flow, origin_flow

In [95]:
process_refugee_flows("archive/time_series.csv")

  ts = pd.read_csv(ts_path)


(      Destination  Year    Value
 0     Afghanistan  1990     50.0
 1     Afghanistan  1991     38.0
 2     Afghanistan  1992  60025.0
 3     Afghanistan  1993  32132.0
 4     Afghanistan  1994  19131.0
 ...           ...   ...      ...
 6192     Zimbabwe  2012   4792.0
 6193     Zimbabwe  2013   6869.0
 6194     Zimbabwe  2014   6720.0
 6195     Zimbabwe  2015   7209.0
 6196     Zimbabwe  2016   8366.0
 
 [6197 rows x 3 columns],
            Origin  Year      Value
 0     Afghanistan  1979   500000.0
 1     Afghanistan  1980  1734921.0
 2     Afghanistan  1981  3879984.0
 3     Afghanistan  1982  4488214.0
 4     Afghanistan  1983  4712735.0
 ...           ...   ...        ...
 6291     Zimbabwe  2012    63831.0
 6292     Zimbabwe  2013    61572.0
 6293     Zimbabwe  2014    64926.0
 6294     Zimbabwe  2015    78735.0
 6295     Zimbabwe  2016    61291.0
 
 [6296 rows x 3 columns])

In [96]:

#Berechnet den Flüchtlingsanteil pro 1.000 Einwohner für Herkunfts und Zielländer.
    
def compute_refugees_per_population(
    dest_flow_path,
    origin_flow_path,
    pop_path,
    mapping
):
    # Daten laden
    dest = pd.read_csv(dest_flow_path)
    origin = pd.read_csv(origin_flow_path)
    pop = pd.read_csv(pop_path, skiprows=3)

     # Bevölkerungsdaten in Long-Format bringen
    pop_long = pop.melt(
        id_vars=["Country Name", "Country Code"],
        var_name="Year",
        value_name="Population"
    )

    pop_long["Population"] = pd.to_numeric(pop_long["Population"], errors="coerce")
    pop_long["Year"] = pd.to_numeric(pop_long["Year"], errors="coerce")

   # Hilfsfunktion zur Anwendung des Länder Mappings
    def map_country(name):
        return mapping.get(name, None)


    # Zielländer
    dest["Mapped_Name"] = dest["Destination"].apply(map_country)
    dest = dest.dropna(subset=["Mapped_Name"])

    dest_merged = dest.merge(
        pop_long,
        left_on=["Mapped_Name", "Year"],
        right_on=["Country Name", "Year"],
        how="left"
    )

    dest_merged["refugee_share"] = dest_merged["Value"] / dest_merged["Population"]
    dest_merged["share_per_1000"] = dest_merged["refugee_share"] * 1000

    dest_final = dest_merged[[
        "Destination",
        "Year",
        "Value",
        "Country Code",
        "Population",
        "refugee_share",
        "share_per_1000"
    ]].rename(columns={
        "Destination": "country",
        "Value": "Refugee_amount",
        "Country Code": "country_code"
    })

    dest_final.to_csv("output_csv_files/Destination_refugees_per_capita.csv", index=False)


   
    # Herkunftsländer
    origin["Mapped_Name"] = origin["Origin"].apply(map_country)
    origin = origin.dropna(subset=["Mapped_Name"])

    origin_merged = origin.merge(
        pop_long,
        left_on=["Mapped_Name", "Year"],
        right_on=["Country Name", "Year"],
        how="left"
    )

    origin_merged["refugee_share"] = origin_merged["Value"] / origin_merged["Population"]
    origin_merged["share_per_1000"] = origin_merged["refugee_share"] * 1000

    origin_final = origin_merged[[
        "Origin",
        "Year",
        "Value",
        "Country Code",
        "Population",
        "refugee_share",
        "share_per_1000"
    ]].rename(columns={
        "Origin": "country",
        "Value": "Refugee_amount",
        "Country Code": "country_code"
    })

    origin_final.to_csv("output_csv_files/Origin_refugees_per_capita.csv", index=False)

    return dest_final, origin_final

In [97]:
dest_capita, orig_capita = compute_refugees_per_population(
    "output_csv_files/Destination_country_refugee_flow.csv",
    "output_csv_files/Origin_country_refugee_flow.csv",
    "additional_data/API_SP.POP.TOTL_DS2_en_csv_v2_280659.csv",
    matched
)