In [5]:
import gensim
import pandas as pd
import pathlib

Liste der Länder, die in allen Modellzeiträumen mindestens fünfmal vorkommen

In [2]:
# Country names are read from the "Country" column of Country_Min5Freq.csv.
country_list = pd.read_csv("Country_Min5Freq.csv")["Country"].tolist()

## Output der Ländervektoren

Aus den fertig gebildeten Wortvektormodellen werden die Vektoren der einzelnen Ländernamen geladen. Für das Originalmodell und für jede Bootstrapping-Variante jedes Zeitraums wird eine eigene CSV-Datei ausgegeben, die alle relevanten, L2-normalisierten Ländervektoren dieses einzelnen Modells enthält.

In [None]:
# Export the L2-normalised country vectors of every model: for each time span
# the model trained on the original (non-bootstrapped) data set plus its 25
# bootstrap variants. One CSV per model is written to Country_Vectors/.
# Model paths are assembled with pathlib so the separators are correct on any
# operating system (the former raw strings hard-coded Windows backslashes).
# NOTE(review): the network-data cell loads the bootstrap models from below a
# "Word2vec" parent folder -- confirm which working directory is intended.
vector_model_root = pathlib.Path("Bootstrapping_Chronological")
directory_path = pathlib.Path("Country_Vectors")
directory_path.mkdir(parents=True, exist_ok=True)

for year in range(1996, 2014):
    end_year = year + 3
    model_stem = "word2vec_300dims_iter15_window15_skipgram_timespan{}-{}".format(year, end_year)
    # Pairs of (model file suffix, tag used in the CSV file name): first the
    # model built on the original data, then the bootstrap variants.
    model_variants = [("fixedModel", "fixedModel")]
    model_variants.extend(
        ("bootstrap_iteration{}".format(iteration), "bootstrappingIteration{}".format(iteration))
        for iteration in range(0, 25)
    )
    for model_suffix, csv_tag in model_variants:
        model_path = vector_model_root / str(end_year) / "{}_{}".format(model_stem, model_suffix)
        model = gensim.models.Word2Vec.load(str(model_path))
        # L2-normalise the vectors; replace=True overwrites the stored vectors.
        # NOTE(review): init_sims is deprecated in gensim 4.x -- confirm the
        # installed gensim version before upgrading.
        model.init_sims(replace=True)
        country_vector_dict = {country: model.wv[country] for country in country_list}
        vector_df = pd.DataFrame.from_dict(country_vector_dict, orient="index")
        # Name the columns after the actual vector width instead of assuming 300.
        vector_df.columns = ["Vectordimension_{}".format(dim) for dim in range(vector_df.shape[1])]
        vector_df.index = vector_df.index.set_names(['Country'])
        filename = pathlib.Path("year{}-{}_{}_CountryVectors.csv".format(year, end_year, csv_tag))
        vector_df.to_csv(directory_path / filename)

In [34]:
# Preview the first rows of vector_df, i.e. the table built last by the export loop.
vector_df.head()

Unnamed: 0_level_0,Vectordimension_0,Vectordimension_1,Vectordimension_2,Vectordimension_3,Vectordimension_4,Vectordimension_5,Vectordimension_6,Vectordimension_7,Vectordimension_8,Vectordimension_9,...,Vectordimension_290,Vectordimension_291,Vectordimension_292,Vectordimension_293,Vectordimension_294,Vectordimension_295,Vectordimension_296,Vectordimension_297,Vectordimension_298,Vectordimension_299
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
afghanistan,0.005587,0.025932,0.094816,-0.007751,-0.068533,-0.157167,0.068257,0.076581,0.032549,0.024839,...,0.044878,-0.0451,0.011436,0.040636,0.026354,-0.001247,0.017776,-0.04619,-0.045284,-0.027932
ägypten,-0.037216,0.022236,0.06431,-0.022021,-0.020393,-0.013889,0.075537,0.043095,0.103625,-0.046756,...,0.016334,-0.07701,-0.103417,0.020158,0.040248,-0.107568,0.011474,-0.038543,-0.055785,0.051972
albanien,-0.069117,0.030966,0.093645,0.031854,0.058937,-0.050095,0.097183,-0.031661,0.014058,-0.003238,...,0.001651,0.014554,-0.055281,0.022084,0.0207,-0.101589,-0.035781,-0.071051,0.133743,-0.001339
algerien,0.022252,-0.043974,0.010764,0.029643,0.147472,-0.040997,-0.029762,-0.046748,0.072859,-0.027032,...,0.064526,0.04413,-0.060131,0.0074,0.037834,0.026414,0.040653,-0.069472,0.039739,0.023846
angola,-0.013738,-0.064901,0.05974,-0.081026,0.068955,-0.070978,0.070477,-0.001483,0.071147,-0.131658,...,0.00264,-0.052148,0.074166,0.037155,0.023224,0.044244,-0.016141,0.011847,-0.000193,-0.025665


## Output der Netzwerkdaten

Erstellen der Netzwerkdaten, die als Grundlage für die Clusteranalyse dienen. Für jede Bootstrapping-Variante jedes Zeitraums wird eine CSV-Datei geschrieben; jede Zeile enthält ein Länderpaar und die Kosinus-Ähnlichkeit der beiden Wortvektoren.

In [None]:
# Build the network data: for every bootstrap model of every time span, write
# one CSV to Country_Vectors_Similarity/ holding each unordered country pair
# together with the similarity of the two country vectors.
# The model path is assembled with pathlib so the separators are correct on any
# operating system (the former raw string hard-coded Windows backslashes).
similarity_model_root = pathlib.Path("Word2vec") / "Bootstrapping_Chronological"
directory_path = pathlib.Path("Country_Vectors_Similarity")
directory_path.mkdir(parents=True, exist_ok=True)

for year in range(1996, 2014):
    end_year = year + 3
    for bootstrap_iteration in range(0, 25):
        model_name = "word2vec_300dims_iter15_window15_skipgram_timespan{}-{}_bootstrap_iteration{}".format(
            year, end_year, bootstrap_iteration
        )
        model = gensim.models.Word2Vec.load(str(similarity_model_root / str(end_year) / model_name))
        country_similarity_dict = {"Country_One": [], "Country_Two": [], "Similarity": []}
        # Pair every country only with the countries listed after it, so each
        # unordered pair appears exactly once.
        for index, country in enumerate(country_list[:-1]):
            for compare_country in country_list[index + 1:]:
                # Guards against self-pairs if country_list contains duplicates.
                if country == compare_country:
                    continue
                similarity = model.wv.similarity(country, compare_country)
                country_similarity_dict["Country_One"].append(country)
                country_similarity_dict["Country_Two"].append(compare_country)
                country_similarity_dict["Similarity"].append(similarity)
        similarity_df = pd.DataFrame.from_dict(country_similarity_dict, orient="columns")
        filename = pathlib.Path("year{}-{}_bootstrappingIteration{}_CountryVectors_Similarity.csv".format(year, end_year, bootstrap_iteration))
        similarity_df.to_csv(directory_path / filename, index=False)

            

In [33]:
# Preview the first rows of the pair/similarity table held in country_similarity_dict.
pd.DataFrame(country_similarity_dict).head()

Unnamed: 0,Country_One,Country_Two,Similarity
0,afghanistan,ägypten,0.378278
1,afghanistan,albanien,0.314634
2,afghanistan,algerien,0.205009
3,afghanistan,angola,0.34661
4,afghanistan,argentinien,0.16727


## Output der Cluster

In [14]:
import networkx as nx
import community

In [None]:
# Detect country communities in every similarity network and write one CSV per
# network to Country_Vectors_Cluster/; each column lists the members of one
# community.
cluster_df = pd.DataFrame()  # stays empty when no similarity file is found

# Only `pathlib` is imported in this notebook, so the class must be referenced
# as pathlib.Path (a bare `Path` raises NameError on a fresh kernel).
similarity_directory = pathlib.Path("Country_Vectors_Similarity")
directory_path = pathlib.Path("Country_Vectors_Cluster")
directory_path.mkdir(parents=True, exist_ok=True)

# Sorted so the processing order, and hence the cluster_df left over for
# inspection afterwards, does not depend on the file system's listing order.
for file in sorted(similarity_directory.glob("*.csv")):
    # File names look like "year<start>-<end>_bootstrappingIteration<k>_...csv".
    name_parts = file.name.split("_")
    year = int(name_parts[0].split("-")[0].replace("year", ""))
    bootstrap_iteration = int(name_parts[1].replace("bootstrappingIteration", ""))
    similarity_df = pd.read_csv(file)
    # Undirected graph: the first two columns are the endpoints of an edge and
    # the third column is its weight.
    G = nx.Graph()
    for row in similarity_df.itertuples(index=False):
        G.add_edge(row[0], row[1], weight=row[2])
    # Fixed random_state keeps the partition reproducible between runs.
    # NOTE(review): cosine similarities can be negative; confirm that the
    # community detection handles negative edge weights as intended.
    community_dict = community.best_partition(G, weight='weight', random_state=0)
    # Invert the country -> community mapping into community -> [countries].
    cluster_dict = {}
    for country, cluster in community_dict.items():
        cluster_dict.setdefault(cluster, []).append(country)
    # Transposing turns every community into a column; shorter communities are
    # padded with missing values.
    cluster_df = pd.DataFrame.from_dict(cluster_dict, orient="index").transpose()
    cluster_df.columns = ["Cluster_{}".format(cluster) for cluster in cluster_df.columns]
    filename = pathlib.Path("year{}-{}_bootstrappingIteration{}_CountryVectors_Cluster.csv".format(year, year + 3, bootstrap_iteration))
    cluster_df.to_csv(directory_path / filename, index=False)

In [59]:
# Preview the first rows of cluster_df, i.e. the cluster table built last by the loop.
cluster_df.head()

Unnamed: 0,Cluster_0,Cluster_1,Cluster_2
0,afghanistan,albanien,argentinien
1,ägypten,armenien,australien
2,algerien,aserbaidschan,belgien
3,angola,belarus,brasilien
4,äthiopien,bosnien_und_herzegowina,bulgarien
