# Klasterizacija

In [88]:
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
import pandas as pd
import numpy as np
import random

In [89]:
dataOriginal = pd.read_csv("owid-covid-data.csv")
dataKlaster = data.fillna(0)

dataKlaster = dataKlaster[dataKlaster.total_cases != 0]    #ignorisemo sve drzave koje dosad nemaju zarazenih

In [90]:
table_columns = dataKlaster.columns
help_dictionary = {}

datumi = dataKlaster['date'].drop_duplicates()
datumi.sort_values(inplace=True)

# kreiramo pomocni rjecnik koji kao kljuc ima naziv atributa 
# a kao vrijednost listu prosjecnih vrijednosti navedenog atribuda za svaki dan.
# Ovom prilikom se ignorisu drzave koje dosad nisu imale zarazenih slucajeva
for col in table_columns:
    if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
        continue
    help_dictionary[col] = []
        
for datum in datumi:
    datum_podaci = dataKlaster[dataKlaster.date==datum]
    for col in table_columns:
        if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
            continue
        prosjek = datum_podaci[col].mean()
        help_dictionary[col].append(prosjek)

print(len(help_dictionary.keys()))

54


In [91]:
# u datoteku koreloracija.csv upisujemo korelacionu matricu

s = "KOLONE,"

for col in table_columns:
    if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
        continue
    s +="%s," % col 

for col in table_columns:
    if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
        continue
    s += "\n%s," % col
    x = np.array(help_dictionary[col]).reshape((-1, 1))
    for col2 in table_columns:
        if col2 in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
            continue
        if col == col2:
            s += ","
            continue
        y = np.array(help_dictionary[col2])

        model = LinearRegression().fit(x, y)
        s += '%.3f,' % model.coef_

with open('koreloracija.csv', 'w') as f:
    f.write(s)

In [92]:
# filtriramo sve ficere koji su u koreloraciji sa nekim od ficera
ignore_columns = ['iso_code', 'continent', 'location', 'date', 'tests_units']
d = len(table_columns)
    
for i in range(d-1):
    col = table_columns[i]
    if col in ignore_columns:
        continue
    y = np.array(help_dictionary[col])
    for j in range(i+1, d):
        col2 = table_columns[j]
        if col2 in ignore_columns:
            continue
        x = np.array(help_dictionary[col2]).reshape((-1, 1))
        model = LinearRegression().fit(x, y)
        if (model.coef_ > 0.7 and model.coef_ < 1.25) or (model.coef_ < -0.7 and model.coef_ > -1.25):
            ignore_columns.append(col2)


print("Kolone koje se ignorisu")
print("-"*50)
for f in ignore_columns:
    print(f)
print("\n" * 2)
print("Kolone koje se koriste prilikom klasterizacije:")
print("-"*50)
for col in table_columns:
    if col not in ignore_columns:
        print(col)

Kolone koje se ignorisu
--------------------------------------------------
iso_code
continent
location
date
tests_units
new_cases_smoothed
tests_per_case
gdp_per_capita
new_tests
new_tests_smoothed
new_deaths_smoothed
new_cases_per_million
new_cases_smoothed_per_million
icu_patients
weekly_icu_admissions_per_million
total_tests_per_thousand
stringency_index
population_density
new_deaths_smoothed_per_million
people_vaccinated_per_hundred
hospital_beds_per_thousand
median_age
aged_65_older
female_smokers
male_smokers
weekly_hosp_admissions_per_million
new_tests_smoothed_per_thousand
total_vaccinations_per_hundred
diabetes_prevalence
new_vaccinations_smoothed
cardiovasc_death_rate
handwashing_facilities
life_expectancy



Kolone koje se koriste prilikom klasterizacije:
--------------------------------------------------
total_cases
new_cases
total_deaths
new_deaths
total_cases_per_million
total_deaths_per_million
new_deaths_per_million
reproduction_rate
icu_patients_per_million
hosp_patien

In [93]:
drzave = dict()
table_columns = [col for col in table_columns if col not in ignore_columns]

#for d_ in data.rows:
#    print(d_)

lista_drzava = list(set(dataKlaster['iso_code']))
lista_drzava = [drzava for drzava in lista_drzava if not drzava.startswith("OWID_")]
#lista_drzava.remove('OWID_AFR')
#lista_drzava.remove('OWID_ASI')
#lista_drzava.remove('OWID_WRL')
#lista_drzava.remove('OWID_EUN')
#lista_drzava.remove('OWID_EUR')
#lista_drzava.remove('OWID_KOS')
#lista_drzava.remove('OWID_INT')
#lista_drzava.remove('OWID_NAM')
#lista_drzava.remove('OWID_OCE')
#lista_drzava.remove('OWID_SAM')

for drzava in lista_drzava:
    select_color = data.loc[dataKlaster['iso_code'] == drzava]
    drzave[drzava] = {}
    for col in table_columns:
        drzave[drzava][col] = select_color[col]


In [94]:
def agglomerative_klasterizacija(ListaPodataka, brojKlastera=4):
  agglomeratives = AgglomerativeClustering(n_clusters=brojKlastera).fit(ListaPodataka)
  return agglomeratives

In [95]:
def getKlasteri(listaDrzava, listaKlastera):
  klasteri = dict()
  for idx, drzava in enumerate(listaDrzava):
    klaster = listaKlastera[idx]
    if klaster in klasteri:
      klasteri[klaster].append(drzava)
    else:
      klasteri[klaster] = [drzava]
  return klasteri

In [96]:
def klasterizacijaPoAtributu(atribut, brojKlastera=0):
    if atribut not in table_columns:
        print("uneeni atribut se ne nalazi u skuou nekorelacionih atributa")
        return None
    podaci = []
    maksimalna_duzina = max([len(drzave[drzava][atribut]) for drzava in lista_drzava ])
    for drzava in lista_drzava:
        l = drzave[drzava][atribut].tolist()
        avg = sum(l)/len(l)
        l1 = [avg] * (maksimalna_duzina - len(l))
        l.extend(l1)
        podaci.append(l)
    
    if brojKlastera > 0:
        agglomerative = agglomerative_klasterizacija(podaci, brojKlastera)
        listaKlastera = agglomerative.labels_
        drzaveKlasteri = getKlasteri(lista_drzava, listaKlastera)
        return drzaveKlasteri

    for k in range(4, 7):
        agglomerative = agglomerative_klasterizacija(podaci, k)
        print()
        print("K =", k)
        listaKlastera = agglomerative.labels_
        km = getKlasteri(lista_drzava, listaKlastera)
        for klaster, lista in km.items():
          print(klaster, ":", lista)


In [97]:
klasterizacijaPoAtributu("new_deaths_per_million")


K = 4
0 : ['SDN', 'NER', 'MHL', 'FIN', 'GHA', 'MWI', 'DMA', 'KAZ', 'NOR', 'DZA', 'SAU', 'ERI', 'JAM', 'ZMB', 'MNG', 'PHL', 'TGO', 'CAF', 'BEN', 'IRN', 'BRB', 'TUR', 'MRT', 'DOM', 'GRD', 'SYR', 'TCD', 'LSO', 'BHR', 'KWT', 'JOR', 'MDV', 'NLD', 'GNB', 'MOZ', 'BLR', 'VAT', 'JPN', 'AFG', 'IRQ', 'CUB', 'QAT', 'BWA', 'CIV', 'YEM', 'AUS', 'PRY', 'FJI', 'COM', 'MLI', 'KNA', 'VNM', 'ECU', 'VUT', 'GNQ', 'PNG', 'TZA', 'URY', 'KEN', 'RUS', 'VCT', 'CAN', 'GIN', 'SEN', 'SOM', 'EST', 'IND', 'LBY', 'MAR', 'FSM', 'MUS', 'RWA', 'NZL', 'SLE', 'LBR', 'GMB', 'BDI', 'MMR', 'LAO', 'NGA', 'CMR', 'THA', 'STP', 'BGD', 'KHM', 'ETH', 'GTM', 'VEN', 'ISR', 'AZE', 'MDG', 'NAM', 'ALB', 'KOR', 'GAB', 'TJK', 'COD', 'ATG', 'DNK', 'SLV', 'BFA', 'UGA', 'BTN', 'EGY', 'ISL', 'CPV', 'NPL', 'PAK', 'SUR', 'TWN', 'UZB', 'UKR', 'ZWE', 'CHN', 'BRN', 'DJI', 'HTI', 'MYS', 'BHS', 'LKA', 'TLS', 'TTO', 'SLB', 'LCA', 'ARE', 'AGO', 'IDN', 'SYC', 'SSD', 'CYP', 'KGZ', 'CRI', 'PSE', 'WSM', 'OMN', 'SGP', 'COG', 'HND', 'NIC', 'GUY']
1 : ['LB

In [98]:
klasterizacijaPoAtributu("hosp_patients_per_million")


K = 4
3 : ['SDN', 'NER', 'MHL', 'FIN', 'GHA', 'LBN', 'MWI', 'DMA', 'KAZ', 'NOR', 'DZA', 'SAU', 'ERI', 'JAM', 'ZMB', 'MNG', 'CHE', 'PHL', 'TGO', 'CAF', 'BEN', 'IRN', 'BRB', 'TUR', 'LIE', 'MRT', 'DOM', 'GRD', 'DEU', 'SYR', 'TCD', 'LSO', 'BHR', 'KWT', 'JOR', 'MDV', 'MEX', 'GNB', 'MOZ', 'BLZ', 'BLR', 'VAT', 'MDA', 'JPN', 'AFG', 'IRQ', 'CUB', 'QAT', 'BWA', 'CIV', 'SRB', 'YEM', 'AUS', 'PRY', 'FJI', 'CHL', 'MCO', 'COM', 'MLI', 'TUN', 'KNA', 'VNM', 'ECU', 'VUT', 'MKD', 'GNQ', 'PNG', 'TZA', 'URY', 'KEN', 'GEO', 'RUS', 'VCT', 'SWZ', 'SMR', 'GIN', 'SEN', 'SOM', 'BOL', 'IND', 'LBY', 'MAR', 'FSM', 'MUS', 'MNE', 'RWA', 'NZL', 'SLE', 'LBR', 'GMB', 'ZAF', 'BDI', 'MMR', 'LAO', 'NGA', 'CMR', 'THA', 'STP', 'BGD', 'KHM', 'ETH', 'GTM', 'BIH', 'VEN', 'AZE', 'MLT', 'MDG', 'NAM', 'ALB', 'ARG', 'GRC', 'KOR', 'GAB', 'TJK', 'COD', 'PER', 'ATG', 'ARM', 'SLV', 'BFA', 'UGA', 'BTN', 'EGY', 'CPV', 'NPL', 'PAK', 'SUR', 'TWN', 'UZB', 'UKR', 'ZWE', 'CHN', 'BRN', 'DJI', 'HTI', 'MYS', 'BHS', 'LKA', 'ROU', 'TLS', 'TTO', '

In [99]:
klasterizacijaPoAtributu("new_tests_per_thousand")


K = 4
3 : ['SDN', 'NER', 'MHL', 'GHA', 'LBN', 'MWI', 'DMA', 'DZA', 'ERI', 'JAM', 'ZMB', 'PHL', 'TGO', 'CAF', 'BEN', 'IRN', 'BRB', 'LIE', 'MRT', 'DOM', 'ESP', 'GRD', 'DEU', 'SYR', 'TCD', 'LSO', 'MEX', 'NLD', 'GNB', 'MOZ', 'BLZ', 'BLR', 'VAT', 'MDA', 'JPN', 'AFG', 'IRQ', 'CUB', 'BWA', 'CIV', 'YEM', 'AUS', 'PRY', 'FJI', 'MCO', 'COM', 'MLI', 'TUN', 'KNA', 'VNM', 'ECU', 'VUT', 'MKD', 'GNQ', 'PNG', 'TZA', 'URY', 'KEN', 'GEO', 'VCT', 'SWZ', 'SMR', 'GIN', 'SEN', 'SOM', 'BOL', 'IND', 'LBY', 'MAR', 'FSM', 'MUS', 'MNE', 'RWA', 'NZL', 'SLE', 'LBR', 'GMB', 'ZAF', 'BDI', 'MMR', 'LAO', 'NGA', 'CMR', 'THA', 'STP', 'BGD', 'KHM', 'ETH', 'GTM', 'POL', 'BIH', 'VEN', 'BGR', 'AZE', 'MDG', 'NAM', 'ALB', 'ARG', 'KOR', 'GAB', 'TJK', 'COD', 'PER', 'ATG', 'ARM', 'SLV', 'BFA', 'UGA', 'EGY', 'CPV', 'NPL', 'PAK', 'SUR', 'TWN', 'UZB', 'UKR', 'ZWE', 'CHN', 'BRN', 'DJI', 'HTI', 'MYS', 'BHS', 'LKA', 'ROU', 'TLS', 'TTO', 'SLB', 'LCA', 'AGO', 'BRA', 'IDN', 'SYC', 'SSD', 'KGZ', 'CRI', 'AND', 'PSE', 'WSM', 'COL', 'OMN', '

In [100]:
klasterizacijaPoAtributu("population")


K = 4
0 : ['SDN', 'NER', 'GHA', 'DZA', 'SAU', 'PHL', 'IRN', 'TUR', 'ESP', 'DEU', 'MEX', 'MOZ', 'JPN', 'AFG', 'IRQ', 'CIV', 'YEM', 'AUS', 'VNM', 'TZA', 'KEN', 'CAN', 'MAR', 'ITA', 'ZAF', 'MMR', 'CMR', 'THA', 'ETH', 'POL', 'VEN', 'MDG', 'ARG', 'KOR', 'COD', 'GBR', 'PER', 'UGA', 'EGY', 'NPL', 'TWN', 'UZB', 'UKR', 'MYS', 'FRA', 'AGO', 'COL']
1 : ['MHL', 'FIN', 'LBN', 'MWI', 'DMA', 'KAZ', 'NOR', 'ERI', 'JAM', 'ZMB', 'MNG', 'CHE', 'TGO', 'CAF', 'BEN', 'BRB', 'LIE', 'MRT', 'DOM', 'GRD', 'SYR', 'TCD', 'LSO', 'BHR', 'KWT', 'JOR', 'LUX', 'MDV', 'SVK', 'NLD', 'GNB', 'BLZ', 'BLR', 'VAT', 'MDA', 'BEL', 'CUB', 'QAT', 'BWA', 'SRB', 'PRY', 'FJI', 'CHL', 'MCO', 'HRV', 'COM', 'MLI', 'TUN', 'KNA', 'ECU', 'VUT', 'MKD', 'CZE', 'GNQ', 'PNG', 'URY', 'GEO', 'VCT', 'SWZ', 'SMR', 'GIN', 'SEN', 'SOM', 'EST', 'BOL', 'LBY', 'FSM', 'MUS', 'MNE', 'AUT', 'RWA', 'NZL', 'SLE', 'LBR', 'GMB', 'BDI', 'LAO', 'STP', 'KHM', 'GTM', 'BIH', 'ISR', 'BGR', 'AZE', 'MLT', 'NAM', 'ALB', 'LTU', 'GRC', 'SWE', 'GAB', 'TJK', 'SVN', 'AT

# Korelacija atributa unutar jednog klastera i u razlicitim klasterima

In [101]:
dataKlasa = dataOriginal[dataOriginal.total_cases != 0]
dataKlasa.fillna(0, inplace=True)
table_columns = dataKlasa.columns

for drzava in lista_drzava:
    select_color = dataKlasa.loc[dataKlasa['iso_code'] == drzava]
    drzave[drzava] = {}
    for col in table_columns:
        if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
            continue
        drzave[drzava][col] = select_color[col]

In [120]:
def korelacijaAtributaUnutarIVanKlastera(klasteri, atribut1, atribut2):
    klasteriKeys = klasteri.keys()
    # na slucajan nacin biramo po jednu drzavu iz svakog klastera
    randomDrzave = {}
    brojKlastera = len(klasteriKeys)
    for k in klasteriKeys:
        velicinaKlastera = len(klasteri[k])
        if velicinaKlastera == 1:
            randomDrzave[k] = klasteri[k]
            continue
        d1 = random.choice(klasteri[k])
        d2 = random.choice(klasteri[k])
        while d1 == d2:
            d2 = random.choice(klasteri[k])
        
        randomDrzave[k] = [d1, d2]
    
    for k, ds in randomDrzave.items():
        print("K:", k)
        for d in ds:
            X = np.array(drzave[d][atribut1]).reshape((-1, 1))
            Y = np.array(drzave[d][atribut2])
            model = LinearRegression().fit(X, Y)
            print("%s: %.3f" %(d, model.coef_))


In [128]:
klasteri = klasterizacijaPoAtributu("new_tests_per_thousand", 4)
korelacijaAtributaUnutarIVanKlastera(klasteri, "new_tests_per_thousand", "new_cases_per_million")

K: 3
GNQ: 0.000
KEN: 24.591
K: 1
USA: 98.382
RUS: 32.904
K: 0
SVK: 5.527
CYP: 12.474
K: 2
AUT: 0.771


In [129]:
klasteri = klasterizacijaPoAtributu("population", 4)
korelacijaAtributaUnutarIVanKlastera(klasteri, "new_tests_per_thousand", "new_cases_per_million")

K: 0
MYS: 56.612
UZB: 0.000
K: 1
CZE: 146.047
JAM: 30.822
K: 2
IDN: 122.517
NGA: 43.775
K: 3
CHN: 0.000
IND: 47.043
