In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans, AgglomerativeClustering
import pandas as pd
import numpy as np

In [67]:
data = pd.read_csv("owid-covid-data.csv")
data.fillna(0, inplace=True)
data = data[data.total_cases != 0]    #ignorisemo sve drzave koje dosad nemaju zarazenih

In [68]:
table_columns = data.columns
help_dictionary = {}

datumi = data['date'].drop_duplicates()
datumi.sort_values(inplace=True)

# kreiramo pomocni rjecnik koji kao kljuc ima naziv atributa 
# a kao vrijednost listu prosjecnih vrijednosti navedenog atribuda za svaki dan.
# Ovom prilikom se ignorisu drzave koje dosad nisu imale zarazenih slucajeva
for col in table_columns:
    if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
        continue
    help_dictionary[col] = []
        
for datum in datumi:
    datum_podaci = data[data.date==datum]
    for col in table_columns:
        if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
            continue
        prosjek = datum_podaci[col].mean()
        help_dictionary[col].append(prosjek)

print(len(help_dictionary.keys()))

54


In [46]:
# u datoteku koreloracija.csv upisujemo korelacionu matricu

s = "KOLONE,"

for col in table_columns:
    if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
        continue
    s +="%s," % col 

for col in table_columns:
    if col in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
        continue
    s += "\n%s," % col
    x = np.array(help_dictionary[col]).reshape((-1, 1))
    for col2 in table_columns:
        if col2 in ['iso_code', 'continent', 'location', 'date', 'tests_units']:
            continue
        if col == col2:
            s += ","
            continue
        y = np.array(help_dictionary[col2])

        model = LinearRegression().fit(x, y)
        s += '%.3f,' % model.coef_

with open('koreloracija.csv', 'w') as f:
    f.write(s)

In [69]:
# filtriramo sve ficere koji su u koreloraciji sa nekim od ficera
ignore_columns = ['iso_code', 'continent', 'location', 'date', 'tests_units']
d = len(table_columns)
    
for i in range(d-1):
    col = table_columns[i]
    if col in ignore_columns:
        continue
    y = np.array(help_dictionary[col])
    for j in range(i+1, d):
        col2 = table_columns[j]
        if col2 in ignore_columns:
            continue
        x = np.array(help_dictionary[col2]).reshape((-1, 1))
        model = LinearRegression().fit(x, y)
        if (model.coef_ > 0.7 and model.coef_ < 1.25) or (model.coef_ < -0.7 and model.coef_ > -1.25):
            ignore_columns.append(col2)


print("Kolone koje se ignorisu")
print("-"*50)
for f in ignore_columns:
    print(f)
print("\n" * 2)
print("Kolone koje se koriste prilikom klasterizacije:")
print("-"*50)
for col in table_columns:
    if col not in ignore_columns:
        print(col)

Kolone koje se ignorisu
--------------------------------------------------
iso_code
continent
location
date
tests_units
new_cases_smoothed
tests_per_case
gdp_per_capita
new_tests
new_tests_smoothed
new_deaths_smoothed
new_cases_per_million
new_cases_smoothed_per_million
icu_patients
weekly_icu_admissions_per_million
total_tests_per_thousand
stringency_index
population_density
new_deaths_smoothed_per_million
people_vaccinated_per_hundred
hospital_beds_per_thousand
median_age
aged_65_older
female_smokers
male_smokers
weekly_hosp_admissions_per_million
new_tests_smoothed_per_thousand
total_vaccinations_per_hundred
diabetes_prevalence
new_vaccinations_smoothed
cardiovasc_death_rate
handwashing_facilities
life_expectancy



Kolone koje se koriste prilikom klasterizacije:
--------------------------------------------------
total_cases
new_cases
total_deaths
new_deaths
total_cases_per_million
total_deaths_per_million
new_deaths_per_million
reproduction_rate
icu_patients_per_million
hosp_patien

In [70]:
drzave = dict()
table_columns = [col for col in table_columns if col not in ignore_columns]

#for d_ in data.rows:
#    print(d_)

lista_drzava = list(set(data['iso_code']))
lista_drzava = [drzava for drzava in lista_drzava if not drzava.startswith("OWID_")]
#lista_drzava.remove('OWID_AFR')
#lista_drzava.remove('OWID_ASI')
#lista_drzava.remove('OWID_WRL')
#lista_drzava.remove('OWID_EUN')
#lista_drzava.remove('OWID_EUR')
#lista_drzava.remove('OWID_KOS')
#lista_drzava.remove('OWID_INT')
#lista_drzava.remove('OWID_NAM')
#lista_drzava.remove('OWID_OCE')
#lista_drzava.remove('OWID_SAM')

for drzava in lista_drzava:
    select_color = data.loc[data['iso_code'] == drzava]
    drzave[drzava] = {}
    for col in table_columns:
        drzave[drzava][col] = select_color[col]
    
print(drzave['BIH'])


{'total_cases': 8203         2.0
8204         2.0
8205         3.0
8206         3.0
8207         3.0
          ...   
8545    124696.0
8546    125123.0
8547    125402.0
8548    125402.0
8549    125402.0
Name: total_cases, Length: 347, dtype: float64, 'new_cases': 8203      2.0
8204      0.0
8205      1.0
8206      0.0
8207      0.0
        ...  
8545    253.0
8546    427.0
8547    279.0
8548      0.0
8549      0.0
Name: new_cases, Length: 347, dtype: float64, 'total_deaths': 8203       0.0
8204       0.0
8205       0.0
8206       0.0
8207       0.0
         ...  
8545    4853.0
8546    4868.0
8547    4881.0
8548    4881.0
8549    4881.0
Name: total_deaths, Length: 347, dtype: float64, 'new_deaths': 8203     0.0
8204     0.0
8205     0.0
8206     0.0
8207     0.0
        ... 
8545    19.0
8546    15.0
8547    13.0
8548     0.0
8549     0.0
Name: new_deaths, Length: 347, dtype: float64, 'total_cases_per_million': 8203        0.610
8204        0.610
8205        0.914
8206        0.914
820

In [71]:
def agglomerative_klasterizacija(ListaPodataka):
  d = {}
  # broj klastera ok 4 do 6
  for k in range(4, 7):
    agglomeratives = AgglomerativeClustering(n_clusters=k).fit(ListaPodataka)
    d[k] = agglomeratives
  return d

In [72]:
def getKlasteri(listaDrzava, listaKlastera):
  klasteri = dict()
  for idx, drzava in enumerate(listaDrzava):
    klaster = listaKlastera[idx]
    if klaster in klasteri:
      klasteri[klaster].append(drzava)
    else:
      klasteri[klaster] = [drzava]
  return klasteri

In [73]:
def klasterizacijaPoAtributu(atribut):
    if atribut not in table_columns:
        print("uneeni atribut se ne nalazi u skuou nekorelacionih atributa")
        return None
    podaci = []
    maksimalna_duzina = max([len(drzave[drzava][atribut]) for drzava in lista_drzava ])
    for drzava in lista_drzava:
        l = drzave[drzava][atribut].tolist()
        avg = sum(l)/len(l)
        l1 = [avg] * (maksimalna_duzina - len(l))
        l.extend(l1)
        podaci.append(l)


    agglomerative = agglomerative_klasterizacija(podaci)
    for k, km in agglomerative.items():
        print()
        print("K =", k)
        listaKlastera = km.labels_

        km = getKlasteri(lista_drzava, listaKlastera)
        for klaster, lista in km.items():
          print(klaster, ":", lista)


In [74]:
klasterizacijaPoAtributu("new_deaths_per_million")


K = 4
0 : ['CIV', 'EST', 'ZMB', 'SLB', 'KWT', 'SAU', 'MUS', 'STP', 'LSO', 'JAM', 'BHS', 'MLI', 'KAZ', 'SYR', 'NER', 'GHA', 'GMB', 'BDI', 'MWI', 'SOM', 'EGY', 'GNQ', 'MDG', 'GRD', 'MOZ', 'TZA', 'SLE', 'CUB', 'BFA', 'GIN', 'LCA', 'MRT', 'PNG', 'NAM', 'ISR', 'NIC', 'UGA', 'UKR', 'ERI', 'FIN', 'FSM', 'TUR', 'DZA', 'NLD', 'TGO', 'PSE', 'BHR', 'CYP', 'PRY', 'BTN', 'TJK', 'NPL', 'HND', 'JPN', 'LBY', 'SEN', 'SGP', 'KOR', 'BEN', 'MDV', 'KGZ', 'AFG', 'BGD', 'COG', 'CAF', 'ISL', 'RWA', 'MYS', 'URY', 'IND', 'CHN', 'CRI', 'YEM', 'GUY', 'AZE', 'MHL', 'LKA', 'VAT', 'TWN', 'COD', 'ARE', 'AUS', 'BRB', 'THA', 'BRN', 'IRN', 'GTM', 'ATG', 'VUT', 'DOM', 'OMN', 'ECU', 'IRQ', 'QAT', 'ALB', 'PHL', 'SLV', 'DJI', 'VEN', 'DNK', 'ZWE', 'RUS', 'PAK', 'KHM', 'AGO', 'TCD', 'CMR', 'BLR', 'NZL', 'VNM', 'NGA', 'BWA', 'WSM', 'MMR', 'TTO', 'NOR', 'DMA', 'LBR', 'HTI', 'LAO', 'KNA', 'SUR', 'JOR', 'VCT', 'SDN', 'MNG', 'CAN', 'SSD', 'MAR', 'GAB', 'KEN', 'UZB', 'CPV', 'IDN', 'FJI', 'SYC', 'TLS', 'ETH', 'GNB', 'COM']
1 : ['ME

In [75]:
klasterizacijaPoAtributu("hosp_patients_per_million")


K = 4
3 : ['CIV', 'MEX', 'ARG', 'ZMB', 'SLB', 'KWT', 'SAU', 'MUS', 'MKD', 'STP', 'LSO', 'JAM', 'BHS', 'LIE', 'MLI', 'KAZ', 'SYR', 'SWZ', 'NER', 'SRB', 'GHA', 'GMB', 'BDI', 'MWI', 'SOM', 'EGY', 'GNQ', 'MDG', 'GRD', 'MOZ', 'ROU', 'TZA', 'ZAF', 'BRA', 'SLE', 'CUB', 'BFA', 'GIN', 'LCA', 'MRT', 'PNG', 'MNE', 'NAM', 'ARM', 'BIH', 'NIC', 'UGA', 'UKR', 'ERI', 'FIN', 'FSM', 'TUR', 'DZA', 'TGO', 'PSE', 'BHR', 'PRY', 'BTN', 'TJK', 'NPL', 'HND', 'PAN', 'JPN', 'LBY', 'SEN', 'SGP', 'KOR', 'BEN', 'MDV', 'KGZ', 'AND', 'GRC', 'AFG', 'BGD', 'COG', 'CAF', 'GEO', 'TUN', 'RWA', 'MYS', 'URY', 'IND', 'CHN', 'CRI', 'YEM', 'SMR', 'GUY', 'AZE', 'MHL', 'LKA', 'VAT', 'TWN', 'MDA', 'COD', 'MCO', 'ARE', 'AUS', 'BRB', 'THA', 'LBN', 'BOL', 'BRN', 'COL', 'IRN', 'GTM', 'ATG', 'VUT', 'DOM', 'OMN', 'ECU', 'IRQ', 'QAT', 'ALB', 'PHL', 'SLV', 'DJI', 'VEN', 'ZWE', 'RUS', 'PAK', 'KHM', 'AGO', 'TCD', 'DEU', 'CMR', 'BLZ', 'BLR', 'NZL', 'VNM', 'CHE', 'NGA', 'BWA', 'WSM', 'MMR', 'TTO', 'NOR', 'DMA', 'LBR', 'HTI', 'LAO', 'KNA', '

In [76]:
klasterizacijaPoAtributu("new_tests_per_thousand")


K = 4
3 : ['CIV', 'MEX', 'ARG', 'ZMB', 'SLB', 'MUS', 'MKD', 'STP', 'LSO', 'JAM', 'BHS', 'LIE', 'MLI', 'SYR', 'SWZ', 'NER', 'BGR', 'GHA', 'GMB', 'BDI', 'MWI', 'SOM', 'EGY', 'POL', 'GNQ', 'MDG', 'GRD', 'MOZ', 'ROU', 'TZA', 'ZAF', 'BRA', 'SLE', 'CUB', 'BFA', 'GIN', 'LCA', 'MRT', 'PNG', 'MNE', 'NAM', 'ARM', 'BIH', 'NIC', 'UGA', 'UKR', 'ERI', 'FSM', 'DZA', 'NLD', 'TGO', 'PSE', 'PRY', 'TJK', 'NPL', 'HND', 'JPN', 'LBY', 'SEN', 'SGP', 'KOR', 'BEN', 'KGZ', 'AND', 'AFG', 'BGD', 'COG', 'CAF', 'GEO', 'TUN', 'RWA', 'MYS', 'URY', 'IND', 'CHN', 'CRI', 'YEM', 'SMR', 'GUY', 'AZE', 'MHL', 'LKA', 'VAT', 'TWN', 'MDA', 'COD', 'MCO', 'AUS', 'BRB', 'THA', 'LBN', 'BOL', 'BRN', 'COL', 'IRN', 'GTM', 'ATG', 'VUT', 'DOM', 'OMN', 'ECU', 'IRQ', 'ALB', 'PHL', 'ESP', 'SLV', 'DJI', 'VEN', 'ZWE', 'PAK', 'KHM', 'AGO', 'TCD', 'DEU', 'CMR', 'BLZ', 'BLR', 'NZL', 'VNM', 'NGA', 'BWA', 'WSM', 'MMR', 'TTO', 'DMA', 'LBR', 'HTI', 'LAO', 'KNA', 'SUR', 'VCT', 'SDN', 'SSD', 'MAR', 'GAB', 'KEN', 'UZB', 'CPV', 'IDN', 'FJI', 'SYC', '