In [41]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import scipy.stats as stats
from scipy.optimize import curve_fit


In [2]:
covid = pd.read_csv("/Users/glebsokolov/Downloads/data/covid_data.csv")
cardio_base = pd.read_csv("/Users/glebsokolov/Downloads/data/cardio_base.csv")
cardio_alco = pd.read_csv("/Users/glebsokolov/Downloads/data/cardio_alco.csv", sep=";")
covid.head()
cardio_base.head()
cardio_alco.head()


Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Afghanistan,2019-12-31,0,0,38928341.0,2.581,1803.987,0.5
1,Afghanistan,2020-01-01,0,0,38928341.0,2.581,1803.987,0.5
2,Afghanistan,2020-01-02,0,0,38928341.0,2.581,1803.987,0.5
3,Afghanistan,2020-01-03,0,0,38928341.0,2.581,1803.987,0.5
4,Afghanistan,2020-01-04,0,0,38928341.0,2.581,1803.987,0.5


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,smoke
0,0,18393,2,168,62.0,110,80,1,0
1,1,20228,1,156,85.0,140,90,3,0
2,2,18857,1,165,64.0,130,70,3,0
3,3,17623,2,169,82.0,150,100,1,0
4,4,17474,1,156,56.0,100,60,1,0


Unnamed: 0,id,alco
0,44,0
1,45,0
2,46,0
3,47,0
4,49,0


In [3]:
cardio_base[">50"] = cardio_base["age"].apply(lambda x: True if x / 365 > 50 else False)


In [12]:
cardio_base.groupby(">50").mean()["cholesterol"]


>50
False    1.243186
True     1.421436
Name: cholesterol, dtype: float64

In [14]:
cardio_base.groupby("gender").mean()["height"]


gender
1    161.355612
2    169.947895
Name: height, dtype: float64

In [16]:
cardio_base.groupby("gender").sum()["smoke"]


gender
1     813
2    5356
Name: smoke, dtype: int64

In [19]:
cardio_base["height"].quantile(0.99)


184.0

In [23]:
correl = cardio_base.corr()


In [28]:
correl = correl.replace(1, 0)


In [35]:
correl = correl.drop(">50", axis=1).drop(">50", axis=0)


In [36]:
correl.style.highlight_max(color="lightgreen", axis=0)


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,smoke
id,0.0,0.003457,0.003502,-0.003038,-0.00183,0.003356,-0.002529,0.006106,-0.003699
age,0.003457,0.0,-0.022811,-0.081515,0.053684,0.020764,0.017647,0.154424,-0.047633
gender,0.003502,-0.022811,0.0,0.499033,0.155406,0.006005,0.015254,-0.035821,0.338135
height,-0.003038,-0.081515,0.499033,0.0,0.290968,0.005488,0.00615,-0.050226,0.187989
weight,-0.00183,0.053684,0.155406,0.290968,0.0,0.030702,0.04371,0.141768,0.06778
ap_hi,0.003356,0.020764,0.006005,0.005488,0.030702,0.0,0.016086,0.023778,-0.000922
ap_lo,-0.002529,0.017647,0.015254,0.00615,0.04371,0.016086,0.0,0.024019,0.005186
cholesterol,0.006106,0.154424,-0.035821,-0.050226,0.141768,0.023778,0.024019,0.0,0.010354
smoke,-0.003699,-0.047633,0.338135,0.187989,0.06778,-0.000922,0.005186,0.010354,0.0


In [39]:
heightleft = cardio_base["height"].mean() - 2 * cardio_base["height"].std()
heightright = cardio_base["height"].mean() + 2 * cardio_base["height"].std()


In [40]:
cardio_base["higher"] = cardio_base["height"].apply(
    lambda x: True if heightleft <= x <= heightright else False
)


In [42]:
cardio_base["higher"].value_counts(normalize=True)


True     0.966643
False    0.033357
Name: higher, dtype: float64

In [4]:
cardio_merged = cardio_base.merge(cardio_alco, "left", "id").dropna()


In [5]:
cardio_merged.groupby(">50").sum()["alco"]


>50
False    1084.0
True     1957.0
Name: alco, dtype: float64

In [54]:
cardio_merged.groupby(">50").count()["alco"]


>50
False    17406
True     39497
Name: alco, dtype: int64

In [57]:
(1957 / 39497) * 100


4.954806694179305

In [9]:
no_smoke, smoke = cardio_merged.query("smoke==0"), cardio_merged.query("smoke==1")


In [10]:
import numpy as np, statsmodels.stats.api as sms

X1, X2 = smoke["ap_hi"], no_smoke["ap_hi"]

cm = sms.CompareMeans(sms.DescrStatsW(X1), sms.DescrStatsW(X2))
cm.tconfint_diff(usevar="unequal")


In [12]:
X1, X2 = smoke["cholesterol"], no_smoke["cholesterol"]

cm = sms.CompareMeans(sms.DescrStatsW(X1), sms.DescrStatsW(X2))
cm.tconfint_diff(usevar="unequal")


(0.005145485670791695, 0.04462607557460048)

In [14]:
X1, X2 = smoke["weight"], no_smoke["weight"]

cm = sms.CompareMeans(sms.DescrStatsW(X1), sms.DescrStatsW(X2))
cm.tconfint_diff(usevar="unequal")


(3.0689501010285567, 3.9367553379327007)

In [17]:
X1, X2 = (
    cardio_merged.query("gender==1")["ap_hi"],
    cardio_merged.query("gender==2")["ap_hi"],
)

cm = sms.CompareMeans(sms.DescrStatsW(X1), sms.DescrStatsW(X2))
cm.tconfint_diff(usevar="unequal")


(-4.956625183477737, 0.5820920096339779)

In [18]:
X1, X2 = (
    cardio_merged.query("gender==1")["ap_lo"],
    cardio_merged.query("gender==2")["ap_lo"],
)

cm = sms.CompareMeans(sms.DescrStatsW(X1), sms.DescrStatsW(X2))
cm.tconfint_diff(usevar="unequal")


(-7.5988112806128925, -0.8905736676738054)

In [20]:
cardio_merged.query("gender==1")["ap_lo"].mean()
cardio_merged.query("gender==2")["ap_lo"].mean()


95.26387723619042

99.50856971033377

In [31]:
covid.date = pd.to_datetime(covid.date)


In [34]:
italy, germany = (
    covid[covid["location"] == "Italy"].set_index("date").sort_index(),
    covid[covid["location"] == "Germany"].set_index("date").sort_index(),
)


In [38]:
germany


Unnamed: 0_level_0,location,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-31,Germany,0,0,83783945.0,21.453,45229.245,8.0
2020-01-01,Germany,0,0,83783945.0,21.453,45229.245,8.0
2020-01-02,Germany,0,0,83783945.0,21.453,45229.245,8.0
2020-01-03,Germany,0,0,83783945.0,21.453,45229.245,8.0
2020-01-04,Germany,0,0,83783945.0,21.453,45229.245,8.0
...,...,...,...,...,...,...,...
2020-06-06,Germany,407,33,83783945.0,21.453,45229.245,8.0
2020-06-07,Germany,301,22,83783945.0,21.453,45229.245,8.0
2020-06-08,Germany,214,6,83783945.0,21.453,45229.245,8.0
2020-06-09,Germany,350,37,83783945.0,21.453,45229.245,8.0


In [40]:
d = italy["new_cases"].cumsum() - germany["new_cases"].cumsum()
d[d > 10000]


date
2020-03-12    10895
2020-03-13    12744
2020-03-14    14598
2020-03-15    17362
2020-03-16    19142
              ...  
2020-06-06    50853
2020-06-07    50822
2020-06-08    50805
2020-06-09    50735
2020-06-10    50700
Name: new_cases, Length: 91, dtype: int64

In [55]:
italy_cum, germ_cum = italy["new_cases"].cumsum(), germany["new_cases"].cumsum()

italy_cum = italy_cum[
    (italy_cum.index >= pd.to_datetime("2020-02-28"))
    & (italy_cum.index <= pd.to_datetime("2020-03-20"))
]
germ_cum = germ_cum[
    (germ_cum.index >= pd.to_datetime("2020-02-28"))
    & (germ_cum.index <= pd.to_datetime("2020-03-20"))
]


In [56]:
italy_cum


date
2020-02-28      650
2020-02-29      888
2020-03-01     1128
2020-03-02     1689
2020-03-03     2036
2020-03-04     2502
2020-03-05     3089
2020-03-06     3858
2020-03-07     4636
2020-03-08     5883
2020-03-09     7375
2020-03-10     9172
2020-03-11    10149
2020-03-12    12462
2020-03-13    15113
2020-03-14    17660
2020-03-15    21157
2020-03-16    23980
2020-03-17    27980
2020-03-18    31506
2020-03-19    35713
2020-03-20    41035
Name: new_cases, dtype: int64

In [108]:
def expon(A, B, x):
    return A * np.e ** (B * x)


In [64]:
italy


Unnamed: 0_level_0,location,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-31,Italy,0,0,60461828.0,23.021,35220.084,3.18
2020-01-01,Italy,0,0,60461828.0,23.021,35220.084,3.18
2020-01-02,Italy,0,0,60461828.0,23.021,35220.084,3.18
2020-01-03,Italy,0,0,60461828.0,23.021,35220.084,3.18
2020-01-04,Italy,0,0,60461828.0,23.021,35220.084,3.18
...,...,...,...,...,...,...,...
2020-06-06,Italy,518,85,60461828.0,23.021,35220.084,3.18
2020-06-07,Italy,270,72,60461828.0,23.021,35220.084,3.18
2020-06-08,Italy,197,53,60461828.0,23.021,35220.084,3.18
2020-06-09,Italy,280,65,60461828.0,23.021,35220.084,3.18


In [93]:
italy_cum.index = (
    italy[
        (italy.index >= pd.to_datetime("2020-02-28"))
        & (italy.index <= pd.to_datetime("2020-03-20"))
    ]
    .reset_index()["date"]
    .astype(str)
    .apply(lambda x: int(datetime.strptime(x, "%Y-%m-%d").timestamp()))
)


In [103]:
italy_cum.index = list(range(22))


In [117]:
italy_cum


0       650
1       888
2      1128
3      1689
4      2036
5      2502
6      3089
7      3858
8      4636
9      5883
10     7375
11     9172
12    10149
13    12462
14    15113
15    17660
16    21157
17    23980
18    27980
19    31506
20    35713
21    41035
Name: new_cases, dtype: int64

In [119]:
41035 - expon(1610.0, 0.155, 21)


-695.5793997384317

In [113]:
curve_fit(lambda t, a, b: a * np.exp(b * t), italy_cum.index, italy_cum.tolist())[0]


array([1.61382416e+03, 1.55878443e-01])

In [124]:
covid


Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Afghanistan,2019-12-31,0,0,38928341.0,2.581,1803.987,0.5
1,Afghanistan,2020-01-01,0,0,38928341.0,2.581,1803.987,0.5
2,Afghanistan,2020-01-02,0,0,38928341.0,2.581,1803.987,0.5
3,Afghanistan,2020-01-03,0,0,38928341.0,2.581,1803.987,0.5
4,Afghanistan,2020-01-04,0,0,38928341.0,2.581,1803.987,0.5
...,...,...,...,...,...,...,...,...
23077,International,2020-02-28,0,0,,,,
23078,International,2020-02-29,0,2,,,,
23079,International,2020-03-01,0,0,,,,
23080,International,2020-03-02,0,0,,,,


In [140]:
c = covid.groupby("location").sum()[["new_deaths"]]


In [144]:
c = c.reset_index().sort_values("new_deaths")


In [162]:
covid[covid["location"] == c.iloc[0]["location"]]["population"].unique()[0]


26221.0

In [171]:
vals = {}
for i in range(len(c)):
    location = c.iloc[i]["location"]
    pop = covid[covid["location"] == location]["population"].unique()[0] / 1000000
    vals[location] = c.iloc[i]["new_deaths"] / pop


In [173]:
dict(sorted(vals.items(), key=lambda item: item[1]))


{'Bonaire Sint Eustatius and Saba': 0.0,
 'Timor': 0.0,
 'French Polynesia': 0.0,
 'Namibia': 0.0,
 'Fiji': 0.0,
 'Cambodia': 0.0,
 'Falkland Islands': 0.0,
 'Bhutan': 0.0,
 'Faeroe Islands': 0.0,
 'Eritrea': 0.0,
 'Papua New Guinea': 0.0,
 'Dominica': 0.0,
 'Saint Kitts and Nevis': 0.0,
 'Saint Lucia': 0.0,
 'Seychelles': 0.0,
 'New Caledonia': 0.0,
 'Gibraltar': 0.0,
 'Saint Vincent and the Grenadines': 0.0,
 'Vietnam': 0.0,
 'Anguilla': 0.0,
 'Grenada': 0.0,
 'Lesotho': 0.0,
 'Mongolia': 0.0,
 'Greenland': 0.0,
 'Uganda': 0.0,
 'Vatican': 0.0,
 'Laos': 0.0,
 'Hong Kong': 0.0,
 'Mozambique': 0.06398887105554603,
 'Burundi': 0.08409876525351867,
 'Myanmar': 0.11027426422529739,
 'Angola': 0.1217053302188128,
 'Rwanda': 0.15441381466281157,
 'Malawi': 0.20909615312738583,
 'Zimbabwe': 0.26912599382342384,
 'Ethiopia': 0.2783490142265312,
 'Taiwan': 0.2939104895604044,
 'Madagascar': 0.325015124939967,
 'Benin': 0.3299459433063784,
 'Syria': 0.34284427150363556,
 'Tanzania': 0.351557322

In [174]:
eld = covid[covid["aged_65_older_percent"] >= 20]
eld


Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
3293,Bulgaria,2020-03-08,2,0,6948445.0,20.801,18563.307,7.454
3294,Bulgaria,2020-03-09,2,0,6948445.0,20.801,18563.307,7.454
3295,Bulgaria,2020-03-12,3,1,6948445.0,20.801,18563.307,7.454
3296,Bulgaria,2020-03-13,16,0,6948445.0,20.801,18563.307,7.454
3297,Bulgaria,2020-03-14,8,0,6948445.0,20.801,18563.307,7.454
...,...,...,...,...,...,...,...,...
16933,Portugal,2020-06-06,377,10,10196707.0,21.502,27936.896,3.390
16934,Portugal,2020-06-07,382,9,10196707.0,21.502,27936.896,3.390
16935,Portugal,2020-06-08,142,5,10196707.0,21.502,27936.896,3.390
16936,Portugal,2020-06-09,392,6,10196707.0,21.502,27936.896,3.390


In [175]:
c = eld.groupby("location").sum()[["new_deaths"]]
c = c.reset_index().sort_values("new_deaths")
vals_eld = {}
for i in range(len(c)):
    location = c.iloc[i]["location"]
    pop = covid[covid["location"] == location]["population"].unique()[0] / 1000000
    vals_eld[location] = c.iloc[i]["new_deaths"] / pop


In [177]:
eld.location.unique()


array(['Bulgaria', 'Finland', 'Germany', 'Greece', 'Italy', 'Japan',
       'Portugal'], dtype=object)

In [176]:
vals_eld


{'Bulgaria': 24.034154404330753,
 'Greece': 17.55723081599101,
 'Finland': 58.47617583136337,
 'Japan': 7.266174389545286,
 'Portugal': 146.32174877634515,
 'Germany': 104.18463823826868,
 'Italy': 563.0494665162952}

$$p(gdp|nbeds) = \frac{p(nbeds|gdp)*p(gdp)}{p(nbeds)}$$

In [182]:
p_gdp_over10000 = len(
    covid[covid["gdp_per_capita"] >= 10000]["location"].unique()
) / len(covid["location"].unique())
p_nbeds_more5 = len(
    covid[covid["hospital_beds_per_thousand"] > 5]["location"].unique()
) / len(covid["location"].unique())


In [198]:
p_nbeds_more5 = len(covid[covid["hospital_beds_per_thousand"] > 5]["location"])/len(covid['location'])

In [201]:
np.sqrt(1000000)

1000.0

In [199]:
p_intersect = (len(
    covid.query("gdp_per_capita>10000 and hospital_beds_per_thousand>=5")[
        "location"
    ]
) / len(covid["location"]))/p_nbeds_more5


In [1]:
import tensorflow as tf

In [4]:
from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(100, activation="relu", input_shape=(10,)))
model.add(layers.Dense(100, activation="relu"))
model.add(layers.Dense(10, activation="softmax"))
model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 100)               1100      
                                                                 
 dense_6 (Dense)             (None, 100)               10100     
                                                                 
 dense_7 (Dense)             (None, 10)                1010      
                                                                 
Total params: 12,210
Trainable params: 12,210
Non-trainable params: 0
_________________________________________________________________


In [5]:
p_qual = 0.01
p_passed_qualif = 0.9
p_passed = 0.01*0.9+0.99*0.1
p_qual_passed = p_passed_qualif*p_qual/p_passed

In [196]:
covid.query("gdp_per_capita>10000 and hospital_beds_per_thousand>=5")[
        "location"
    ].unique()

array(['Argentina', 'Austria', 'Barbados', 'Belarus', 'Belgium',
       'Bulgaria', 'Croatia', 'Czech Republic', 'France', 'Gabon',
       'Germany', 'Hungary', 'Japan', 'Kazakhstan', 'Latvia', 'Lithuania',
       'Mongolia', 'Poland', 'Romania', 'Russia', 'Serbia', 'Slovakia',
       'South Korea'], dtype=object)

In [193]:
p = (p_intersect*p_gdp_over10000)/p_nbeds_more5

In [194]:
p

3.0803571428571432

In [6]:
ops = input().split()

In [7]:
def calculate_score(ops):
    numbers = []
    for el in ops:
        try:
            numbers.append(int(el))
        except:
            if el == 'C':
                numbers.pop(-1)
            elif el == 'D':
                numbers.append(2*numbers[-1])
            elif el == '+':
                numbers.append(numbers[-1]+numbers[-2]) 
    return sum(numbers)

In [8]:
calculate_score(ops)

30