In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [73]:
cardio_alcohol = pd.read_csv('./cardio_alco.csv',sep=';')
cardio_base = pd.read_csv('./cardio_base.csv')
covid_data = pd.read_csv('./covid_data.csv')

In [41]:
cardio_alcohol.head()
cardio_alcohol.loc[cardio_alcohol['alco']==1]

Unnamed: 0,id,alco
30,81,1
67,134,1
69,136,1
75,142,1
102,180,1
...,...,...
56845,99858,1
56851,99864,1
56861,99882,1
56886,99933,1


In [10]:
cardio_base.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,smoke
0,0,18393,2,168,62.0,110,80,1,0
1,1,20228,1,156,85.0,140,90,3,0
2,2,18857,1,165,64.0,130,70,3,0
3,3,17623,2,169,82.0,150,100,1,0
4,4,17474,1,156,56.0,100,60,1,0


In [14]:
print(cardio_base['height'].quantile(0.1, interpolation='midpoint'))

155.0


In [16]:
cardio_base.corr(method='spearman')

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,smoke
id,1.0,0.002682,0.003506,-0.001886,-0.001292,0.003258,-0.000238,0.005706,-0.003697
age,0.002682,1.0,-0.020176,-0.082292,0.061559,0.219087,0.1567,0.140113,-0.047471
gender,0.003506,-0.020176,1.0,0.533805,0.17248,0.063174,0.066374,-0.03573,0.338135
height,-0.001886,-0.082292,0.533805,1.0,0.313569,0.019381,0.030534,-0.05894,0.197632
weight,-0.001292,0.061559,0.17248,0.313569,1.0,0.277749,0.25442,0.137908,0.071676
ap_hi,0.003258,0.219087,0.063174,0.019381,0.277749,1.0,0.735436,0.208722,0.028929
ap_lo,-0.000238,0.1567,0.066374,0.030534,0.25442,0.735436,1.0,0.167401,0.026172
cholesterol,0.005706,0.140113,-0.03573,-0.05894,0.137908,0.208722,0.167401,1.0,0.01522
smoke,-0.003697,-0.047471,0.338135,0.197632,0.071676,0.028929,0.026172,0.01522,1.0


In [19]:
mean_height = cardio_base['height'].mean()
std_dev_height = cardio_base['height'].std()
upper_bound = mean_height + 2 * std_dev_height
lower_bound = mean_height - 2 * std_dev_height

In [20]:
outliers = cardio_base[(cardio_base['height'] > upper_bound) | (cardio_base['height'] < lower_bound)]


In [22]:
percentage_outliers = (len(outliers) / len(cardio_base)) * 100


In [23]:
print(percentage_outliers)

3.3357142857142854


In [57]:
merged_df = pd.merge(cardio_base, cardio_alcohol,on='id',how='left')
merged_df.head()


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,smoke,alco
0,0,18393,2,168,62.0,110,80,1,0,
1,1,20228,1,156,85.0,140,90,3,0,
2,2,18857,1,165,64.0,130,70,3,0,
3,3,17623,2,169,82.0,150,100,1,0,
4,4,17474,1,156,56.0,100,60,1,0,


In [69]:
total_50_age = len(merged_df.loc[(merged_df['age'] > 50) & ~(merged_df['alco'].isna())])
total_50_alco = len(merged_df.loc[(merged_df['age'] > 50) & (merged_df['alco'] == 1)])


In [70]:
print(total_50_age/total_50_alco)

18.711936862874055


In [58]:
merged_df.loc[merged_df['alco'] == 1]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,smoke,alco
62,81,20330,2,187,115.0,130,90,1,0,1.0
99,134,17363,1,167,71.0,120,80,2,0,1.0
101,136,18718,1,167,80.0,190,90,2,0,1.0
107,142,14499,2,164,48.0,110,70,1,1,1.0
134,180,21473,2,174,90.0,140,80,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...
69909,99858,19475,2,171,65.0,120,80,1,1,1.0
69915,99864,21883,2,178,102.0,150,100,1,0,1.0
69925,99882,23281,1,164,98.0,140,90,2,0,1.0
69950,99933,19137,2,167,74.0,130,80,1,1,1.0


In [72]:
from scipy import stats
smoker_weight = cardio_base.loc[cardio_base['smoke'] == 1]['weight']
non_smoker_weight = cardio_base.loc[cardio_base['smoke'] == 0]['weight']
t_statistic, p_value = stats.ttest_ind(smoker_weight, non_smoker_weight)
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. Smokers weigh less than non-smokers with 95% confidence.")
else:
    print("Fail to reject the null hypothesis. There is not enough evidence to suggest that smokers weigh less than non-smokers.")



Reject the null hypothesis. Smokers weigh less than non-smokers with 95% confidence.


In [75]:
covid_data.head(20)

Unnamed: 0,location,date,new_cases,new_deaths,population,aged_65_older_percent,gdp_per_capita,hospital_beds_per_thousand
0,Afghanistan,2019-12-31,0,0,38928341.0,2.581,1803.987,0.5
1,Afghanistan,2020-01-01,0,0,38928341.0,2.581,1803.987,0.5
2,Afghanistan,2020-01-02,0,0,38928341.0,2.581,1803.987,0.5
3,Afghanistan,2020-01-03,0,0,38928341.0,2.581,1803.987,0.5
4,Afghanistan,2020-01-04,0,0,38928341.0,2.581,1803.987,0.5
5,Afghanistan,2020-01-05,0,0,38928341.0,2.581,1803.987,0.5
6,Afghanistan,2020-01-06,0,0,38928341.0,2.581,1803.987,0.5
7,Afghanistan,2020-01-07,0,0,38928341.0,2.581,1803.987,0.5
8,Afghanistan,2020-01-08,0,0,38928341.0,2.581,1803.987,0.5
9,Afghanistan,2020-01-09,0,0,38928341.0,2.581,1803.987,0.5


In [76]:
import pandas as pd

# Assuming you have imported or defined covid_data DataFrame
# Make sure covid_data has columns like 'location', 'date', and 'new_cases'

# Filter data for Italy and Germany
italy_data = covid_data[covid_data['location'] == 'Italy']
germany_data = covid_data[covid_data['location'] == 'Germany']

# Iterate through dates
for date in sorted(set(italy_data['date']).intersection(set(germany_data['date']))):
    # Calculate total cases for Italy and Germany up to the current date
    italy_cases = italy_data[italy_data['date'] <= date]['new_cases'].sum()
    germany_cases = germany_data[germany_data['date'] <= date]['new_cases'].sum()
    
    # Calculate the difference
    diff = abs(italy_cases - germany_cases)
    
    # Check if difference exceeds 10000
    if diff > 10000:
        print(f"The difference in total cases between Italy and Germany exceeded 10000 on {date}.")
        break


The difference in total cases between Italy and Germany exceeded 10000 on 2020-03-12.


In [78]:
from scipy.optimize import curve_fit
covid_data['total_cases'] = covid_data['new_cases'].cumsum()
filtered_data = covid_data[(covid_data['date'] >= '2020-02-28') & (covid_data['date'] <= '2020-03-20')]
filtered_data['date'] = pd.to_datetime(filtered_data['date'])
filtered_data['days_since_start'] = (filtered_data['date'] - pd.Timestamp('2020-02-28')).dt.days
def exponential_func(x, a, b):
    return a * np.exp(b * x)
popt, pcov = curve_fit(exponential_func, filtered_data['days_since_start'], filtered_data['total_cases'])
exponential_curve = exponential_func(filtered_data['days_since_start'], *popt)
march_20_index = filtered_data.index[filtered_data['date'] == '2020-03-20'][0]
difference_march_20 = abs(filtered_data.at[march_20_index, 'total_cases'] - exponential_curve[march_20_index])
print("Difference between exponential curve and real cumulative cases on March 20, 2020:", difference_march_20)


Difference between exponential curve and real cumulative cases on March 20, 2020: 2629704.2245286778


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['date'] = pd.to_datetime(filtered_data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['days_since_start'] = (filtered_data['date'] - pd.Timestamp('2020-02-28')).dt.days


In [88]:
county_total_deaths = covid_data.groupby('location')['new_deaths'].sum().reset_index()
print(county_total_deaths.head())

      location  new_deaths
0  Afghanistan         384
1      Albania          34
2      Algeria         724
3      Andorra          51
4       Angola           4


In [90]:
county_total_deaths['population'] = covid_data.groupby('location')['population'].first().reset_index()

ValueError: Cannot set a DataFrame with multiple columns to the single column population

In [102]:
# Assuming 'county_total_deaths' is your DataFrame containing total deaths and population for each county

# Select rows where the location is 'United States'
us_data = county_total_deaths[county_total_deaths['location'] == 'Italy']

# Print the selected data
print(us_data)

   location  new_deaths
98    Italy       34043


In [13]:
def calPoints(ops):
    prev = []
    ops = ops[0]
    for i in range(len(ops)):
        if ops[i].isdigit():
            prev.append(int(ops[i]))
        if ops[i]=='C':
            prev.pop(-1)
        if ops[i]=='D':
            prev.append(int(2*prev[-1]))
        if ops[i]=='+':
            prev.append(int(prev[-1]+prev[-2]))
    result = sum(prev)
    print(result)
        

if __name__ == '__main__':
    line = '5 2 C D +'
    ops = line.strip().split()
    print(ops)
    calPoints(ops)

['5', '2', 'C', 'D', '+']
5


In [None]:
def calPoints(ops):
    prev = []
    ops = ops[0]
    for i in range(len(ops)):
        if ops[i].isdigit():
            prev.append(int(ops[i]))
        if ops[i]=='C':
            prev.pop(-1)
        if ops[i]=='D':
            prev.append(int(2*int(prev[-1])))
        if ops[i]=='+':
            prev.append(int(int(prev[-1])+int(prev[-2])))
    result = sum(prev)