# Two-Sample Z test for a coronavirus dataset.

In [17]:
import numpy as np
import pandas as pd
import scipy as sc

In [1]:
import os
os.getcwd()

'C:\\Users\\yatin.chadha'

In [4]:
df1 = pd.read_csv("Corona_Updated.csv")
df1.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,Temprature,Humidity,Temp_Cat,Humid_Cat
0,Hubei,Mainland China,2020-03-10T15:13:05,67760,3024,47743,30.9756,112.2707,12.5,86,1,1
1,,Italy,2020-03-10T17:53:02,10149,631,724,43.0,12.0,12.9,64,1,1
2,,Iran (Islamic Republic of),2020-03-10T19:13:20,8042,291,2731,32.0,53.0,11.9,9,0,0
3,,Republic of Korea,2020-03-10T19:13:20,7513,54,247,36.0,128.0,4.9,41,0,0
4,,France,2020-03-10T18:53:02,1784,33,12,47.0,2.0,11.9,93,0,0


In [13]:
df1.Temp_Cat.value_counts()

0    109
1     97
Name: Temp_Cat, dtype: int64

In [5]:
df1.columns

Index(['Province/State', 'Country/Region', 'Last Update', 'Confirmed',
       'Deaths', 'Recovered', 'Latitude', 'Longitude', 'Temprature',
       'Humidity', 'Temp_Cat', 'Humid_Cat'],
      dtype='object')

In [6]:
data = df1[['Province/State','Country/Region', 'Last Update', 'Confirmed',
       'Deaths', 'Recovered', 'Latitude', 'Longitude', 'Temprature',
       'Humidity']]
data.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,Temprature,Humidity
0,Hubei,Mainland China,2020-03-10T15:13:05,67760,3024,47743,30.9756,112.2707,12.5,86
1,,Italy,2020-03-10T17:53:02,10149,631,724,43.0,12.0,12.9,64
2,,Iran (Islamic Republic of),2020-03-10T19:13:20,8042,291,2731,32.0,53.0,11.9,9
3,,Republic of Korea,2020-03-10T19:13:20,7513,54,247,36.0,128.0,4.9,41
4,,France,2020-03-10T18:53:02,1784,33,12,47.0,2.0,11.9,93


In [7]:
data['Temp_Cat'] = data['Temprature'].apply(lambda x : 0 if x<24 else 1)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,Temprature,Humidity,Temp_Cat
0,Hubei,Mainland China,2020-03-10T15:13:05,67760,3024,47743,30.9756,112.2707,12.5,86,0
1,,Italy,2020-03-10T17:53:02,10149,631,724,43.0,12.0,12.9,64,0
2,,Iran (Islamic Republic of),2020-03-10T19:13:20,8042,291,2731,32.0,53.0,11.9,9,0
3,,Republic of Korea,2020-03-10T19:13:20,7513,54,247,36.0,128.0,4.9,41,0
4,,France,2020-03-10T18:53:02,1784,33,12,47.0,2.0,11.9,93,0


In [14]:
data.Temp_Cat.value_counts()

0    175
1     31
Name: Temp_Cat, dtype: int64

In [16]:
corona_t = data[['Confirmed','Temp_Cat']]
corona_t.head()

Unnamed: 0,Confirmed,Temp_Cat
0,67760,0
1,10149,0
2,8042,0
3,7513,0
4,1784,0


##### Since, in the beginning we have made the assumption that Temp > 24 = Hot Climate and Temp < 24 = Cold Climate. Now, 

H0(NULL Hypothesis) => Diff. b/w Hot and Cold Climate = 0  
H1(Alternative Hypothesis) => Diff. b/w Hot and Cold Climate != 0  

#### So, its Two TAIL TEST 

In [49]:
def TwoSampleZ_Test(x1,x2,n1,n2,sigma1,sigma2):
    from numpy import sqrt, abs, round
    from scipy.stats import norm
    
    Dr = sqrt((sigma1**2/n1)+(sigma2**2/n2))
    z = (x1-x2)/Dr
    pvalue = 2*(1 - norm.cdf(np.abs(z))) 
    return pvalue,z

In [51]:
d1 = corona_t[corona_t['Temp_Cat']==1]['Confirmed']
d2 = corona_t[corona_t['Temp_Cat']==0]['Confirmed']

In [52]:
m1,m2 = d1.mean() , d2.mean()
std1, std2 = d1.std(), d2.std()
n1,n2 = d1.shape[0] , d2.shape[0] 

In [53]:
p , z = TwoSampleZ_Test(m1,m2,n1,n2,std1,std2)

p_val = np.round(p,8)
z_score = np.round(z,8)

In [54]:
if (p_val<0.05):
    Hypothesis_Status = 'Reject Null Hypothesis : Significant'
else:
    Hypothesis_Status = 'Do not reject Null Hypothesis : Not Significant'

print (p_val)
print (Hypothesis_Status)

0.10205422
Do not reject Null Hypothesis : Not Significant


In [55]:
z_score

-1.63497531