In [1]:
!pip install scipy



In [2]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
import scipy.stats


#Load csv
file_path = ('/content/houston-aqi-2010-2021.csv')
df = pd.read_csv (file_path)

#Printing the csv file will help us to check the column names
print (df.head())

   Unnamed: 0  day_of_year  year   latitude  longitude  avg_pm10  aqi_pm10  \
0           0            2  2010  29.733726 -95.257593        13        12   
1           1            2  2010  29.733726 -95.257593        13        12   
2           2            2  2010  29.733726 -95.257593        13        12   
3           3            2  2010  29.733726 -95.257593        13        12   
4           4            2  2010  29.733726 -95.257593        13        12   

     avg_co  aqi_co    avg_no2  ...    avg_o3  aqi_o3  avg_pm25  aqi_pm25  \
0  0.297667     NaN  17.258333  ...  0.027294      32      11.6        48   
1  0.297667     NaN  17.258333  ...  0.027294      32      11.6        48   
2  0.297667     NaN  17.258333  ...  0.027294      32       9.7        40   
3  0.297667     NaN  17.258333  ...  0.027294      32       9.7        40   
4  0.325000     6.0  17.258333  ...  0.027294      32      11.6        48   

    avg_so2  aqi_so2  avg_humidity  avg_temperature  avg_wind  avg_p

In [3]:
print(f"Mean Pressure: {df['avg_pressure'].mean()}")
print(f"Mean Wind: {df['avg_wind'].mean()}")
print(f"Mean Temperature: {df['avg_temperature'].mean()}")
print(f"Mean Humidity:  {df['avg_humidity'].mean()}")

Mean Pressure: 1017.8874957186971
Mean Wind: 5.238881443739425
Mean Temperature: 68.73953821573603
Mean Humidity:  65.6445454107445


#Temperature
##We select 100 random samples from our variable Temperature on which we will be performing our tests

In [4]:
n = 100
random_indices = np.random.choice(df.index, n, replace=False)
temperature_samples = df.loc[random_indices, 'avg_temperature']
temperature_samples

830     68.541667
1110    85.625000
1374    91.083333
3805    60.625000
4533    57.458333
          ...    
2670    61.083333
4047    68.458333
665     51.333333
1241    86.541667
576     75.833333
Name: avg_temperature, Length: 100, dtype: float64

#Proportion Test
##We start our proportion test by selecting p = 0.5 for our given temperature variable and see if the conditions are satisfied.

#Hypothesis Test
##We are going to test the claim that greater than 50% of the temperature samples are greater than 57 units with a 99% confidence level by conducting a right-tailed hypothesis test, where the null hypothesis is the statement of equality about the proportion of pressure samples that are greater than 57 units is equal to 50%. We first calculate the test statistic for the z-index using numpy, and find the value of the survival function using scipy.stats.norm.

In [1]:
p = 0.5
q = 1 - p

In [6]:
alpha = 0.01
t_over_57 = np.where(temperature_samples > 57, 1, 0).sum() / 100.
z = (t_over_57 - p) / np.sqrt(p * q / n)
print(f'The z-index is: {z}')
print(f'The p-value is: {scipy.stats.norm.sf(z)}')
print(f'The critical value is: {scipy.stats.norm.ppf(1 - alpha)}')
print(f'Reject null hypothesis? {scipy.stats.norm.sf(z) <= alpha}')

The z-index is: 5.2
The p-value is: 9.96442631693347e-08
The critical value is: 2.3263478740408408
Reject null hypothesis? True


#Confidence Interval
##We calculate the confidence interval for obtaining an estimate over the population proportion we have calculated using the chosen significance level of 0.01.

In [7]:
margin_of_error = scipy.stats.norm.ppf(1 - alpha / 2) * np.sqrt(t_over_57 * (1 - t_over_57) / n)
print(f'The proportion of temperature samples greater than 57 unit is: {t_over_57}')
print(f'The confidence interval is: ({round(t_over_57 - margin_of_error, 2)}, {round(t_over_57 + margin_of_error, 2)})')

The proportion of temperature samples greater than 57 unit is: 0.76
The confidence interval is: (0.65, 0.87)


#Mean Test
##We perform the following mean hypothesis test, where we formulate a hypothesis about the mean of the temperature samples being 57 and test our hypothesis using a standard numerical t-test. We are going to assume that the standard deviation of the population is also not know, as is commonly the case when conducting a test about the population mean interval.

#Hypothesis Test
##We conduct a two-tailed hypothesis test for the sample means being equivalent to 57 units with 95% confidence level.

In [8]:
alpha = 0.05
degrees_of_freedom = len(temperature_samples) - 1
t_statistic, p_value = scipy.stats.ttest_1samp(temperature_samples, 6)
critical_value = scipy.stats.t.ppf(1 - alpha / 2, degrees_of_freedom)
print(f'The mean of the temperature samples is: {round(temperature_samples.mean(), 2)}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')
print(f'Reject null hypothesis? {p_value <= alpha}')

The mean of the temperature samples is: 68.55
The t-statistic is: 46.77091911196774
The p-value is: 2.613562401173639e-69
The critical value is: 1.9842169515086827
Reject null hypothesis? True


##Since the p-value is greater than the significance level of 0.05, we fail to reject the null hypothesis to conclude that the mean of the temperature samples is above 57 units.

#Confidence Interval
##We construct a confidence interval for a 95% confidence level for the estimation of the population mean.

In [9]:
margin_of_error = critical_value * temperature_samples.std() / np.sqrt(n)
print(f'Confidence interval: ({round(temperature_samples.mean() - margin_of_error, 2)}, {round(temperature_samples.mean() + margin_of_error, 2)})')

Confidence interval: (65.89, 71.2)
