In [7]:
!pip install scipy



In [8]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
import scipy.stats


#Load csv
file_path = ('/content/houston-aqi-2010-2021.csv')
df = pd.read_csv (file_path)

#Printing the csv file will help us to check the column names
print (df.head())

   Unnamed: 0  day_of_year  year   latitude  longitude  avg_pm10  aqi_pm10  \
0           0            2  2010  29.733726 -95.257593        13        12   
1           1            2  2010  29.733726 -95.257593        13        12   
2           2            2  2010  29.733726 -95.257593        13        12   
3           3            2  2010  29.733726 -95.257593        13        12   
4           4            2  2010  29.733726 -95.257593        13        12   

     avg_co  aqi_co    avg_no2  ...    avg_o3  aqi_o3  avg_pm25  aqi_pm25  \
0  0.297667     NaN  17.258333  ...  0.027294      32      11.6        48   
1  0.297667     NaN  17.258333  ...  0.027294      32      11.6        48   
2  0.297667     NaN  17.258333  ...  0.027294      32       9.7        40   
3  0.297667     NaN  17.258333  ...  0.027294      32       9.7        40   
4  0.325000     6.0  17.258333  ...  0.027294      32      11.6        48   

    avg_so2  aqi_so2  avg_humidity  avg_temperature  avg_wind  avg_p

In [3]:
print(f"Mean Pressure: {df['avg_pressure'].mean()}")
print(f"Mean Wind: {df['avg_wind'].mean()}")
print(f"Mean Temperature: {df['avg_temperature'].mean()}")
print(f"Mean Humidity:  {df['avg_humidity'].mean()}")

Mean Pressure: 1017.8874957186971
Mean Wind: 5.238881443739425
Mean Temperature: 68.73953821573603
Mean Humidity:  65.6445454107445


In [5]:
n = 100
random_indices = np.random.choice(df.index, n, replace=False)
wind_samples = df.loc[random_indices, 'avg_wind']
wind_samples

2937     4.891667
4495     3.620833
606      4.704167
4348     1.854167
3294     6.295833
          ...    
333      8.604167
2966     5.900000
4406     3.658333
679      5.241667
4161    12.516667
Name: avg_wind, Length: 100, dtype: float64

In [6]:
p = 0.5
q = 1 - p
print(f'np >= 5 ? {len(wind_samples) * p >= 5}')
print(f'nq >= 5 ? {len(wind_samples) * q >= 5}')

np >= 5 ? True
nq >= 5 ? True


In [9]:
alpha = 0.01
w_over_6 = np.where(wind_samples > 6, 1, 0).sum() / 100.
z = (w_over_6 - p) / np.sqrt(p * q / n)
print(f'The z-index is: {z}')
print(f'The p-value is: {scipy.stats.norm.sf(z)}')
print(f'The critical value is: {scipy.stats.norm.ppf(1 - alpha)}')
print(f'Reject null hypothesis? {scipy.stats.norm.sf(z) <= alpha}')

The z-index is: -4.0
The p-value is: 0.9999683287581669
The critical value is: 2.3263478740408408
Reject null hypothesis? False


In [12]:
margin_of_error = scipy.stats.norm.ppf(1 - alpha / 2) * np.sqrt(w_over_6 * (1 - w_over_6) / n)
print(f'The proportion of wind samples greater than 6 unit is: {w_over_6}')
print(f'The confidence interval is: ({round(w_over_6 - margin_of_error, 2)}, {round(w_over_6 + margin_of_error, 2)})')

The proportion of wind samples greater than 6 unit is: 0.3
The confidence interval is: (0.18, 0.42)


In [13]:
alpha = 0.05
degrees_of_freedom = len(wind_samples) - 1
t_statistic, p_value = scipy.stats.ttest_1samp(wind_samples, 6)
critical_value = scipy.stats.t.ppf(1 - alpha / 2, degrees_of_freedom)
print(f'The mean of the wind samples is: {round(wind_samples.mean(), 2)}')
print(f'The t-statistic is: {t_statistic}')
print(f'The p-value is: {p_value}')
print(f'The critical value is: {critical_value}')
print(f'Reject null hypothesis? {p_value <= alpha}')

The mean of the wind samples is: 5.36
The t-statistic is: -2.904456914470339
The p-value is: 0.0045374559792208365
The critical value is: 1.9842169515086827
Reject null hypothesis? True


In [14]:
margin_of_error = critical_value * wind_samples.std() / np.sqrt(n)
print(f'Confidence interval: ({round(wind_samples.mean() - margin_of_error, 2)}, {round(wind_samples.mean() + margin_of_error, 2)})')

Confidence interval: (4.93, 5.8)
