In [2]:
import pandas as pd #import the pandas library for data manipulation

df = pd.read_csv('wetter.csv') #pd.read_csv() reads the CSV file 'wetter.csv' and stores it in a DataFrame called df
print(df.head())

#Calculate the overall average temperature.
avg_temp = df['Temperatur'].mean() #.mean() calculates the average of the 'Temperatur' column
print('Overall Average Temperature:', round(avg_temp, 2), '°C')

#Calculate the average temperature for the month of July.
df["Datum"] = pd.to_datetime(df["Datum"])
july_data = df[df['Datum'].dt.month == 7] #dt.month extracts the month from the datetime, == 7 filters for July
july_avg_temp = july_data['Temperatur'].dropna().mean() #dropna() removes any NaN values before calculating the mean, .mean() calculates the average
print(f'Average Temperature in July:', round(july_avg_temp, 2), '°C')
print(f'\nNumber of July-dates: {len(july_data)}')
print(f'\nYearly Average:')
print(july_data.groupby(july_data['Datum'].dt.year)['Temperatur'].mean().round(2))

#Compare whether the months of July and May differ significantly in their average temperature.
from scipy.stats import ttest_ind #import the t-test function from scipy.stats

df = pd.read_csv('wetter.csv') #read the CSV file again to ensure a fresh DataFrame
df["Datum"] = pd.to_datetime(df["Datum"])

July = df[df['Datum'].dt.month == 7]['Temperatur'].dropna() #filter for July and drop NaN values
May = df[df['Datum'].dt.month == 5]['Temperatur'].dropna() #filter for May and drop NaN values

print("Average July Temperature:", round(July.mean(), 2), "°C")
print("Average May Temperature:", round(May.mean(), 2), "°C")

t_stat, p_value = ttest_ind(July, May, equal_var=False) #perform the t-test assuming unequal variances, t-test measures if the means of two groups are statistically different (strength of the difference), p_value shows the significance level (if the difference is only coincidence)

print("T-statistic:", round(t_stat, 4))
print("P-value:", round(p_value, 4))

if p_value < 0.05: #if the p-value is less than 0.05, we reject the null hypothesis
    print("The difference in average temperatures between July and May is statistically significant.")
else:
    print("The difference in average temperatures between July and May is not statistically significant.")

print(f"P-value (exakt): {p_value:.20f}")



        Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode
0  2012-01-01         8.0      9.8250                   14        58.0
1  2012-01-02         7.0      7.4375                   12         NaN
2  2012-01-03         8.0      5.5375                   18        63.0
3  2012-01-04         4.0      5.6875                   19        80.0
4  2012-01-05         6.0      5.3000                   23        80.0
Overall Average Temperature: 12.1 °C
Average Temperature in July: 20.75 °C

Number of July-dates: 247

Yearly Average:
Datum
2012    18.63
2013    21.90
2014    22.97
2015    19.65
2016    20.16
2017    19.11
2018    23.67
2019    19.85
Name: Temperatur, dtype: float64
Average July Temperature: 20.75 °C
Average May Temperature: 15.24 °C
T-statistic: 15.94
P-value: 0.0
The difference in average temperatures between July and May is statistically significant.
P-value (exakt): 0.00000000000000000000
