In [2]:
!pip install scipy


Collecting scipy
  Downloading scipy-1.16.3-cp314-cp314-win_amd64.whl.metadata (60 kB)
Downloading scipy-1.16.3-cp314-cp314-win_amd64.whl (39.4 MB)
   ---------------------------------------- 0.0/39.4 MB ? eta -:--:--
    --------------------------------------- 0.5/39.4 MB 6.6 MB/s eta 0:00:06
   -- ------------------------------------- 2.1/39.4 MB 6.8 MB/s eta 0:00:06
   --- ------------------------------------ 3.1/39.4 MB 5.9 MB/s eta 0:00:07
   ---- ----------------------------------- 4.2/39.4 MB 6.1 MB/s eta 0:00:06
   ---- ----------------------------------- 4.7/39.4 MB 4.8 MB/s eta 0:00:08
   ----- ---------------------------------- 5.8/39.4 MB 5.2 MB/s eta 0:00:07
   ------ --------------------------------- 6.6/39.4 MB 4.7 MB/s eta 0:00:07
   ------ --------------------------------- 6.8/39.4 MB 4.5 MB/s eta 0:00:08
   ------- -------------------------------- 7.6/39.4 MB 4.2 MB/s eta 0:00:08
   -------- ------------------------------- 8.1/39.4 MB 3.9 MB/s eta 0:00:08
   -------- 

In [2]:
# 0) Imports 
import pandas as pd
from scipy.stats import ttest_ind

# 1) Load the datasets (the CSV files are semicolon-delimited)
red_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
white_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# Read both files with identical parser settings: red first, then white.
red, white = (pd.read_csv(source, sep=";") for source in (red_url, white_url))

# 2) Prepare: label every row with its wine type, then stack both frames
# The "wine_type" column is added to `red` and `white` in place.
red["wine_type"], white["wine_type"] = "red", "white"

# ignore_index=True gives the combined frame a fresh consecutive row index.
wine = pd.concat(objs=[red, white], ignore_index=True)

# 3) Sanity check: dimensions, column names and number of rows per wine type
print("Shape:", wine.shape)
print("Columns:", wine.columns.tolist())
print("\nCounts by type:", wine["wine_type"].value_counts(), sep="\n")

# 4) Hypotheses (independent two-sample t-test, Welch variant)
# H0: mean(alcohol_red) == mean(alcohol_white)
# H1: mean(alcohol_red) != mean(alcohol_white)  (two-sided)

alpha = 0.05  # significance level for the decision step

# 5) Extract the alcohol values of each group from the combined frame
red_alcohol = wine["alcohol"].loc[wine["wine_type"].eq("red")]
white_alcohol = wine["alcohol"].loc[wine["wine_type"].eq("white")]

# 6) Descriptive statistics (ddof=1 -> sample standard deviation)
print("\nDescriptive stats (alcohol):")
print(f"Red   n={len(red_alcohol)}  mean={red_alcohol.mean():.4f}  std={red_alcohol.std(ddof=1):.4f}")
print(f"White n={len(white_alcohol)} mean={white_alcohol.mean():.4f}  std={white_alcohol.std(ddof=1):.4f}")

# 7) Run the t-test. equal_var=False selects Welch's variant, which does not
#    assume equal group variances; the two-sided alternative (SciPy's default)
#    is spelled out to match H1.
t_stat, p_value = ttest_ind(
    red_alcohol,
    white_alcohol,
    equal_var=False,
    alternative="two-sided",
)

print(
    "\nWelch's independent two-sample t-test (two-sided)",
    f"t-statistic = {t_stat:.4f}",
    f"p-value     = {p_value:.6f}",
    sep="\n",
)

#  8) Decision + interpretation
# The header prints the threshold actually stored in `alpha`, so the message
# cannot drift from the value used in the comparison below.
print(f"\nDecision (alpha = {alpha}):")
if pd.isna(p_value):
    # Any comparison with NaN is False, so without this guard an undefined
    # p-value would be reported as "Fail to reject H0".
    print("No decision: the p-value is NaN (check the samples for missing values or too few observations).")
elif p_value < alpha:
    print("Reject H0: There IS a statistically significant difference in mean alcohol content.")
else:
    print("Fail to reject H0: No statistically significant difference in mean alcohol content found.")

#  9) Practical interpretation: which wine type has the higher mean?
diff = white_alcohol.mean() - red_alcohol.mean()
print("\nMean difference (white - red):", f"{diff:.4f} alcohol-%", sep="\n")

# Choose the message from the sign of the difference; a value that is neither
# negative nor positive falls through to the "identical" message.
print(
    "Interpretation: Red wine has a higher mean alcohol content (in this dataset)."
    if diff < 0
    else "Interpretation: White wine has a higher mean alcohol content (in this dataset)."
    if diff > 0
    else "Interpretation: Means are identical (very unlikely with real data)."
)


Shape: (6497, 13)
Columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'wine_type']

Counts by type:
wine_type
white    4898
red      1599
Name: count, dtype: int64

Descriptive stats (alcohol):
Red   n=1599  mean=10.4230  std=1.0657
White n=4898 mean=10.5143  std=1.2306

Welch's independent two-sample t-test (two-sided)
t-statistic = -2.8590
p-value     = 0.004278

Decision (alpha = 0.05):
Reject H0: There IS a statistically significant difference in mean alcohol content.

Mean difference (white - red):
0.0913 alcohol-%
Interpretation: White wine has a higher mean alcohol content (in this dataset).
