In [10]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp, ttest_ind, f_oneway, chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt

# Load data
data = pd.read_csv("C:/Users/borun/Desktop/iris.csv")

# Initial checks
print(data.head())
print(data.isnull().sum())
print("Unique species:", data['species'].unique())

# Separate species data
setosa = data[data['species'] == 'Iris-setosa']['sepal_length']
versicolor = data[data['species'] == 'Iris-versicolor']['sepal_length']
virginica = data[data['species'] == 'Iris-virginica']['sepal_length']

# Debugging sample sizes
print(f"Setosa count: {len(setosa)}")
print(f"Versicolor count: {len(versicolor)}")
print(f"Virginica count: {len(virginica)}")

# Shape of the dataset
print("Dataset shape:", data.shape)

# One-sample t-test for Setosa sepal length against population mean of 5
setosa_mean = np.mean(setosa)
print("Setosa mean Sepal Length:", setosa_mean)
if len(setosa) >= 5:
    ttest, p_value = ttest_1samp(setosa, 5)
    print("One-sample t-test p-value:", p_value)
else:
    print("Not enough data for one-sample t-test.")

# Independent two-sample t-test between Setosa and Versicolor
if len(setosa) >= 5 and len(versicolor) >= 5:
    t_stat, p_value = ttest_ind(setosa, versicolor)
    print(f'T-statistic: {t_stat:.2f}')
    print(f'P-value: {p_value:.4f}')
    if p_value < 0.05:
        print("There is a significant difference in sepal lengths between Iris-setosa and Iris-versicolor.")
    else:
        print("There is no significant difference in sepal lengths between Iris-setosa and Iris-versicolor.")
else:
    print("Not enough data for independent t-test.")

# One-way ANOVA for comparing the means of three species
if len(setosa) >= 5 and len(versicolor) >= 5 and len(virginica) >= 5:
    f_stat, p_value = f_oneway(setosa, versicolor, virginica)
    print(f'F-statistic: {f_stat:.2f}')
    print(f'P-value: {p_value:.4f}')
else:
    print("Not enough data for one-way ANOVA.")

# Contingency table for Sepal Width Categories
def categorize_sepal_width(width):
    if width < 2.5:
        return 'Narrow'
    elif 2.5 <= width <= 3.5:
        return 'Medium'
    else:
        return 'Wide'

data['SepalWidthCategory'] = data['sepal_width'].apply(categorize_sepal_width)

# Create the contingency table
contingency_table = pd.crosstab(data['species'], data['SepalWidthCategory'])
print("Contingency Table:\n", contingency_table)

# Chi-square test for independence
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
print("Chi-Square Statistic:", chi2_stat)
print("P-Value:", p_value)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

# Interpretation of the Chi-Square test
if p_value < 0.05:
    print("Reject the null hypothesis: There is a significant relationship between species and sepal width categories.")
else:
    print("Fail to reject the null hypothesis: There is no significant relationship between species and sepal width categories.")


   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64
Unique species: ['setosa' 'versicolor' 'virginica']
Setosa count: 0
Versicolor count: 0
Virginica count: 0
Dataset shape: (150, 5)
Setosa mean Sepal Length: nan
Not enough data for one-sample t-test.
Not enough data for independent t-test.
Not enough data for one-way ANOVA.
Contingency Table:
 SepalWidthCategory  Medium  Narrow  Wide
species                                 
setosa                  34       1    15
versicolor              41       9     0
virginica               46       1     3
Chi-Square St