In [None]:
"""BIU002 Beer data, Reports & Basic Statistics.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1DZL6FaEDvgHdLjMZIhTRD3A8ClCqjtLu

# EDA using reporting libraries
"""

In [None]:
# Analysis and plotting stack used throughout the notebook.
import warnings

# `plt` must be the pyplot interface (plt.figure, plt.subplot, plt.show, ...);
# the top-level `matplotlib` package does not expose those functions.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

# Silence every library warning for the rest of the session so the rendered
# report stays clean. NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings("ignore")

In [None]:
# Load the beer-consumption dataset.
# The CSV location can be overridden with the BEER_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's;
# without the variable the original path is used unchanged.
import os

DEFAULT_DATA_PATH = r"C:\Users\ophir\OneDrive\Desktop\beer_consumption.csv"
df = pd.read_csv(os.environ.get("BEER_DATA_PATH", DEFAULT_DATA_PATH))
df

In [None]:
# Size of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Per-column overview of `df`: column names, non-null counts and dtypes.
df.info()

In [None]:
"""# Data Protocol"""

In [None]:
# Data-protocol exports: one Excel workbook per column-level summary of `df`.
# Each entry is (how to summarise the frame, workbook file, sheet name).
# The summaries are computed lazily, one at a time, right before being written.
protocol_exports = [
    (lambda frame: frame.dtypes, "beer_datatype.xlsx", "data_type"),             # type of values
    (lambda frame: frame.max(), "max_beer.xlsx", "max"),                         # maximum values
    (lambda frame: frame.min(), "min_beer.xlsx", "min"),                         # minimum values
    (lambda frame: frame.isnull().sum(axis=0), "NA_beer.xlsx", "NA"),            # missing values
    (lambda frame: frame.nunique(), "unique_beer.xlsx", "unique"),               # distinct values
]
for summarize, workbook, sheet in protocol_exports:
    summarize(df).to_excel(workbook, sheet_name=sheet)

In [None]:
"""# Descriptive Statistics

Automated reports

### AutoViz
"""

In [None]:
# Commented out IPython magic to ensure Python compatibility.
!pip install autoviz
from autoviz.AutoViz_Class import AutoViz_Class
%matplotlib inline

In [None]:
# Build the automated AutoViz report from the DataFrame that is already loaded
# in `df`. Passing the frame through `dfte` (with an empty `filename`) avoids
# re-reading the CSV from a second hard-coded path, which pointed to a
# different folder than the one `df` was loaded from.
AV = AutoViz_Class()
AV.AutoViz(filename="", dfte=df)

In [None]:
"""# EDA - Exploratory Data Analysis"""

Descriptive Statistics

In [None]:
# Summary statistics of `df` (pandas' default `describe()` output).
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
"""# Target Value

Categorical, 2 classes
"""

In [None]:
# Bar chart of how many rows fall into each value of the `Weekend` column.
sb.countplot(x='Weekend', data=df)

In [None]:
# Mean consumption for rows flagged Weekend == 0 and for rows flagged
# Weekend == 1, selected with explicit boolean masks.
is_regular_day = df['Weekend'] == 0
is_weekend_day = df['Weekend'] == 1
avg_regular_day_consumption = df.loc[is_regular_day, 'Consumtion_liters'].mean()
avg_weekend_day_consumption = df.loc[is_weekend_day, 'Consumtion_liters'].mean()

In [None]:
# Report the two group means computed above, one line per day type.
print(f"Average Beer Consumption on Regular Days: {avg_regular_day_consumption}")
print(f"Average Beer Consumption on Weekend Days: {avg_weekend_day_consumption}")

In [None]:
"""# Categorical Data"""

In [None]:
# Store the calendar fields as pandas categoricals (converted in place on `df`).
for calendar_col in ('month', 'day'):
    df[calendar_col] = df[calendar_col].astype('category')
# The same conversion for 'Season' is currently disabled:
#df['Season'] = df['Season'].astype('category')

In [None]:
# Keep only the columns whose dtype is `category`.
categorical_df = df.select_dtypes(include='category')

In [None]:
def plot_categorical(col):
    """Draw a count plot of one column of ``categorical_df``.

    Parameters
    ----------
    col : str
        Name of the column in ``categorical_df`` whose values are counted.

    Returns
    -------
    The value returned by ``sb.countplot``.
    """
    # Uses the `sb` alias: `sns` is only imported further down the notebook,
    # so referring to it here raised NameError as soon as the helper was called.
    return sb.countplot(x=col, data=categorical_df)


plot_categorical('month')

In [None]:
# Count plot for every categorical column, one panel per column.
# Layout, drawing and plt.show() live in ONE cell: the panel numbering starts
# from the same value on every run (re-running is safe) and the figure is
# complete when the cell finishes.
a = 1                             # number of rows
b = len(categorical_df.columns)   # number of columns: one panel per categorical column
c = 1                             # index of the first panel (left unchanged by the loop)

fig = plt.figure(figsize=(25, 10))
plt.subplots_adjust(hspace=0.5)
sb.set(font_scale=2.5)
for panel, column in enumerate(categorical_df.columns, start=c):
    ax = plt.subplot(a, b, panel)
    # Pass the values as `x=` explicitly; a bare positional argument is
    # interpreted by seaborn as `data`, not as the variable to count.
    sb.countplot(x=categorical_df[column], ax=ax)
    ax.set_title('{}'.format(column))
    ax.set_xlabel(column)

plt.show()

In [None]:
"""# Histograms for continuous variables"""

In [None]:
# Columns used for the histogram and skewness steps.
cols = [
    'Temp_median',
    'Temp_min',
    'Temp_max',
    'Rain',
    'Consumtion_liters',
]
con_df = df.loc[:, cols]

In [None]:
# One 35-bin histogram per column of `con_df`; the trailing `;` keeps the
# returned array of axes out of the cell output.
con_df.hist(figsize=(35, 35), bins=35, xlabelsize=8, ylabelsize=8, color = "navy");

In [None]:
"""# Skewness"""

In [None]:
def highlight(cell_value):
    """Return the CSS used to flag a strongly skewed value.

    Parameters
    ----------
    cell_value : number
        A single skewness value (the function is applied cell by cell).

    Returns
    -------
    str
        A green background rule when ``cell_value > 1``, a pink background
        rule when ``cell_value < -1``, and an empty string (no styling)
        otherwise -- including for NaN, which fails both comparisons.
    """
    positive_skew = 'background-color: mediumspringgreen;'
    negative_skew = 'background-color: hotpink;'
    default = ''
    if cell_value > 1:
        return positive_skew
    if cell_value < -1:
        return negative_skew
    # Always return a string so every cell gets an explicit (possibly empty) style.
    return default
# Skewness of every column of `con_df`, largest first, with cells coloured
# by `highlight`.
skew_table = pd.DataFrame(con_df.skew(), columns=['skewness']).sort_values(by='skewness', ascending=False)
skew_styler = skew_table.style
# `Styler.applymap` was renamed to `Styler.map` in pandas 2.1; prefer the new
# name when it exists and fall back to the old one on earlier versions.
color_cells = getattr(skew_styler, 'map', None) or skew_styler.applymap
color_cells(highlight)

In [None]:
"""Most of the continuous variables are normally distributed

# Correlation
"""

The Pearson correlation measures the linear relationship between two continuous variables

In [None]:
# Pairwise Pearson correlation between the columns of `df`.
# NOTE(review): `numeric_only` is left at its default; on recent pandas this
# may fail if `df` holds non-numeric columns -- confirm against the dataset.
df.corr(method='pearson')

In [None]:
# Heat map of the pairwise Pearson correlations of `df`.
pearson_corr = df.corr(method='pearson')
sb.heatmap(pearson_corr)

In [None]:
# Pairwise Spearman (rank-based) correlation between the columns of `df`.
# NOTE(review): `numeric_only` is left at its default -- confirm that `df`
# has no non-numeric columns on the pandas version in use.
df.corr(method='spearman')

In [None]:
"""# t-test"""

In [None]:
# Display the frame again before running the t-test.
df

In [None]:
!pip install scipy
import scipy.stats as stats
import seaborn as sns

In [None]:
# Split the consumption column into the rows flagged Weekend == 0 and the
# rows flagged Weekend == 1.
consumption = df['Consumtion_liters']
regular_day_consumption = consumption[df['Weekend'] == 0]
weekend_day_consumption = consumption[df['Weekend'] == 1]
# Independent two-sample t-test (SciPy defaults), weekend sample first.
ttest_result = stats.ttest_ind(weekend_day_consumption, regular_day_consumption)
t_stat, p_value = ttest_result

In [None]:
# Print the t-test statistic and its p-value computed in the previous cell.
print(f'T-statistic: {t_stat}')
print(f'P-value: {p_value}')

In [None]:
# Overlay the beer-consumption histograms (with KDE curves) of regular days
# and weekend days on a single wide figure.
plt.figure(figsize=(20, 6))
overlaid_samples = (
    (regular_day_consumption, 'blue', 'Regular Day Consumption'),
    (weekend_day_consumption, 'green', 'Weekend Consumption'),
)
for sample, shade, legend_label in overlaid_samples:
    sns.histplot(sample, kde=True, color=shade, label=legend_label, alpha=0.5)
plt.gca().set(
    title='Distribution of Beer Consumption by Regular Days and Weekends',
    xlabel='Beer Consumption (liters)',
    ylabel='Frequency',
)
plt.legend()
plt.grid(True)
plt.show()

In [None]:
"""# Anova"""

F-statistic = Variance Between the groups/Variance within the groups

In [None]:
# Fix NumPy's global seed so the simulated groups are identical on every run.
np.random.seed(42)
# Three normal samples (100 values each, scale 5) centred on 50, 60 and 70.
# They are drawn in the order group1, group2, group3, so the random stream is
# consumed exactly as three consecutive calls would consume it.
group1, group2, group3 = (
    np.random.normal(loc=center, scale=5, size=100) for center in (50, 60, 70)
)

In [None]:
# Perform a one-way ANOVA across the three groups.
# Note: this rebinds `p_value`, which previously held the t-test p-value.
f_statistic, p_value = stats.f_oneway(group1, group2, group3)

In [None]:
# Print the ANOVA F-statistic and its p-value.
print(f"F-statistic: {f_statistic}")
print(f"p-value: {p_value}")

In [None]:
# Report the ANOVA outcome using a 0.05 threshold on the p-value.
anova_verdict = (
    "There is a significant difference between the group means"
    if p_value < 0.05
    else "No significant difference between the group means"
)
print(anova_verdict)

In [None]:
# Density plot of the three simulated groups with a dashed line at each mean.
# The whole figure is built in ONE cell: with the inline backend a figure is
# rendered when the cell that created it finishes, so splitting the drawing
# calls over several cells scatters them across separate, incomplete figures.
anova_fig, anova_ax = plt.subplots(figsize=(12, 6))
group_styles = (
    (group1, 'Group 1', 'red'),
    (group2, 'Group 2', 'blue'),
    (group3, 'Group 3', 'orange'),
)

# Histogram + KDE of each group, normalised to a density.
for values, group_label, group_color in group_styles:
    sns.histplot(values, kde=True, label=group_label, color=group_color,
                 bins=20, stat="density", alpha=0.5, ax=anova_ax)

# Dashed vertical line at each group's mean, in the group's colour.
for values, _, group_color in group_styles:
    anova_ax.axvline(np.mean(values), color=group_color, linestyle='--', linewidth=1)

# Titles, labels and tick sizes.
anova_ax.set_title('Density Plot of Three Groups with Within-Group Variance', fontsize=14)
anova_ax.set_xlabel('Value', fontsize=12)
anova_ax.set_ylabel('Density', fontsize=12)
anova_ax.legend(fontsize=10)
anova_ax.tick_params(axis='both', labelsize=10)
anova_ax.grid(False)

plt.show()

In [None]:
"""# CHI-Square

### M&M Chocolate Dataset <br>
Each package of Milk Chocolate M&M's should contain the share of each color listed in the `Expected %` column below.
"""

In [None]:
from scipy.stats import chisquare

In [None]:
# M&M colour table, written one row per colour:
# (colour, expected share in %, observed count, observed share in %).
mm_columns = ["Color", "Expected %", "Observed n", "Observed %"]
mm_rows = [
    ("Blue", 24, 481, 18.36),
    ("Brown", 14, 371, 14.16),
    ("Green", 16, 483, 18.44),
    ("Orange", 20, 544, 20.76),
    ("Red", 13, 372, 14.20),
    ("Yellow", 13, 369, 14.08),
]
# Transpose the rows into the column-oriented dictionary, then build the frame.
data = {column: list(values) for column, values in zip(mm_columns, zip(*mm_rows))}
mm = pd.DataFrame(data)
mm

In [None]:
# Total number of observed candies: the sum of the "Observed n" column.
total_observed = mm["Observed n"].sum()

In [None]:
# Expected count per colour: the expected percentage applied to the total
# observed count. Stored as a new "Expected n" column on `mm`.
mm["Expected n"] = mm["Expected %"].mul(total_observed).div(100)

In [None]:
# Chi-square test of the observed counts against the expected counts
# (`f_exp`); returns the test statistic and its p-value.
chi2_stat, p_val = chisquare(mm["Observed n"], f_exp=mm["Expected n"])

In [None]:
# Print the chi-square statistic and its p-value.
print("Chi-square statistic:", chi2_stat)
print("p-value:", p_val)

In [None]:
# Side-by-side bars of observed vs expected counts per colour, annotated with
# the chi-square result.
# The whole figure is built in ONE cell: with the inline backend a figure is
# rendered when the cell that created it finishes, so splitting the drawing
# calls over several cells scatters them across separate, incomplete figures.
plt.figure(figsize=(10, 6))
bar_width = 0.35
index = range(len(mm))

# Observed bars sit on the integer positions; expected bars are shifted right
# by one bar width so each colour shows a pair.
plt.bar(index, mm["Observed n"], bar_width, label='Observed n')
plt.bar([i + bar_width for i in index], mm["Expected n"], bar_width, label='Expected n')

plt.xlabel('Color', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Observed vs Expected Counts', fontsize=14)
# Centre each tick between the two bars of its pair.
plt.xticks([i + bar_width / 2 for i in index], mm["Color"], fontsize=12)
plt.legend(fontsize=12)

# Display the chi-square statistic and p-value in the plot.
plt.text(0.5, max(mm["Observed n"]) * 0.9, f'Chi-square: {chi2_stat:.2f}\n p-value: {p_val:.2e}', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
"""# Next: Data Cleansing"""