# Drinks Dataset - Interactive Exploratory Data Analysis

This notebook provides an interactive exploration of the drinks dataset, which contains alcohol consumption data for 193 countries.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: stock matplotlib style, seaborn "husl"
# colour cycle, and a 10x6 inch default figure size.
plt.style.use('default')
sns.set_palette(palette="husl")
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Data Loading and Basic Inspection

In [None]:
# Load the dataset.
#
# pandas' default missing-value markers include the literal string "NA".
# If this file encodes continents as two-letter codes, "NA" (North America)
# would be parsed as NaN and those rows would silently disappear from every
# groupby('continent') below. To guard against that, the missing-value markers
# are listed explicitly and "NA" is deliberately left out; empty cells and the
# other common markers are still read as NaN.
# NOTE(review): harmless if continents are spelled out in full - confirm
# against the actual file.
na_markers = ['', '#N/A', 'N/A', 'n/a', 'NULL', 'null', 'NaN', 'nan', '<NA>']
df = pd.read_csv('drinks.csv', keep_default_na=False, na_values=na_markers)

n_rows, n_features = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of countries: {n_rows}")
print(f"Number of features: {n_features}")

# Column dtypes and non-null counts
df.info()

In [None]:
# Preview the first ten rows of the raw table
df.head(n=10)

In [None]:
# Data-quality checks: nulls per column, duplicated rows, continent coverage
print("Missing values per column:")
print(df.isna().sum())

n_duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {n_duplicates}")

n_continents = df['continent'].nunique()
print(f"\nUnique continents: {n_continents}")

print("\nCountries per continent:")
print(df['continent'].value_counts())

## 2. Descriptive Statistics

In [None]:
# Numeric measures reused by the cells below, followed by their summary table
numerical_cols = [
    'beer_servings',
    'spirit_servings',
    'wine_servings',
    'total_litres_of_pure_alcohol',
]
df.loc[:, numerical_cols].describe()

In [None]:
# One histogram per numeric column, laid out on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
panels = axes.ravel()

for idx, col in enumerate(numerical_cols):
    ax = panels[idx]
    label = col.replace('_', ' ').title()  # e.g. "Beer Servings"
    ax.hist(df[col], bins=20, alpha=0.7, edgecolor='black')
    ax.set_title(f'Distribution of {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

## 3. Continental Analysis

In [None]:
# Mean of every numeric column per continent, rounded to two decimals
by_continent = df.groupby('continent')
continent_avg = by_continent[numerical_cols].mean().round(2)
continent_avg

In [None]:
# Bar chart of the continent means, one panel per numeric column
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
panels = axes.ravel()

for idx, col in enumerate(numerical_cols):
    ax = panels[idx]
    label = col.replace("_", " ").title()
    # The three "*_servings" columns are counts; the remaining column is litres.
    unit = 'Servings' if 'servings' in col else 'Litres'
    continent_avg[col].plot.bar(ax=ax)
    ax.set_title(f'Average {label} by Continent')
    ax.set_ylabel(unit)
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Per-continent spread of each numeric column, shown as box plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
panels = axes.ravel()

for idx, col in enumerate(numerical_cols):
    ax = panels[idx]
    label = col.replace("_", " ").title()
    sns.boxplot(x='continent', y=col, data=df, ax=ax)
    ax.set_title(f'{label} by Continent')
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Pairwise Pearson correlations between the numeric columns
numeric_frame = df[numerical_cols]
correlation_matrix = numeric_frame.corr(method='pearson')
print("Correlation Matrix:")
print(correlation_matrix.round(3))

In [None]:
# Annotated heatmap of the correlation matrix, colour scale centred on zero
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    ax=heat_ax,
)
heat_ax.set_title('Correlation Matrix - Alcohol Consumption Types')
heat_fig.tight_layout()
plt.show()

In [None]:
# Each drink type plotted against total pure-alcohol consumption, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

total_alcohol = df['total_litres_of_pure_alcohol']

# (column, x-axis label, name used in the panel title), in left-to-right order
scatter_specs = [
    ('beer_servings', 'Beer Servings', 'Beer'),
    ('wine_servings', 'Wine Servings', 'Wine'),
    ('spirit_servings', 'Spirit Servings', 'Spirits'),
]

for ax, (column, x_label, drink) in zip(axes, scatter_specs):
    ax.scatter(df[column], total_alcohol, alpha=0.6)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Total Litres of Pure Alcohol')
    ax.set_title(f'{drink} vs Total Alcohol Consumption')

fig.tight_layout()
plt.show()

## 5. Top Consumers Analysis

In [None]:
# Print a ranked top-10 list of countries for every numeric column
for col in numerical_cols:
    heading = col.upper().replace('_', ' ')
    print(f"\n=== TOP 10 {heading} CONSUMERS ===")
    top_10 = df.nlargest(10, col)[['country', col]]
    ranked = zip(top_10['country'], top_10[col])
    for rank, (name, amount) in enumerate(ranked, start=1):
        print(f"{rank:2d}. {name}: {amount}")

In [None]:
# List the countries whose recorded total pure-alcohol figure is exactly zero
print("COUNTRIES WITH ZERO ALCOHOL CONSUMPTION:")
is_zero = df['total_litres_of_pure_alcohol'].eq(0)
zero_countries = df.loc[is_zero, 'country'].tolist()
print(f"Total: {len(zero_countries)} countries")
print("Countries:", ", ".join(zero_countries))

## 6. Interactive Exploration

In [None]:
# Function to explore specific countries
def explore_country(country_name, data=None):
    """Return the rows for every country whose name contains ``country_name``.

    Parameters
    ----------
    country_name : str
        Text to look for inside the ``country`` column. Matching is
        case-insensitive and literal: characters such as ``.`` or ``(`` are
        compared as plain text, not interpreted as a regular expression.
    data : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame or str
        The matching rows restricted to the country, servings, total-litres
        and continent columns, or a message string when nothing matches.
    """
    frame = df if data is None else data

    # regex=False keeps the search literal, so a query like "(" cannot raise a
    # regex error and "." does not match arbitrary characters.
    # na=False treats rows with a missing country name as non-matching.
    matches = frame['country'].str.contains(
        country_name, case=False, na=False, regex=False
    )
    country_data = frame[matches]

    if country_data.empty:
        return f"No country found containing '{country_name}'"

    return country_data[['country', 'beer_servings', 'spirit_servings', 'wine_servings',
                         'total_litres_of_pure_alcohol', 'continent']]

# Example usage - swap in any other substring to look up different countries
print("Example: Search for 'United'")
explore_country(country_name='United')

In [None]:
# Function to compare countries
def compare_countries(countries_list, data=None):
    """Plot and return the consumption figures for a set of countries.

    Parameters
    ----------
    countries_list : str or iterable of str
        Country names to compare. Names must match the ``country`` column
        exactly. A single name may be passed as a plain string.
    data : pandas.DataFrame, optional
        Frame to read from. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame or str
        The rows for the countries that were found, restricted to the country,
        servings, total-litres and continent columns, or a message string when
        none of the requested names exist in the data.

    Notes
    -----
    Draws a grouped bar chart as a side effect. Servings and litres share one
    y-axis, hence the combined axis label. Requested names that are not in the
    data are reported on stdout instead of being dropped silently.
    """
    frame = df if data is None else data

    # Accept a bare string as a one-element list; isin() rejects plain strings.
    if isinstance(countries_list, str):
        requested = [countries_list]
    else:
        requested = list(countries_list)

    comparison = frame[frame['country'].isin(requested)]
    if comparison.empty:
        return "No countries found in the list"

    # isin() is an exact match, so typos would otherwise vanish without notice.
    found = set(comparison['country'])
    missing = [name for name in requested if name not in found]
    if missing:
        print(f"Not found in dataset (skipped): {', '.join(map(str, missing))}")

    fig, ax = plt.subplots(figsize=(12, 6))
    comparison.set_index('country')[numerical_cols].plot(kind='bar', ax=ax)
    ax.set_title('Country Comparison - Alcohol Consumption')
    ax.set_ylabel('Servings/Litres')
    # Park the legend outside the axes so it does not cover the bars.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

    return comparison[['country', 'beer_servings', 'spirit_servings', 'wine_servings',
                       'total_litres_of_pure_alcohol', 'continent']]

# Example: compare four European countries side by side
example_countries = ['Germany', 'France', 'Italy', 'Spain']
joined_names = ', '.join(example_countries)
print(f"Comparing: {joined_names}")
compare_countries(countries_list=example_countries)

## 7. Summary Statistics by Continent

In [None]:
# Mean, median, spread and range of every numeric column, per continent
summary_funcs = ['mean', 'median', 'std', 'min', 'max']
continent_stats = (
    df.groupby('continent')[numerical_cols]
    .agg(summary_funcs)
    .round(2)
)
continent_stats

## 8. Key Insights

### Major Findings:
1. **Europe** dominates in beer, wine, and total alcohol consumption
2. **North America** leads in spirit consumption
3. **Beer consumption** has the strongest correlation (0.836) with total alcohol consumption
4. **13 countries** report zero alcohol consumption, likely due to cultural/religious factors
5. **Wide variation** exists both within and between continents

### Top Global Consumers:
- **Beer**: Namibia (376 servings/year)
- **Spirits**: Grenada (438 servings/year) 
- **Wine**: France (370 servings/year)
- **Total Alcohol**: Belarus (14.4 litres/year)

Use the cells above to explore specific countries and create custom comparisons!