# Notebook

In [None]:
pip install ucimlrepo



In [None]:
from ucimlrepo import fetch_ucirepo

# Download the UCI dataset with repository id 109 (the "Wine" dataset).
wine = fetch_ucirepo(id=109)

# Feature table and target table, both exposed as pandas DataFrames.
X, y = wine.data.features, wine.data.targets

# Print the dataset-level metadata first, then the per-variable information.
for section in (wine.metadata, wine.variables):
    print(section)


{'uci_id': 109, 'name': 'Wine', 'repository_url': 'https://archive.ics.uci.edu/dataset/109/wine', 'data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv', 'abstract': 'Using chemical analysis to determine the origin of wines', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 178, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1992, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C5PC7J', 'creators': ['Stefan Aeberhard', 'M. Forina'], 'intro_paper': {'title': 'Comparative analysis of statistical pattern recognition methods in high dimensional settings', 'authors': 'S. Aeberhard, D. Coomans, O. Vel', 'published_in': 'Pattern Recognition', 'year': 1994, 'url': 'https://www.semanticscholar.org/paper/83dc3e4030d7b9fbdbb4bde03ce12ab70ca10528', 'do

In [None]:
import pandas as pd

# Build the analysis frame from an independent copy of the feature table.
# Wrapping an existing frame in pd.DataFrame(...) with no effective
# index/columns/copy arguments can reuse the source frame's internal storage
# on older pandas releases, in which case adding 'target' below would also add
# it to wine.data.features (and therefore to X).
df = wine.data.features.copy()

# The targets table has a single column; attach it as a plain Series.
df['target'] = wine.data.targets.iloc[:, 0]

# Data types
print(df.dtypes)

# Basic statistics
print(df.describe())

print(df.head())

Alcohol                         float64
Malicacid                       float64
Ash                             float64
Alcalinity_of_ash               float64
Magnesium                         int64
Total_phenols                   float64
Flavanoids                      float64
Nonflavanoid_phenols            float64
Proanthocyanins                 float64
Color_intensity                 float64
Hue                             float64
0D280_0D315_of_diluted_wines    float64
Proline                           int64
target                            int64
dtype: object
          Alcohol   Malicacid         Ash  Alcalinity_of_ash   Magnesium  \
count  178.000000  178.000000  178.000000         178.000000  178.000000   
mean    13.000618    2.336348    2.366517          19.494944   99.741573   
std      0.811827    1.117146    0.274344           3.339564   14.282484   
min     11.030000    0.740000    1.360000          10.600000   70.000000   
25%     12.362500    1.602500    2.210000     

In [None]:
!pip uninstall ydata_profiling
!pip install ydata_profiling

Found existing installation: ydata-profiling 4.6.4
Uninstalling ydata-profiling-4.6.4:
  Would remove:
    /usr/local/bin/pandas_profiling
    /usr/local/bin/ydata_profiling
    /usr/local/lib/python3.10/dist-packages/pandas_profiling/*
    /usr/local/lib/python3.10/dist-packages/ydata_profiling-4.6.4.dist-info/*
    /usr/local/lib/python3.10/dist-packages/ydata_profiling/*
Proceed (Y/n)? 

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport

# Profile the combined features + target frame and render the report inline
# in the notebook.
profile = ProfileReport(df=df, title="Profiling Report")
profile.to_notebook_iframe()

In [None]:
# Key data-quality metrics: completeness, uniqueness, and missing values.
total_rows = len(df)

# Non-null count per column (taken from describe), expressed as a percentage
# of the total number of rows.
completeness = df.describe(include='all').T[['count']]
completeness_percentage = completeness / total_rows * 100

# Distinct values per column and null count per column.
uniqueness = df.nunique()
missingValues = df.isna().sum()

# Each metric is printed as: heading, values, blank separator.
for label, metric in (
    ('completeness (%)', completeness_percentage),
    ('uniqueness', uniqueness),
    ('missingValues', missingValues),
):
    print(label)
    print(metric)
    print('\n')

In [None]:
# Explore the distribution of each chemical feature
import matplotlib.pyplot as plt
chefeatures = df.drop(columns=['target'])

# Draw one 20-bin histogram per feature column, each in its own figure.
for feature, values in chefeatures.items():
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(values, bins=20, alpha=0.7)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Pairwise correlations between the chemical features (the 'target' column is
# excluded before computing them).
chefeatures = df.drop('target', axis=1)
correlation_matrix = chefeatures.corr(method='pearson')  # pearson is the default
print(correlation_matrix)


In [None]:
# Identify patterns, summary statistics, and potential outliers related to the chemical features.
import seaborn as sns
chefeatures = df.drop(columns=['target'])

# Identify patterns
sns.pairplot(data=chefeatures)
# y > 1 places the title above the grid; at the default position it is drawn
# on top of the first row of panels.
plt.suptitle('Chemical Features', y=1.02)
plt.show()

# Summary statistics
print(chefeatures.describe())

# Identify potential outliers
# NOTE(review): all features share one y-axis here, so columns with large
# magnitudes dominate the scale — consider standardising before comparing.
plt.figure(figsize=(12, 8))
sns.boxplot(data=chefeatures, orient='v', palette='Set1')
plt.title('Chemical Features')
plt.ylabel('Values')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Use Seaborn, Plotly, or both to generate some useful data visualizations.
import plotly.express as px

# 'target' holds integer class labels. Passed as integers, Plotly maps them to
# a continuous colour bar; casting to str gives one discrete colour and one
# legend entry per class.
plot_df = df.assign(target=df['target'].astype(str))
class_order = sorted(plot_df['target'].unique())

fig = px.scatter_3d(
    plot_df,
    x='Alcohol',
    y='Malicacid',
    z='Ash',
    color='target',
    category_orders={'target': class_order},  # stable legend order
)
fig.update_layout(title_text='Chemical Features', scene=dict(zaxis=dict(title='Ash')))
fig.show()

In [None]:
# Create scatter plots or pair plots to visually inspect relationships between different chemical features.
import plotly.express as px
# Scatter plot with Plotly
# Plot a 50-row random subset; the fixed seed keeps the same rows (and thus
# the same figure) on every run.
sample = df.sample(50, random_state=42)
fig = px.scatter(
  sample, x="Total_phenols", y="Flavanoids"
)
fig.update_layout(title_text='Correlations  between Total_phenols and Flavanoids')
fig.show()

In [None]:
# Violin plot of every chemical feature, drawn side by side on shared axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=chefeatures, palette='Set2', ax=ax)
ax.set_title('Seaborn Violin Plot of Chemical Features')
ax.set_ylabel('Values')
# Tilt the feature names so the labels do not run into each other.
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Utilize appropriate visualizations to showcase any insights gained during the EDA and data profiling stages.
# => To identify the top contributors to the target variable

from sklearn.ensemble import RandomForestClassifier

# Exclude 'target' from the predictors before fitting, so the label can never
# be used as an input feature. errors='ignore' makes this a no-op when X does
# not contain that column.
features = X.drop(columns=['target'], errors='ignore')
# y is a one-column DataFrame; flatten it to the 1-D label array that
# scikit-learn classifiers expect.
labels = y.to_numpy().ravel()

# Fixed seed so the importance ranking is reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(features, labels)

feature_importances = model.feature_importances_
feature_names = features.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(3)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=top_features)
plt.title('Top 3 Contributors to Target')
plt.show()