In [None]:
#A decision tree is a supervised learning algorithm used for both classification and regression tasks in machine learning. It is a tree-like structure where each internal node represents a feature, the branch represents a decision rule based on that feature, and each leaf node represents the outcome. Decision trees are intuitive and easy to understand, making them popular for both analysis and interpretation of data.

#Here's how a decision tree works:

"""1. Training Phase:
   - Given a dataset with features and corresponding labels, the decision tree algorithm recursively splits the dataset into subsets based on the values of the features.
   - At each step, the algorithm selects the best feature to split the data. The "best" feature is chosen based on a criterion that maximizes the information gain or minimizes impurity.
   - Common splitting criteria include Gini impurity, entropy, or classification error for classification tasks, and variance reduction for regression tasks.
   - This process continues recursively until one of the stopping conditions is met, such as reaching a maximum depth, having a minimum number of samples in a node, or no further improvement in impurity reduction.

2. Testing Phase:
   - Once the decision tree is trained, it can be used to make predictions on new, unseen data.
   - Given an input instance, the decision tree traverses from the root node down to a leaf node following the decision rules based on the features' values.
   - At each internal node, the decision tree evaluates the feature value and moves to the appropriate child node according to the decision rule.
   - When reaching a leaf node, the prediction is made based on the majority class in the case of classification or the mean value in the case of regression.

Decision trees have several advantages:

- Interpretability: Decision trees are easy to interpret and visualize, making them useful for understanding the decision-making process.
- Non-parametric: Decision trees make no assumptions about the underlying distribution of the data, making them suitable for both linear and non-linear relationships.
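- Handle Both Numerical and Categorical Data: Decision trees can, in principle, handle both numerical and categorical features without requiring pre-processing like one-hot encoding. (Note: scikit-learn's tree implementation does not support categorical features directly, so they must be encoded numerically first, as is done later in this notebook.)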
- Feature Importance: Decision trees can provide information about feature importance, which can be helpful for feature selection and understanding the importance of different variables in predicting the target variable.

However, decision trees also have limitations:

- Overfitting: Decision trees are prone to overfitting, especially when the tree depth is not properly controlled or when the dataset is noisy.
- Instability: Small changes in the data can result in significantly different trees, leading to instability.
- Bias towards Features with Many Levels: Decision trees tend to be biased towards features with a large number of levels or categories.
- Axis-Parallel Decision Boundaries: Each split tests a single feature, so decision trees create axis-parallel decision boundaries, which may not capture complex relationships in the data.

To address some of these limitations, ensemble methods like random forests and gradient boosting are often used, which combine multiple decision trees to improve performance and robustness.
"""

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Read the comma-separated wines_SPA.csv file into a DataFrame.
wine_data = pd.read_csv(
    filepath_or_buffer="wines_SPA.csv",
    sep=",",
)

In [None]:
# Preview the first five rows as a sanity check on the load.
wine_data.head(n=5)

In [None]:
import seaborn as sns

# Pairwise overview of the dataset: KDE curves on the diagonal, with missing
# values dropped before plotting.
g = sns.pairplot(
    data=wine_data,
    diag_kind="kde",
    dropna=True,
)
# Overlay 4-level density contours on the lower-triangle panels.
g.map_lower(sns.kdeplot, levels=4, color=".2")


In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
wine_data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = wine_data.describe()
# Leave the frame as the cell's last expression so the notebook renders it as a
# formatted table instead of the plain-text dump that print() produces.
summary_stats

In [None]:
# Distribution of the target variable: 20-bin histogram of the ratings with a
# KDE curve overlaid. Drawn on an explicit Axes so the labels and title are
# bound to this figure rather than to pyplot's implicit "current" figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(wine_data['rating'], kde=True, bins=20, color='skyblue', ax=ax)
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Wine Ratings')
plt.show()

In [None]:
# Correlation matrix over the numeric columns only. At this point the frame
# still contains categorical columns (e.g. 'type'), and DataFrame.corr() raises
# on non-numeric data in pandas >= 2.0, so restrict the input explicitly.
correlation_matrix = wine_data.select_dtypes(include='number').corr()

# Plot the correlation matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Pairplot restricted to the listed numerical columns. The title is attached
# to the grid's own figure instead of whichever figure pyplot considers current.
pair_grid = sns.pairplot(wine_data, vars=['rating', 'num_reviews', 'price', 'body', 'acidity'])
pair_grid.figure.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

In [None]:
# Boxplot of the rating, grouped by wine type.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=wine_data, x='type', y='rating', ax=ax)
ax.set_xlabel('Wine Type')
ax.set_ylabel('Rating')
ax.set_title('Distribution of Ratings by Wine Type')
# Tilt the category labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Countplot of wine types: number of rows per 'type' value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=wine_data, x='type', ax=ax)
ax.set_xlabel('Wine Type')
ax.set_ylabel('Count')
ax.set_title('Count of Wine Types')
# Tilt the category labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Forward-fill missing values: each gap takes the value from the previous row.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling, and
# reassigning (rather than inplace=True) avoids mutating the frame in place.
# NOTE: a missing value in the very first row has nothing to copy from and
# remains NaN after this step.
wine_data = wine_data.ffill()

# Convert categorical variables into numerical format using one-hot encoding
wine_data = pd.get_dummies(wine_data)

In [None]:
# Separate the predictors from the regression target.
X = wine_data.drop(columns=['rating'])  # every column except the rating
y = wine_data.loc[:, 'rating']  # the rating column on its own

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Initialize the decision tree model. The seed is fixed because scikit-learn
# permutes the candidate features at every split, so without a random_state
# the fitted tree (and every metric derived from it) can change between runs.
dtree = DecisionTreeRegressor(random_state=42)

# Train the model
dtree.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Run the fitted tree over the held-out features.
y_pred = dtree.predict(X_test)

# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Mean Squared Error:', mse)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Column labels as a plain Python list, for use as node labels.
feature_names = X.columns.tolist()

# Draw the fitted tree on a wide canvas with filled (coloured) nodes.
plt.figure(figsize=(20, 10))
plot_tree(dtree, filled=True, feature_names=feature_names)
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Define rating categories
low_threshold = 3.5
high_threshold = 4.5
category_labels = ['Low', 'Medium', 'High']

# Create a new categorical target variable based on rating categories:
# (0, 3.5] -> Low, (3.5, 4.5] -> Medium, (4.5, 5] -> High.
# NOTE(review): ratings outside (0, 5] would become NaN here, and a bucket may
# be empty depending on the data's actual rating range -- confirm the thresholds.
wine_data['rating_category'] = pd.cut(
    wine_data['rating'],
    bins=[0, low_threshold, high_threshold, 5],
    labels=category_labels,
)

# Split the data into features (X) and the new categorical target variable (y).
# The numeric rating is dropped too, since the target is derived from it.
X = wine_data.drop(columns=['rating', 'rating_category'])
y = wine_data['rating_category']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the decision tree model (seeded so the fitted tree is reproducible)
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# Predictions on the testing set
y_pred = dtree.predict(X_test)

# Calculate the confusion matrix. Passing the labels explicitly fixes the
# row/column order to Low, Medium, High (instead of alphabetical order) and keeps
# the matrix 3x3 even if a category does not occur in the test split.
cm = confusion_matrix(y_test, y_pred, labels=category_labels)
print("Confusion Matrix:")
print(cm)

# Plot the confusion matrix as a heatmap, with the class names on both axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=category_labels,
    yticklabels=category_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()