## Decision Trees with wine data

- Data precrocessing will be kept to a minimum, because there are no null values, no missing values, and all values are numerical.
- Features to be engineered are all 11 phsyiochemical inputs, providing the most data to train the model.
- Training data will represent 80% of the data
- Testing data will represent 20% of the data
- A Decision tree model was chosen, because:
    - Despite the longer computation expense, the dataset being used is reasonably small and manageable w/respect to computation time
    - Because missing data effects this type of model the least, this allows us to use this type of model in the future if we have less information about a wine

In [28]:
# Initial imports
import psycopg2
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [29]:
# Created a connection string to connect to the AWS database
conn = psycopg2.connect(host="dbfinalproject.c5qdbdj5dsfl.us-east-2.rds.amazonaws.com", port = 5432, database="dbRedwine", user="postgres", password="Raindrops12")

In [30]:
# Query run of all the data from the redwine table in pgadmin

df_wine = pd.read_sql("""
SELECT * FROM redwine  
""",con= conn)

df_wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.1,25.0,67.0,1.0,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.09,15.0,54.0,1.0,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.08,17.0,60.0,1.0,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5


In [31]:
# Query run of all the data from the quality_description table in pgadmin

df_qualitydes = pd.read_sql("""
SELECT * FROM quality_description
""",con= conn)

df_qualitydes.head()

Unnamed: 0,quality,description
0,0,Not Good
1,1,Not Good
2,2,Not Good
3,3,Not Good
4,4,Average


In [32]:
## Joined the two tables
df_joined = pd.read_sql("""
SELECT a.*,b.description FROM redwine a inner join quality_description b on a.quality = b.quality 
""",con= conn)

df_joined.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,description
0,7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5,Average
1,7.8,0.88,0.0,2.6,0.1,25.0,67.0,1.0,3.2,0.68,9.8,5,Average
2,7.8,0.76,0.04,2.3,0.09,15.0,54.0,1.0,3.26,0.65,9.8,5,Average
3,11.2,0.28,0.56,1.9,0.08,17.0,60.0,1.0,3.16,0.58,9.8,6,Average
4,7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5,Average


## Preprocessing Wine Data

In [33]:
# Define features set
X = df_joined.copy()
X = X.drop(columns =["quality", "description"])
X.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.1,25.0,67.0,1.0,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.09,15.0,54.0,1.0,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.08,17.0,60.0,1.0,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4


In [34]:
# Define target vector
y = df_joined["description"].values.reshape(-1, 1)
y[:5]

array([['Average'],
       ['Average'],
       ['Average'],
       ['Average'],
       ['Average']], dtype=object)

In [35]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [36]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1199, 11)
(400, 11)
(1199, 1)
(400, 1)


In [37]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [38]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(1279, 11)
(320, 11)
(1279, 1)
(320, 1)


In [39]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [40]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [41]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


## Fitting the Decision Tree Model

In [42]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [43]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

## Making Predictions Using the Tree Model

In [44]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

## Model Evaluation

In [45]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

ValueError: Shape of passed values is (3, 3), indices imply (2, 2)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


## Visualizng the Decision Tree

In [None]:
# Create DOT data
dot_data = tree.export_graphviz(
    model, 
    out_file=None, 
    feature_names=X.columns, 
    class_names=["0", "1"],
    filled=True
)
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
# Show graph
Image(graph.create_png())

In [None]:
 # Saving the tree as PDF
file_path = Path("Resources/wine_tree.pdf")
graph.write_pdf(file_path)
# Saving the tree as PNG
file_path = Path("Resources/wine_tree.png")
graph.write_png(file_path)