In [1]:
from tensorflow.keras import layers, models, Model
import numpy as np
import pandas as pd

In [2]:
# Import the data
path = 'https://static.bc-edx.com/ai/ail-v-1-0/m19/lesson_3/datasets/wine_quality.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.2,0.39,0.63,11.0,0.044,55.0,156.0,0.9974,3.09,0.44,8.7,ok,white
1,6.9,0.63,0.02,1.9,0.078,18.0,30.0,0.99712,3.4,0.75,9.8,ok,red
2,6.9,0.3,0.33,4.1,0.035,26.0,155.0,0.9925,3.25,0.79,12.3,good,white
3,7.3,0.42,0.38,6.8,0.045,29.0,122.0,0.9925,3.19,0.37,12.6,good,white
4,6.9,0.18,0.38,8.1,0.049,44.0,176.0,0.9958,3.3,0.54,9.8,ok,white


In [3]:
# Preprocess y
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Preprocess "quality" column (one-hot encoding)
quality_encoder = OneHotEncoder(sparse_output=False)
quality_encoded = quality_encoder.fit_transform(df[['quality']])
quality_columns = quality_encoder.get_feature_names_out(['quality'])
df_quality_encoded = pd.DataFrame(quality_encoded, columns=quality_columns)

# Preprocess "color" column (label encoding for binary; one-hot encoding for multiple categories)
color_encoder = LabelEncoder()
df['color_encoded'] = color_encoder.fit_transform(df['color'])

# Concatenate the encoded columns to the original DataFrame
df_processed = pd.concat([df, df_quality_encoded], axis=1)

# Drop the original "quality" and "color" columns
df_processed = df_processed.drop(['quality', 'color'], axis=1)

df_processed.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,color_encoded,quality_bad,quality_good,quality_ok
0,7.2,0.39,0.63,11.0,0.044,55.0,156.0,0.9974,3.09,0.44,8.7,1,0.0,0.0,1.0
1,6.9,0.63,0.02,1.9,0.078,18.0,30.0,0.99712,3.4,0.75,9.8,0,0.0,0.0,1.0
2,6.9,0.3,0.33,4.1,0.035,26.0,155.0,0.9925,3.25,0.79,12.3,1,0.0,1.0,0.0
3,7.3,0.42,0.38,6.8,0.045,29.0,122.0,0.9925,3.19,0.37,12.6,1,0.0,1.0,0.0
4,6.9,0.18,0.38,8.1,0.049,44.0,176.0,0.9958,3.3,0.54,9.8,1,0.0,0.0,1.0


In [4]:
# Split data into X and two separate y variables
X = df_processed.drop(columns=['quality_good', 'quality_ok', 'quality_bad', 'color_encoded'])

y_color = df_processed['color_encoded']

y_quality = df_processed[['quality_good', 'quality_ok', 'quality_bad']]

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_color_train, y_color_test, y_quality_train, y_quality_test = train_test_split(X, y_color, y_quality)

In [5]:
# Create the shared layers of the model

# Input layer
input_layer = layers.Input(shape=(X.shape[1],), name='input_features')

# Shared hidden layers
shared_layer1 = layers.Dense(64, activation='relu')(input_layer)
shared_layer2 = layers.Dense(32, activation='relu')(shared_layer1)

In [6]:
# Branch for quality prediction
quality_output = layers.Dense(3, activation='softmax', name='quality_output')(shared_layer2)

# Branch for color prediction
color_output = layers.Dense(1, activation='sigmoid', name='color_output')(shared_layer2)

In [7]:
# Create the model
model = Model(inputs=input_layer, outputs=[quality_output, color_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'quality_output': 'categorical_crossentropy', 'color_output': 'binary_crossentropy'},
              metrics={'quality_output': 'accuracy', 'color_output': 'accuracy'})

# Display the model summary
model.summary()

To fit the model to the data, we specify X_train as normal, but pass a dictionary for the y_data.

In [8]:
# Fit the model
model.fit(
    X,
    {'quality_output': y_quality, 'color_output': y_color},
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

Epoch 1/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - color_output_accuracy: 0.7951 - loss: 2.1331 - quality_output_accuracy: 0.7163 - val_color_output_accuracy: 0.9362 - val_loss: 0.9450 - val_quality_output_accuracy: 0.7062
Epoch 2/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - color_output_accuracy: 0.9382 - loss: 0.7952 - quality_output_accuracy: 0.7758 - val_color_output_accuracy: 0.9354 - val_loss: 0.8535 - val_quality_output_accuracy: 0.7454
Epoch 3/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - color_output_accuracy: 0.9470 - loss: 0.8022 - quality_output_accuracy: 0.7598 - val_color_output_accuracy: 0.9369 - val_loss: 0.8671 - val_quality_output_accuracy: 0.7508
Epoch 4/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - color_output_accuracy: 0.9438 - loss: 0.8032 - quality_output_accuracy: 0.7596 - val_color_output_accuracy: 0.9392 - val_loss: 0.797

<keras.src.callbacks.history.History at 0x1f14192e3e0>

In [9]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test, {'quality_output': y_quality_test, 'color_output': y_color_test})
test_results

[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 975us/step - color_output_accuracy: 0.9717 - loss: 0.7376 - quality_output_accuracy: 0.7533


[0.7208593487739563, 0.9710769057273865, 0.7556923031806946]

In [10]:
# Print the quality and color accuracy
print(f"Quality Accuracy: {test_results[3]}")
print(f"Color Accuracy: {test_results[4]}")


IndexError: list index out of range

In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load the dataset
file_path = '/mnt/data/Project 3 Baseball Stats.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
print(data.info())
print(data.head())

# Data Exploration
# Check for missing values
print(data.isnull().sum())

# Statistical summary
print(data.describe())

# Data Cleaning and Preprocessing
# Fill or drop missing values, if any
data = data.dropna()

# Feature Engineering
# Example: Create a new feature 'Run_Difference' as 'Runs Scored - Runs Allowed'
data['Run_Difference'] = data['R'] - data['RA']

# Select features and target variable
features = data[['Run_Difference', 'HR', 'ERA', 'W']]
target = data['Playoffs']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initial Model Implementation using RandomForestClassifier from scikit-learn
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_rf = model_rf.predict(X_test_scaled)

# Evaluate the model
print("RandomForestClassifier Evaluation")
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

# Plot feature importances
importances = model_rf.feature_importances_
indices = np.argsort(importances)[::-1]
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices], color="r", align="center")
plt.xticks(range(X_train.shape[1]), features.columns[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()

# Advanced Model Implementation using Neural Network (Keras)
model_nn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_nn.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the neural network model
loss, accuracy = model_nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Model Accuracy: {accuracy}")

# Predict on the test set
y_pred_nn = (model_nn.predict(X_test_scaled) > 0.5).astype("int32")

print("Neural Network Evaluation")
print(classification_report(y_test, y_pred_nn))
print(confusion_matrix(y_test, y_pred_nn))

# Advanced Model Implementation using Transformer Model from Hugging Face
# Assume we have some textual data for sentiment analysis or additional insights
textual_data = ["The team had a great season with many wins.", "The pitching performance was subpar this year."]

# Use VADER for sentiment analysis
analyzer = SentimentIntensityAnalyzer()
for sentence in textual_data:
    vs = analyzer.polarity_scores(sentence)
    print(f"{sentence}: {vs}")

# Load a pre-trained transformer model for sequence classification
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_transformer = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Tokenize and classify sample texts
inputs = tokenizer(textual_data, return_tensors="tf", padding=True, truncation=True)
outputs = model_transformer(inputs)
predictions = tf.nn.softmax(outputs.logits, axis=-1)

print("Transformer Model Predictions")
print(predictions)

# Summarize the project
print("Project Summary")
print("Data Exploration and Preprocessing steps were successfully completed.")
print("Initial model using RandomForestClassifier was implemented and evaluated.")
print("Advanced models using Neural Network and Transformer were implemented and evaluated.")
print("VADER sentiment analysis was performed on sample textual data.")
