In [2]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import plotly.express as px
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load the two wine-quality tables into DataFrames; the files use ';' as the
# field separator rather than a comma.
df_red = pd.read_csv(filepath_or_buffer="data/winequality-red.csv", delimiter=";")
df_white = pd.read_csv(filepath_or_buffer="data/winequality-white.csv", delimiter=";")

In [4]:
# Standardise the predictor columns (the first 11 columns; column 11 is used as
# the target further down) feature by feature: zero mean, unit variance per
# column. `preprocessing.normalize` was used here before, but with its default
# axis=1 it rescales every *row* to unit L2 norm, which blends the differently
# scaled columns together instead of putting them on a common scale.
x_white = preprocessing.StandardScaler().fit_transform(df_white.iloc[:, :11])
x_red = preprocessing.StandardScaler().fit_transform(df_red.iloc[:, :11])

# The scaler hands back a bare ndarray; restore the column names so each panel
# of the scatter matrix is labelled with its feature name instead of a
# positional index.
red_scaled = pd.DataFrame(x_red, columns=df_red.columns[:11])
fig = px.scatter_matrix(red_scaled, width=2000, height=2000)
fig.show()

In [None]:
# Pairwise correlation between all columns of the red-wine table, drawn as a
# 500x500 heat-map.
red_corr = df_red.corr()
fig = px.imshow(red_corr, height=500, width=500)
fig.show()

In [None]:
# Pairwise correlation between all columns of the white-wine table, drawn as a
# 500x500 heat-map.
white_corr = df_white.corr()
fig = px.imshow(white_corr, height=500, width=500)
fig.show()

In [5]:
# Targets: column 11 of each table.
y_white = df_white.iloc[:, 11]
y_red = df_red.iloc[:, 11]

# Predictors: the first 11 columns, standardised per feature (zero mean, unit
# variance for every column). The previous `preprocessing.normalize` call
# scaled each *row* to unit L2 norm (its default axis=1), so the columns with
# the largest raw magnitudes dominated every sample instead of all features
# being brought onto a comparable scale.
# NOTE(review): the scaler statistics are computed on the full table, before
# any train/test split. If strict hold-out hygiene is required, move the
# scaler into a Pipeline that is fitted on the training rows only.
x_white = preprocessing.StandardScaler().fit_transform(df_white.iloc[:, :11])
x_red = preprocessing.StandardScaler().fit_transform(df_red.iloc[:, :11])

In [None]:
# Baseline for white wine: ordinary least squares evaluated on a seeded
# 70/30 hold-out split.
white_split = train_test_split(x_white, y_white, test_size=0.3, random_state=42)
X_white_train, X_white_test, y_white_train, y_white_test = white_split

# fit() returns the estimator itself, so `model` is the fitted regressor.
model = LinearRegression().fit(X_white_train, y_white_train)

# Last expression of the cell: the score on the held-out rows is displayed.
model.score(X_white_test, y_white_test)

In [None]:
# Baseline for red wine: ordinary least squares evaluated on a seeded
# 70/30 hold-out split.
red_split = train_test_split(x_red, y_red, test_size=0.3, random_state=42)
X_red_train, X_red_test, y_red_train, y_red_test = red_split

# fit() returns the estimator itself, so `model` is the fitted regressor.
model = LinearRegression().fit(X_red_train, y_red_train)

# Last expression of the cell: the score on the held-out rows is displayed.
model.score(X_red_test, y_red_test)

In [20]:
# Split FIRST, then learn the principal components from the training rows only.
# Fitting PCA on the whole matrix before splitting lets the held-out rows shape
# the components, which leaks test information into the features.
# The split arguments (test_size, random_state) are unchanged and the number of
# rows is the same, so each partition contains the same rows as before.
(
    X_white_raw_train,
    X_white_raw_test,
    y_white_pca_train,
    y_white_pca_test,
) = train_test_split(x_white, y_white, test_size=0.3, random_state=42)

pca = PCA(n_components=10).fit(X_white_raw_train)
X_white_pca_train = pca.transform(X_white_raw_train)
X_white_pca_test = pca.transform(X_white_raw_test)

# Projection of every row with the train-fitted components. Kept only so the
# name `x_white_pca` stays defined; it contains the test rows, so do not train
# on it.
x_white_pca = pca.transform(x_white)

# Scree plot: share of variance captured by each component, numbered from 1.
# The ratios now describe the training rows, since that is what PCA was fitted on.
PC_values = np.arange(pca.n_components_) + 1
px.line(
    x=PC_values,
    y=pca.explained_variance_ratio_,
    title="Scree plot",
    labels={"x": "N Components", "y": "Variance explained"},
)

In [None]:
# Linear regression on the PCA-projected white-wine features.
# fit() returns the estimator itself, so `model` is the fitted regressor.
model = LinearRegression().fit(X_white_pca_train, y_white_pca_train)

# Last expression of the cell: the score on the held-out PCA features is displayed.
model.score(X_white_pca_test, y_white_pca_test)

In [None]:
# Split FIRST, then learn the principal components from the training rows only.
# Fitting PCA on the whole matrix before splitting lets the held-out rows shape
# the components, which leaks test information into the features.
# The split arguments (test_size, random_state) are unchanged and the number of
# rows is the same, so each partition contains the same rows as before.
X_red_raw_train, X_red_raw_test, y_red_pca_train, y_red_pca_test = train_test_split(
    x_red, y_red, test_size=0.3, random_state=42
)

# NOTE(review): only 2 components are kept here while the white-wine PCA keeps
# 10 — confirm the difference is intentional.
pca = PCA(n_components=2).fit(X_red_raw_train)
X_red_pca_train = pca.transform(X_red_raw_train)
X_red_pca_test = pca.transform(X_red_raw_test)

# Projection of every row with the train-fitted components. Kept only so the
# name `x_red_pca` stays defined; it contains the test rows, so do not train
# on it.
x_red_pca = pca.transform(x_red)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Degree-2 polynomial regression on the red-wine features, 70/30 hold-out.
# random_state pins the shuffle: without it the split (and therefore the
# displayed score) changed on every run. 42 matches the seed used by the other
# splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    x_red, y_red, test_size=0.3, random_state=42
)

# Expand the features to all degree-2 polynomial terms, then fit least squares.
model = make_pipeline(PolynomialFeatures(2), LinearRegression())
model.fit(X_train, y_train)

# Last expression of the cell: the score on the held-out rows is displayed.
model.score(X_test, y_test)

In [None]:
# Degree-2 polynomial regression on the white-wine features, 70/30 hold-out.
# random_state pins the shuffle: without it the split (and therefore the
# displayed score) changed on every run. 42 matches the seed used by the other
# splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    x_white, y_white, test_size=0.3, random_state=42
)

# Expand the features to all degree-2 polynomial terms, then fit least squares.
model = make_pipeline(PolynomialFeatures(2), LinearRegression())
model.fit(X_train, y_train)

# Last expression of the cell: the score on the held-out rows is displayed.
model.score(X_test, y_test)