In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

In [3]:
raw_csv_data = pd.read_csv('wines_SPA.csv')
df = raw_csv_data.copy()

In [4]:
df_cleaned = df.dropna()
df_cleaned = df_cleaned.drop(['country', 'wine'], axis=1)

In [5]:
top10_wineries = df_cleaned['winery'].value_counts().nlargest(10).index
df_cleaned['winery_top10'] = df_cleaned['winery'].where(df_cleaned['winery'].isin(top10_wineries), "Other")
df_cleaned = df_cleaned.drop('winery', axis=1)

In [6]:
df_cleaned = df_cleaned[df_cleaned['year'] != 'N.V.'].copy()
df_cleaned['year'] = df_cleaned['year'].astype(int)
data_year = 2022
df_cleaned['wine_age'] = data_year - df_cleaned['year']
df_cleaned = df_cleaned.drop('year', axis=1)

In [7]:
df_cleaned['high_rating'] = (df_cleaned['rating'] >= 4.5).astype(int)
df_cleaned = df_cleaned.drop('rating', axis=1)

In [8]:
df_minority = df_cleaned[df_cleaned['high_rating'] == 1]
df_majority = df_cleaned[df_cleaned['high_rating'] == 0]
df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=42)
df_balanced = pd.concat([df_minority, df_majority_downsampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
X = df_balanced.drop('high_rating', axis=1)
y = df_balanced['high_rating']

In [10]:
X = pd.get_dummies(X, drop_first=True)

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [13]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [29]:
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8284518828451883
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       130
           1       0.82      0.80      0.81       109

    accuracy                           0.83       239
   macro avg       0.83      0.83      0.83       239
weighted avg       0.83      0.83      0.83       239

