In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
import ast
from scipy.stats import uniform, randint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

In [40]:
wine_pd = pd.read_csv('wine_pd.csv')
wine_pd = wine_pd[['tokens','points']]

In [41]:
train, test = train_test_split(wine_pd, random_state=42)

X_train = train['tokens']
y_train = train['points']
X_test = test['tokens']
y_test = test['points']

In [42]:
X_train = X_train.apply(lambda x: ast.literal_eval(x))

# Join the lists into single strings
X_train = X_train.apply(lambda tokens: ' '.join(tokens))

X_train.head()

118969    aromatically dazzling wine shows complex notes...
3136      earthy deep true uco valley roots good fairly ...
77944     made moscato giallo grapes harvested late thou...
64964     fresh informal lean easygoing red offers aroma...
5082      snappy dry redfruit citrus aromas good opening...
Name: tokens, dtype: object

In [43]:
y_train.head()

118969    94
3136      89
77944     90
64964     85
5082      87
Name: points, dtype: int64

In [71]:
# Define the intervals
intervals = [80, 85, 90, 95, 100]

# Split the numbers into intervals
y_train_intervals = pd.cut(y_train, bins=intervals, labels=intervals[:-1], include_lowest=True)
y_test_intervals = pd.cut(y_test, bins=intervals, labels=intervals[:-1], include_lowest=True)
# Print the result



y_train_intervals.head()


118969    90
3136      85
77944     85
64964     80
5082      85
Name: points, dtype: category
Categories (4, int64): [80 < 85 < 90 < 95]

In [45]:
X_test = X_test.apply(lambda x: ast.literal_eval(x))

# Join the lists into single strings
X_test = X_test.apply(lambda tokens: ' '.join(tokens))

X_test.head()

77718    possibly little sweet soft easygoing chardonna...
67681    soft almost dry wine full mouth caramel spice ...
69877    generic whitefruit aromas peach apple slightly...
46544    winerys best nebula years still little soft sw...
186      rich pinot whose primary virtue fruit explodes...
Name: tokens, dtype: object

In [72]:


# Assuming wine_train_X, wine_train_y, wine_test_X, wine_test_y are already loaded

# Step 1: Create a pipeline with CountVectorizer and MultinomialNB
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Step 2: Train the model with the training data
model.fit(X_train, y_train)

# Step 3: Predict labels for the test data
predictions = model.predict(X_test)

# Optional: Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 23.64%


# Naive Bayes

In [84]:
nb_pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Define the parameter space for the random search
nb_parameters = {
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'vect__max_features': randint(10000,30000),  # number of features to consider
    'vect__max_df': (0.5, 0.75, 1.0),
    'clf__alpha': np.linspace(0.1, 2, 20),  # smoothing parameter
}

# Set up the random search with cross-validation
nb_random_search = RandomizedSearchCV(nb_pipeline, nb_parameters, n_iter=10, cv=5, random_state=42,verbose=3,return_train_score=True, n_jobs=-1)

# Execute the random search
nb_random_search.fit(X_train, y_train_intervals)

# Output the best parameters
print("Best parameters set:")
print(nb_random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END clf__alpha=0.7, vect__max_df=0.5, vect__max_features=15390;, score=(train=0.722, test=0.693) total time=   1.5s
[CV 2/5] END clf__alpha=0.7, vect__max_df=0.5, vect__max_features=15390;, score=(train=0.723, test=0.689) total time=   1.5s
[CV 4/5] END clf__alpha=0.7, vect__max_df=0.5, vect__max_features=15390;, score=(train=0.723, test=0.690) total time=   1.5s
[CV 5/5] END clf__alpha=0.7, vect__max_df=0.5, vect__max_features=15390;, score=(train=0.723, test=0.691) total time=   1.5s
[CV 1/5] END clf__alpha=1.0999999999999999, vect__max_df=0.5, vect__max_features=21284;, score=(train=0.699, test=0.672) total time=   1.5s
[CV 3/5] END clf__alpha=0.7, vect__max_df=0.5, vect__max_features=15390;, score=(train=0.722, test=0.689) total time=   1.6s
[CV 2/5] END clf__alpha=1.0999999999999999, vect__max_df=0.5, vect__max_features=21284;, score=(train=0.699, test=0.671) total time=   1.6s
[CV 3/5] END clf__alpha=1.09999999

In [85]:
score = nb_random_search.score(X_test, y_test_intervals)
print(f"Random Search Score: {score * 100:.2f}%")


Random Search Score: 69.44%


# SVM

In [95]:
# Step 1: Create a pipeline with CountVectorizer and SVM
model = make_pipeline(CountVectorizer(), SVC())

# Step 2: Random Search
param_distributions = {
    'svc__C': uniform(0.1, 10),
    'svc__gamma': uniform(0.1, 1)
}

random_search = RandomizedSearchCV(model, param_distributions, n_iter=5, cv=3, random_state=42, verbose=3, return_train_score=True, n_jobs=-1)

# Step 3: Train the model with the training data
random_search.fit(X_train, y_train_intervals)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END svc__C=0.6808361216819946, svc__gamma=0.9661761457749352;, score=(train=0.595, test=0.574) total time=108.0min
[CV 1/3] END svc__C=1.6601864044243653, svc__gamma=0.2559945203362026;, score=(train=1.000, test=0.614) total time=330.6min
[CV 2/3] END svc__C=1.6601864044243653, svc__gamma=0.2559945203362026;, score=(train=1.000, test=0.617) total time=417.1min
[CV 3/3] END svc__C=1.6601864044243653, svc__gamma=0.2559945203362026;, score=(train=1.000, test=0.616) total time=433.2min
[CV 1/3] END svc__C=7.41993941811405, svc__gamma=0.6986584841970366;, score=(train=1.000, test=0.593) total time=831.3min
[CV 1/3] END svc__C=3.845401188473625, svc__gamma=1.0507143064099163;, score=(train=1.000, test=0.593) total time=832.9min
[CV 2/3] END svc__C=3.845401188473625, svc__gamma=1.0507143064099163;, score=(train=1.000, test=0.594) total time=833.6min


KeyboardInterrupt: 

In [52]:
X_test.head()

77718    possibly little sweet soft easygoing chardonna...
67681    soft almost dry wine full mouth caramel spice ...
69877    generic whitefruit aromas peach apple slightly...
46544    winerys best nebula years still little soft sw...
186      rich pinot whose primary virtue fruit explodes...
Name: tokens, dtype: object

# KNN

In [70]:
from sklearn.neighbors import KNeighborsRegressor
KNN_regression_model = make_pipeline(CountVectorizer(), KNeighborsRegressor())
KNN_regression_model.fit(X_train, y_train_intervals)

# Step 2: Define the parameter space for the random search

parameters = {
    'countvectorizer__max_features': randint(8000,15000),  # number of words in the vocabulary
    'countvectorizer__max_df': (0.5, 0.75, 1.0),
    'kneighborsregressor__n_neighbors': [3, 5, 7, 10, 15],
    'kneighborsregressor__weights': ['uniform', 'distance'],
}

# Step 3: Set up the random search with cross-validation
KNN_random_search = RandomizedSearchCV(KNN_regression_model, parameters, n_iter=5, cv=3,verbose=3, random_state=42,return_train_score=True)

KNN_random_search.fit(X_train, y_train_intervals)

# Output the best parameters
print("Best parameters set:")
print(KNN_random_search.best_params_)
print(KNN_random_search.best_score_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END countvectorizer__max_df=1.0, countvectorizer__max_features=8860, countvectorizer__ngram_range=(1, 1), kneighborsregressor__n_neighbors=7, kneighborsregressor__weights=distance;, score=(train=1.000, test=-1.025) total time=  56.9s
[CV 2/3] END countvectorizer__max_df=1.0, countvectorizer__max_features=8860, countvectorizer__ngram_range=(1, 1), kneighborsregressor__n_neighbors=7, kneighborsregressor__weights=distance;, score=(train=1.000, test=-0.722) total time=  55.1s
[CV 3/3] END countvectorizer__max_df=1.0, countvectorizer__max_features=8860, countvectorizer__ngram_range=(1, 1), kneighborsregressor__n_neighbors=7, kneighborsregressor__weights=distance;, score=(train=1.000, test=-0.658) total time=  51.6s
[CV 1/3] END countvectorizer__max_df=0.5, countvectorizer__max_features=11092, countvectorizer__ngram_range=(1, 1), kneighborsregressor__n_neighbors=5, kneighborsregressor__weights=uniform;, score=(train=-0.447,

# Decision Tree Regressor

In [68]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint

# Step 1: Create a pipeline with CountVectorizer and DecisionTreeRegressor
decision_tree_model = make_pipeline(CountVectorizer(), DecisionTreeRegressor())

# Step 2: Randomized search
parameters = {
    'countvectorizer__max_features': randint(10000, 20000),
    'countvectorizer__max_df': (0.5, 0.75, 1.0),
    'decisiontreeregressor__max_depth': [20,25,30,35,40],
}

decision_tree_random_search = RandomizedSearchCV(decision_tree_model, parameters, n_iter=10, cv=3, random_state=42,verbose=3,return_train_score=True)

decision_tree_random_search.fit(X_train, y_train_intervals)

# Output the best parameters

print("Best parameters set:")
print(decision_tree_random_search.best_params_)
print(decision_tree_random_search.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END countvectorizer__max_df=1.0, countvectorizer__max_features=10860, countvectorizer__ngram_range=(1, 1), decisiontreeregressor__max_depth=30;, score=(train=0.547, test=0.258) total time=   5.2s
[CV 2/3] END countvectorizer__max_df=1.0, countvectorizer__max_features=10860, countvectorizer__ngram_range=(1, 1), decisiontreeregressor__max_depth=30;, score=(train=0.556, test=0.251) total time=   5.4s
[CV 3/3] END countvectorizer__max_df=1.0, countvectorizer__max_features=10860, countvectorizer__ngram_range=(1, 1), decisiontreeregressor__max_depth=30;, score=(train=0.557, test=0.261) total time=   5.5s
[CV 1/3] END countvectorizer__max_df=0.5, countvectorizer__max_features=15734, countvectorizer__ngram_range=(1, 2), decisiontreeregressor__max_depth=30;, score=(train=0.545, test=0.262) total time=   8.7s
[CV 2/3] END countvectorizer__max_df=0.5, countvectorizer__max_features=15734, countvectorizer__ngram_range=(1, 2), dec

In [None]:
prepoceesing = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()), ['points']),
    
    (CountVectorizer(), 'tokens')
)

In [97]:
# Random Forest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the pipeline
rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestRegressor())
])

# Parameters for RandomizedSearchCV
rf_param_dist = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'rf__n_estimators': randint(100, 500),
    'rf__max_depth': randint(3, 10),
    'rf__min_samples_split': randint(2, 11),
    'rf__min_samples_leaf': randint(1, 11)
}

# Set up RandomizedSearchCV
rf_random_search = RandomizedSearchCV(rf_pipeline, param_distributions=rf_param_dist, 
                                       n_iter=6, cv=3, scoring='neg_mean_squared_error', random_state=42, verbose=3, n_jobs=-1, return_train_score=True)

# Fit the model
rf_random_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters:", rf_random_search.best_params_)
print("Best score (MSE):", -rf_random_search.best_score_)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END rf__max_depth=9, rf__min_samples_leaf=4, rf__min_samples_split=9, rf__n_estimators=288, tfidf__ngram_range=(1, 1);, score=(train=-6.527, test=-6.770) total time= 6.3min
[CV 2/3] END rf__max_depth=5, rf__min_samples_leaf=8, rf__min_samples_split=6, rf__n_estimators=199, tfidf__ngram_range=(1, 2);, score=(train=-7.451, test=-7.788) total time= 6.3min
[CV 3/3] END rf__max_depth=5, rf__min_samples_leaf=8, rf__min_samples_split=6, rf__n_estimators=199, tfidf__ngram_range=(1, 2);, score=(train=-7.467, test=-7.871) total time= 6.4min
[CV 2/3] END rf__max_depth=9, rf__min_samples_leaf=10, rf__min_samples_split=4, rf__n_estimators=314, tfidf__ngram_range=(1, 1);, score=(train=-6.622, test=-6.815) total time= 6.4min
[CV 1/3] END rf__max_depth=9, rf__min_samples_leaf=10, rf__min_samples_split=4, rf__n_estimators=314, tfidf__ngram_range=(1, 1);, score=(train=-6.569, test=-6.776) total time= 6.5min
[CV 3/3] END rf__max_depth=9

In [98]:
rf_random_search.score(X_test, y_test)

-6.87858949283535