In [11]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import classification_report
import scipy.sparse
import time
import pickle
from joblib import dump, load

from scipy import sparse


import pandas as pd
import numpy as np
import xgboost, textblob, string

In [12]:
data = pd.read_csv('winemag-data_first150k.csv', index_col=0)
#data.head()

In [13]:
conditions = [
    (data['points'] <= 84),
    (data['points'] > 84) & (data['points'] <= 88),
    (data['points'] > 88) & (data['points'] <= 92),
    (data['points'] > 92) & (data['points'] <= 96),
    (data['points'] > 96) & (data['points'] <= 100)
    ]

# create a list of the values we want to assign for each condition
class_list = [0,1,2,3,4]#['Low', 'OK', 'Good', 'Very Good', 'Excellent']

# create a new column and use np.select to assign values to it using our lists as arguments
data['class'] = np.select(conditions, class_list)

data = data[['description','winery', 'country','class']] # selecting columns
data = data.dropna() # deleting na rows
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150925 entries, 0 to 150929
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   description  150925 non-null  object
 1   winery       150925 non-null  object
 2   country      150925 non-null  object
 3   class        150925 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 5.8+ MB


In [14]:
data["united"] = data["country"] + " " + data['winery'] + " " + data["description"]

In [15]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data['united'],
                                                                    data['class'],
                                                                    test_size=0.33,
                                                                    random_state=42)

In [16]:
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [17]:
# create a count vectorizer object 
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(data['united'])

# transform the training and validation data using count vectorizer object
X_train =  tfidf_vect.transform(X_train)
X_test =  tfidf_vect.transform(X_test)

In [20]:
def train_model(classifier,as_features,X_train,X_test, y_train,y_test):
    # fit the training dataset on the classifier
    classifier.fit(X_train, y_train)
    
    dump(classifier,  type(classifier).__name__ + "_"+ as_features+'.joblib') 
    # predict the labels on validation dataset
    predictions = classifier.predict(X_test)
    
    return metrics.accuracy_score(predictions, y_test)

In [21]:
# Naive Bayes on Tfidf Vectors
accuracy = train_model(naive_bayes.MultinomialNB(),"count",X_train, X_test, y_train,y_test)
print ("Naive Bayes:", accuracy)

Naive Bayes: 0.6332369594024816


In [22]:
# RF on Tfidf Vectors
a = time.time()
accuracy = train_model(ensemble.RandomForestClassifier(n_jobs = -1),"tfidf",X_train,X_test, y_train,y_test)
print ("Random Forest: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

Random Forest:  0.7888206240212022 time:  5.567138675848643 min


In [23]:
# DT on Tfidf Vectors
a = time.time()
from sklearn.tree import DecisionTreeClassifier
accuracy = train_model(DecisionTreeClassifier(criterion='gini' ),"tfidf",X_train,X_test, y_train,y_test)
print ("Decision Tree:: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

Decision Tree::  0.7387664136850982 time:  2.4126781384150187 min


In [24]:
# LG on Tfidf Vectors
a = time.time()
accuracy = train_model(linear_model.LogisticRegression(max_iter = 500),"tfidf",X_train,X_test, y_train,y_test)
print ("Logistic Regression: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

Logistic Regression:  0.72918925430671 time:  1.3308545788129171 min


In [32]:
#MLP on Tfidf Vectors
a = time.time()
accuracy = train_model(MLPClassifier(), "tfidf",X_train,X_test, y_train,y_test)
print ("MLP: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

MLP:  0.8069911255672008 time:  25.118510178724925 min


In [34]:
# xgboost on Tfidf Vectors
a = time.time()
accuracy = train_model(xgboost.XGBClassifier(),"tfidf",X_train.tocsc(),X_test.tocsc(), y_train,y_test)
print ("xgboost: Counting Vectors: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

xgboost:  0.7120427257760109 time:  3.560242553551992 min


In [29]:
rf = load('RandomForestClassifier_tfidf.joblib')
nb = load('MultinomialNB_count.joblib')
lg = load('LogisticRegression_tfidf.joblib')
mlp = load('MLPClassifier_tfidf.joblib')
dt = load('DecisionTreeClassifier_tfidf.joblib')
xgb = load('XGBClassifier_tfidf.joblib')

In [30]:
rf_pred = rf.predict(X_test)
nb_pred = nb.predict(X_test)
lg_pred = lg.predict(X_test)
mlp_pred = mlp.predict(X_test)
dt_pred = dt.predict(X_test)
xgb_pred = xgb.predict(X_test)

In [31]:
print( "rf",metrics.accuracy_score(rf_pred, y_test),classification_report(y_test,rf_pred), sep = '\n')
print( "nb",metrics.accuracy_score(nb_pred, y_test),classification_report(y_test,nb_pred),sep = '\n')
print( "lg",metrics.accuracy_score(lg_pred, y_test),classification_report(y_test,lg_pred),sep = '\n')
print( "dt",metrics.accuracy_score(dt_pred, y_test),classification_report(y_test,dt_pred),sep = '\n')
print( "mlp",metrics.accuracy_score(mlp_pred, y_test),classification_report(y_test,mlp_pred),sep = '\n')
print( "xgb",metrics.accuracy_score(xgb_pred, y_test),classification_report(y_test,xgb_pred),sep = '\n')

rf
0.7888206240212022
              precision    recall  f1-score   support

           0       0.96      0.64      0.77      7679
           1       0.75      0.92      0.83     22056
           2       0.77      0.76      0.77     15917
           3       1.00      0.46      0.63      3971
           4       1.00      0.56      0.72       183

    accuracy                           0.79     49806
   macro avg       0.90      0.67      0.74     49806
weighted avg       0.81      0.79      0.78     49806

nb
0.6332369594024816
              precision    recall  f1-score   support

           0       0.96      0.24      0.39      7679
           1       0.62      0.89      0.73     22056
           2       0.62      0.63      0.62     15917
           3       0.97      0.02      0.04      3971
           4       0.00      0.00      0.00       183

    accuracy                           0.63     49806
   macro avg       0.63      0.36      0.36     49806
weighted avg       0.70      0.63

  _warn_prf(average, modifier, msg_start, len(result))


dt
0.7387664136850982
              precision    recall  f1-score   support

           0       0.73      0.72      0.72      7679
           1       0.76      0.79      0.77     22056
           2       0.73      0.72      0.72     15917
           3       0.66      0.59      0.62      3971
           4       0.66      0.58      0.62       183

    accuracy                           0.74     49806
   macro avg       0.71      0.68      0.69     49806
weighted avg       0.74      0.74      0.74     49806

mlp
0.8069911255672008
              precision    recall  f1-score   support

           0       0.81      0.83      0.82      7679
           1       0.82      0.84      0.83     22056
           2       0.80      0.77      0.79     15917
           3       0.73      0.73      0.73      3971
           4       0.86      0.63      0.73       183

    accuracy                           0.81     49806
   macro avg       0.80      0.76      0.78     49806
weighted avg       0.81      0.8