In [7]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import classification_report
import scipy.sparse
import time
import pickle
from joblib import dump, load

from scipy import sparse


import pandas as pd
import numpy as np
import xgboost, textblob, string

In [8]:
data = pd.read_csv('winemag-data_first150k.csv', index_col=0)
#data.head()

In [9]:
conditions = [
    (data['points'] <= 84),
    (data['points'] > 84) & (data['points'] <= 88),
    (data['points'] > 88) & (data['points'] <= 92),
    (data['points'] > 92) & (data['points'] <= 96),
    (data['points'] > 96) & (data['points'] <= 100)
    ]

# create a list of the values we want to assign for each condition
class_list = [0,1,2,3,4]#['Low', 'OK', 'Good', 'Very Good', 'Excellent']

# create a new column and use np.select to assign values to it using our lists as arguments
data['class'] = np.select(conditions, class_list)

data = data[['description','winery', 'country','class']] # selecting columns
data = data.dropna() # deleting na rows
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150925 entries, 0 to 150929
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   description  150925 non-null  object
 1   winery       150925 non-null  object
 2   country      150925 non-null  object
 3   class        150925 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 5.8+ MB


In [10]:
data["united"] = data["country"] + " " + data['winery'] + " " + data["description"]

In [11]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data['united'],
                                                                    data['class'],
                                                                    test_size=0.33,
                                                                    random_state=42)

In [12]:
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [13]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(data['united'])

# transform the training and validation data using count vectorizer object
X_train =  count_vect.transform(X_train)
X_test =  count_vect.transform(X_test)

In [21]:
def train_model(classifier,as_features,X_train,X_test, y_train,y_test):
    # fit the training dataset on the classifier
    classifier.fit(X_train, y_train)
    
    #dump(classifier,  type(classifier).__name__ + "_"+ as_features+'.joblib') 
    # predict the labels on validation dataset
    predictions = classifier.predict(X_test)
    
    return metrics.accuracy_score(predictions, y_test)

In [22]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(),"count",X_train, X_test, y_train,y_test)
print ("Naive Bayes:", accuracy)

Naive Bayes: 0.66857406738144


In [23]:
# RF on Count Vectors
a = time.time()
accuracy = train_model(ensemble.RandomForestClassifier(n_jobs = -1),"count",X_train,X_test, y_train,y_test)
print ("Random Forest: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

Random Forest:  0.7886800787053768 time:  5.061360418796539 min


In [24]:
# DT on Count Vectors
a = time.time()
from sklearn.tree import DecisionTreeClassifier
accuracy = train_model(DecisionTreeClassifier(criterion='gini' ),"count",X_train,X_test, y_train,y_test)
print ("Decision Tree:: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

Decision Tree::  0.7418182548287355 time:  2.834700604279836 min


In [25]:
# LG on Count Vectors
a = time.time()
accuracy = train_model(linear_model.LogisticRegression(max_iter = 500),"count",X_train,X_test, y_train,y_test)
print ("Logistic Regression: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

Logistic Regression:  0.772396899971891 time:  2.0053375482559206 min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
# MLP on Count Vectors
#a = time.time()
#accuracy = train_model(MLPClassifier(), "count",X_train,X_test, y_train,y_test)
#print ("MLP: Counting Vectors: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')



MLP: Counting Vectors:  0.8131148857567362 time:  39.944799693425495 min


In [27]:
# xgboost on Count Vectors
a = time.time()
accuracy = train_model(xgboost.XGBClassifier(),"count",X_train.tocsc(),X_test.tocsc(), y_train,y_test)
print ("xgboost: Counting Vectors: ", accuracy, 'time: ', (time.time() -a)/60 ,'min')

xgboost: Counting Vectors:  0.6978074930731237 time:  1.4257739265759786 min


In [17]:
rf = load('RandomForestClassifier_count.joblib')
nb = load('MultinomialNB_count.joblib')
lg = load('LogisticRegression_count.joblib')
mlp = load('MLPClassifier_count.joblib')
dt = load('DecisionTreeClassifier_count.joblib')
xgb = load('XGBClassifier_count.joblib')

In [18]:
rf_pred = rf.predict(X_test)
nb_pred = nb.predict(X_test)
lg_pred = lg.predict(X_test)
mlp_pred = mlp.predict(X_test)
dt_pred = dt.predict(X_test)
xgb_pred = xgb.predict(X_test)

In [19]:
print( "rf",metrics.accuracy_score(rf_pred, y_test),classification_report(y_test,rf_pred), sep = '\n')
print( "nb",metrics.accuracy_score(nb_pred, y_test),classification_report(y_test,nb_pred),sep = '\n')
print( "lg",metrics.accuracy_score(lg_pred, y_test),classification_report(y_test,lg_pred),sep = '\n')
print( "dt",metrics.accuracy_score(dt_pred, y_test),classification_report(y_test,dt_pred),sep = '\n')
print( "mlp",metrics.accuracy_score(mlp_pred, y_test),classification_report(y_test,mlp_pred),sep = '\n')
print( "xgb",metrics.accuracy_score(xgb_pred, y_test),classification_report(y_test,xgb_pred),sep = '\n')

rf
0.7886800787053768
              precision    recall  f1-score   support

           0       0.96      0.66      0.78      7679
           1       0.74      0.94      0.83     22056
           2       0.80      0.72      0.76     15917
           3       1.00      0.46      0.63      3971
           4       1.00      0.56      0.72       183

    accuracy                           0.79     49806
   macro avg       0.90      0.67      0.74     49806
weighted avg       0.81      0.79      0.78     49806

nb
0.66857406738144
              precision    recall  f1-score   support

           0       0.67      0.68      0.67      7679
           1       0.72      0.68      0.70     22056
           2       0.64      0.68      0.66     15917
           3       0.54      0.56      0.55      3971
           4       0.00      0.00      0.00       183

    accuracy                           0.67     49806
   macro avg       0.51      0.52      0.52     49806
weighted avg       0.67      0.67  