In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
# https://towardsdatascience.com/basic-binary-sentiment-analysis-using-nltk-c94ba17ae386

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.text import Text
from nltk.corpus import stopwords
import re

from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
# Read the wine-reviews CSV into the ``wine`` DataFrame used by the cells below.
wine = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv")
# Preview the first rows of the frame.
wine.head()
# TODO (Wednesday): convert categorical columns to numerical, drop description and designation,
# normalize points, separate price into a target table, remove nulls, and clean the data

In [None]:
def remove_punctuation(sentence):
    """Return ``sentence`` with every character removed that is neither a
    word character (``\\w``) nor whitespace (``\\s``)."""
    not_word_or_space = r'[^\w\s]'
    return re.sub(not_word_or_space, "", sentence)
def _clean_sentences(description):
    """Split one review into sentences and strip the punctuation from each."""
    return [remove_punctuation(sent) for sent in sent_tokenize(description)]


# Assign whole columns on the frame. Writing to the ``row`` yielded by
# ``DataFrame.iterrows()`` only changes a temporary copy, so the values would
# never be stored in ``wine``.
# "clean sentences": list of punctuation-free sentences per review.
wine["clean sentences"] = wine["description"].map(_clean_sentences)
# "words": one token list per cleaned sentence.
wine["words"] = wine["clean sentences"].map(
    lambda sentences: [word_tokenize(sent) for sent in sentences]
)

In [None]:
# Build the stop-word lookup once. It gets its own name so the imported
# ``stopwords`` corpus reader is not overwritten (rebinding it made this cell
# fail on a second run), and it stays a set for constant-time membership tests.
stop_words = set(stopwords.words("english"))

def remove_stopword(sentence):
    """Return the tokens of ``sentence`` that are not English stop words.

    Parameters
    ----------
    sentence : iterable of str
        The tokens of one sentence.

    Returns
    -------
    list of str
        The tokens, in their original order and casing, whose lowercase form
        is not in ``stop_words``.
    """
    # Compare in lowercase so capitalised tokens such as "The" are matched too.
    return [w for w in sentence if w.lower() not in stop_words]

# Filter the token lists, not the sentence strings: iterating a string hands
# ``remove_stopword`` single characters instead of words.
# The result is assigned as a column because writes to an ``iterrows()`` row
# are made on a copy and never reach the frame.
# NOTE(review): reads the "words" column (one token list per sentence); that
# column must already be stored on ``wine`` when this cell runs.
wine["filtered words"] = wine["words"].map(
    lambda sentences: [remove_stopword(tokens) for tokens in sentences]
)

In [None]:
# Remove the four columns that the rest of the notebook does not use.
wine = wine.drop(
    ["Unnamed: 0", "designation", "taster_name", "taster_twitter_handle"],
    axis="columns",
)

In [None]:
# Keep only the rows that have no missing value in any remaining column.
wine = wine.dropna(axis="index", how="any")

In [None]:
all_words = []
reviews = []
# Tag prefixes to keep; tags starting with "J" are the adjective tags.
pos = ["J"]

for idx, row in wine.iterrows():
    # Store each review as a (text, label) tuple. The sentences are joined
    # into one string because find_features() tokenises a single document.
    # The column is named "clean sentences" (not "cleaned sentences").
    # NOTE(review): every review is labelled "pos", so the classifiers below
    # only ever see one class -- confirm how the labels should be derived.
    reviews.append((" ".join(row["clean sentences"]), "pos"))

    # "filtered words" holds one token list per sentence, so tag sentence by
    # sentence; nltk.pos_tag() expects a flat list of tokens.
    for tagged_sentence in nltk.pos_tag_sents(row["filtered words"]):
        # Collect every adjective, lowercased, for the frequency count.
        for word, tag in tagged_sentence:
            if tag[:1] in pos:
                all_words.append(word.lower())

In [None]:
# Count how often each collected adjective occurs.
all_words = nltk.FreqDist(all_words)

# Keep the 5000 most frequent words as the feature vocabulary.
# most_common() orders by count; slicing keys() would only return the first
# 5000 distinct words in insertion order.
word_features = [word for word, _count in all_words.most_common(5000)]

# Function to create a dictionary of features for a review.
# The keys are the words in word_features.
# The value of each key is True or False depending on whether that word
# appears in the review.

def find_features(document):
    """Map every word in ``word_features`` to its presence in ``document``.

    Parameters
    ----------
    document : str
        The review text; it is tokenised with ``word_tokenize``.

    Returns
    -------
    dict
        ``{word: bool}`` with one entry per word in ``word_features``.
    """
    # Lowercase the tokens because the feature words are collected in
    # lowercase, and use a set so each lookup is constant time instead of a
    # scan over the token list.
    present = {token.lower() for token in word_tokenize(document)}
    return {w: (w in present) for w in word_features}

# Creating features for each review
featuresets = [(find_features(rev), category) for (rev, category) in reviews]

# Shuffle with a fixed seed so the split is identical on every run. A private
# Random instance is used so the global random state is left untouched.
random.Random(42).shuffle(featuresets)

# Train on the first 20000 reviews, but never on more than 80% of the data,
# so the test split cannot end up empty on a smaller dataset.
train_size = min(20000, int(len(featuresets) * 0.8))
training = featuresets[:train_size]
testing = featuresets[train_size:]

In [None]:
# Fit NLTK's Naive Bayes classifier on the training split.
classifier = nltk.NaiveBayesClassifier.train(training)

# Report accuracy on the held-out split as a percentage.
print(
    "Classifier accuracy percent:",
    100 * nltk.classify.accuracy(classifier, testing),
)

# List the 20 features the model found most informative.
classifier.show_most_informative_features(n=20)

In [None]:
# Build both inputs in this cell so it runs on a fresh kernel: `preds` was
# never defined and `ground_truth` was only assigned in a later cell.
# NOTE(review): presumably this scores the Naive Bayes `classifier` trained in
# the previous cell -- confirm.
ground_truth = [label for _features, label in testing]
preds = [classifier.classify(features) for features, _label in testing]
f1_score(ground_truth, preds, labels = ["neg", "pos"], average = "micro")

In [None]:
# Wrap each scikit-learn estimator in NLTK's SklearnClassifier and fit it on
# the training split. train() returns the wrapper itself, so construction and
# fitting are chained.

MNB_clf = SklearnClassifier(MultinomialNB()).train(training)

BNB_clf = SklearnClassifier(BernoulliNB()).train(training)

LogReg_clf = SklearnClassifier(LogisticRegression()).train(training)

SGD_clf = SklearnClassifier(SGDClassifier()).train(training)

SVC_clf = SklearnClassifier(SVC()).train(training)

In [None]:
ground_truth = [r[1] for r in testing]
predictions = {}
f1_scores = {}

# Display name -> one-element list holding the trained model; the loop below
# reads the model from index 0.
# NOTE(review): `classifiers_dict` was not defined anywhere visible in this
# notebook. It is assembled here from the models trained in the earlier
# cells -- confirm the intended membership.
classifiers_dict = {
    "ONB": [classifier],
    "MNB": [MNB_clf],
    "BNB": [BNB_clf],
    "LogReg": [LogReg_clf],
    "SGD": [SGD_clf],
    "SVC": [SVC_clf],
}

for clf, listy in classifiers_dict.items():
    # Each element of `testing` is a (featureset, label) tuple: classify the
    # featureset and compare against the label collected in ground_truth.
    predictions[clf] = [listy[0].classify(r[0]) for r in testing]
    # The labels are strings, so the averaging mode is given explicitly; the
    # default binary mode looks for pos_label=1 and raises on "neg"/"pos".
    f1_scores[clf] = f1_score(
        ground_truth, predictions[clf], labels = ["neg", "pos"], average = "micro"
    )
    print(f'f1_score {clf}: {f1_scores[clf]}')

In [None]:
from collections import Counter

from nltk.classify import ClassifierI

# Defining the ensemble model class

class EnsembleClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed group of classifiers.

    Each wrapped classifier must provide ``classify(features)``. The ensemble
    returns the label chosen by the most classifiers. On a tie, the tied label
    that was voted for first (in constructor order) wins.
    """

    def __init__(self, *classifiers):
        """Store the classifiers that will vote, in the order given."""
        self._classifiers = classifiers

    def _tally(self, features):
        """Collect one vote per classifier and return the counts per label.

        Raises
        ------
        ValueError
            If the ensemble was built without any classifier.
        """
        if not self._classifiers:
            raise ValueError("EnsembleClassifier has no classifiers to vote")
        return Counter(c.classify(features) for c in self._classifiers)

    def classify(self, features):
        """Return the label that received the most votes for ``features``."""
        # Counter.most_common never raises on a tie, unlike statistics.mode
        # on Python versions before 3.8.
        winner, _votes = self._tally(features).most_common(1)[0]
        return winner

    def confidence(self, features):
        """Return the share of classifiers (0-1] that voted for the winner."""
        tally = self._tally(features)
        _winner, winning_votes = tally.most_common(1)[0]
        return winning_votes / sum(tally.values())

In [None]:
# Load all classifiers from the pickled files 

# function to load models given filepath
def load_model(file_path):
    """Unpickle and return the object stored at ``file_path``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the pickle file, opened in binary mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    # NOTE: pickle.load can execute arbitrary code, so only load files that
    # come from a trusted source.
    # The context manager closes the file even when unpickling fails.
    with open(file_path, "rb") as classifier_f:
        return pickle.load(classifier_f)

# NOTE(review): nothing visible in this notebook writes the files under
# 'pickled_algos/', so these loads need the pickles to exist beforehand --
# confirm where they are produced.

# Original Naive Bayes Classifier
ONB_Clf = load_model('pickled_algos/ONB_clf.pickle')

# Multinomial Naive Bayes Classifier
MNB_Clf = load_model('pickled_algos/MNB_clf.pickle')


# Bernoulli Naive Bayes Classifier
BNB_Clf = load_model('pickled_algos/BNB_clf.pickle')

# Logistic Regression Classifier
LogReg_Clf = load_model('pickled_algos/LogReg_clf.pickle')

# Stochastic Gradient Descent Classifier
SGD_Clf = load_model('pickled_algos/SGD_clf.pickle')

# Initializing the ensemble classifier
ensemble_clf = EnsembleClassifier(ONB_Clf, MNB_Clf, BNB_Clf, LogReg_Clf, SGD_Clf)

# Keep only the feature dictionary from each (featureset, label) tuple.
# The held-out split is named `testing`; `testing_set` is never defined.
feature_list = [f[0] for f in testing]

# Classify each review with the ensemble
ensemble_preds = [ensemble_clf.classify(features) for features in feature_list]

In [None]:
def sentiment(text):
    """Classify a single review with the ensemble.

    Parameters
    ----------
    text : str
        The review text; it is turned into features by ``find_features``.

    Returns
    -------
    tuple
        ``(label, confidence)`` as returned by ``ensemble_clf.classify`` and
        ``ensemble_clf.confidence`` for those features.
    """
    review_features = find_features(text)
    label = ensemble_clf.classify(review_features)
    vote_share = ensemble_clf.confidence(review_features)
    return label, vote_share

In [None]:
### LDA ###
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import os
import re
import tensorflow as tf
import sqlite3
from sqlite3 import Error
import wordcloud
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import warnings
import pickle 
import pyLDAvis
from pyLDAvis import sklearn as sklearn_lda

# Read the reviews CSV again for the LDA section. This rebinds ``wine``, so
# the frame modified by the earlier cells is replaced by the freshly read one.
wine = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv")
# Preview the first rows of the frame.
wine.head()

In [None]:
# Lowercase every description with the vectorised string accessor instead of
# a Python-level lambda. Rows without a description are dropped first, since
# a missing value would break the string join and the vectorizer further down.
desc = wine["description"].dropna().str.lower()

In [None]:
# Concatenate all descriptions into one comma-separated string for the cloud.
desc_string = ",".join(desc.tolist())

# NOTE: this rebinds the name `wordcloud`, which the import cell bound to the
# wordcloud module.
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(desc_string)
# Render the generated cloud as an image for display.
wordcloud.to_image()

In [None]:
def plot_most_common_words(count_data, count_vectorizer, num):
    """Draw a bar chart of the ``num`` terms with the highest total count.

    Parameters
    ----------
    count_data : matrix
        Document-term count matrix; column ``j`` belongs to the ``j``-th
        feature name of ``count_vectorizer``.
    count_vectorizer : fitted vectorizer
        Supplies the feature names for the columns of ``count_data``.
    num : int
        How many of the most frequent terms to plot.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever this installation provides.
    get_names = (
        getattr(count_vectorizer, "get_feature_names_out", None)
        or count_vectorizer.get_feature_names
    )
    words = get_names()

    # Column sums give each term's total count in one pass, instead of
    # converting the matrix to dense arrays one document at a time.
    total_counts = np.asarray(count_data.sum(axis=0), dtype=float).ravel()

    # Highest counts first; keep the top `num` (word, count) pairs.
    top_terms = sorted(
        zip(words, total_counts), key=lambda pair: pair[1], reverse=True
    )[0:num]
    words = [term for term, _count in top_terms]
    counts = [count for _term, count in top_terms]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize = (15, 15/1.6180))
    plt.subplot(title = "{0} most common words".format(num))
    sns.set_context("notebook", font_scale = 1.25, rc = {"lines.linewidth": 2.5})
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.barplot(x = x_pos, y = counts, palette = 'husl')
    plt.xticks(x_pos, words, rotation = 90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# Build a document-term count matrix from the descriptions, using
# scikit-learn's built-in English stop-word list.
count_vectorizer = CountVectorizer(stop_words = 'english')
count_data = count_vectorizer.fit_transform(desc)
# Plot the 10 most frequent terms.
plot_most_common_words(count_data, count_vectorizer, 10)

In [None]:
# Silence DeprecationWarning output for the rest of the session.
warnings.simplefilter("ignore", DeprecationWarning)

def print_topics(model, count_vectorizer, n):
    """Print the ``n`` highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_`` with one row of word weights per topic.
    count_vectorizer : fitted vectorizer
        Supplies the word for each column of ``model.components_``.
    n : int
        Number of words to print per topic.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever this installation provides.
    get_names = (
        getattr(count_vectorizer, "get_feature_names_out", None)
        or count_vectorizer.get_feature_names
    )
    words = get_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so the reversed tail slice yields the indices
        # of the n largest weights, largest first.
        top_indices = topic.argsort()[:-n - 1:-1]
        print(" ".join([words[i] for i in top_indices]))
        
num_topics = 5
num_words = 10

# Fit LDA on the count matrix using all CPU cores. random_state is fixed so
# the fitted topics are the same on every run.
lda = LDA(n_components = num_topics, n_jobs = -1, random_state = 42)
lda.fit(count_data)

print("Topics found via LDA:")
print_topics(lda, count_vectorizer, num_words)