In [1]:
# SQLite-backed corpus reader; the PickledReviewsReader defined in the next cell is modelled on it
import pickle
import sqlite3

from nltk.corpus.reader.api import CorpusReader

# Default `fileids` regex for PickledReviewsReader: a run of word characters,
# whitespace, digits or hyphens followed by ".pickle". The leading negative
# lookahead (?!\.) rejects names that begin with a dot.
PKL_PATTERN = r'(?!\.)[\w\s\d\-]+\.pickle'

class SqliteCorpusReader(object):
    """
    Reader for a SQLite database of reviews.

    Every public accessor is a generator: its query is executed when
    iteration starts, all rows are fetched, and they are then yielded one at
    a time. All accessors share one cursor.

    The tokenizing accessors (sents, words, tagged_tokens) import nltk lazily,
    so the database accessors work even when nltk is not installed.
    """

    def __init__(self, path):
        """
        Open the SQLite database at ``path`` and create the shared cursor.
        """
        # Keep the connection itself so close() can release it explicitly.
        self._conn = sqlite3.connect(path)
        self._cur = self._conn.cursor()

    def close(self):
        """
        Close the underlying database connection.
        """
        self._conn.close()

    def _rows(self, sql):
        """
        Execute ``sql`` on the shared cursor and return all rows as a list
        of tuples.
        """
        self._cur.execute(sql)
        return self._cur.fetchall()

    def scores(self):
        """
        Yields one ``(score,)`` row per entry in the ``reviews`` table.
        """
        for row in self._rows("SELECT score FROM reviews"):
            yield row

    def texts(self):
        """
        Yields one ``(content,)`` row per entry in the ``content`` table.
        """
        for row in self._rows("SELECT content FROM content"):
            yield row

    def ids(self):
        """
        Yields one ``(reviewid,)`` row per entry in the ``content`` table.
        """
        for row in self._rows("SELECT reviewid FROM content"):
            yield row

    def ids_and_texts(self):
        """
        Yields ``(id, text)`` pairs from the ``content`` table.

        Uses ``SELECT *`` and unpacks each row into two values, so the table
        must have exactly two columns.
        """
        for idx, text in self._rows("SELECT * FROM content"):
            yield idx, text

    def scores_albums_artists_texts(self):
        """
        Yields each review as a (score, label, artist, review text) tuple,
        joining the reviews, labels, artists and content tables on reviewid.

        NOTE(review): the second element comes from ``labels.label``; the
        original code calls it the album name - confirm against the schema.
        """
        sql = """
              SELECT S.score, L.label, A.artist, R.content
              FROM [reviews] S
              JOIN labels L ON S.reviewid=L.reviewid
              JOIN artists A on L.reviewid=A.reviewid
              JOIN content R ON A.reviewid=R.reviewid
              """
        for score, album, band, text in self._rows(sql):
            yield (score, album, band, text)

    def albums(self):
        """
        Yields two-column rows from the ``labels`` table as (id, value) pairs.

        NOTE(review): the original docstring describes these as album names;
        the table is called ``labels`` - confirm what it holds.
        """
        for idx, album in self._rows("SELECT * FROM labels"):
            yield idx, album

    def artists(self):
        """
        Yields two-column rows from the ``artists`` table as (id, artist)
        pairs.
        """
        for idx, artist in self._rows("SELECT * FROM artists"):
            yield idx, artist

    def genres(self):
        """
        Yields two-column rows from the ``genres`` table as (id, genre) pairs.
        """
        for idx, genre in self._rows("SELECT * FROM genres"):
            yield idx, genre

    def years(self):
        """
        Yields two-column rows from the ``years`` table as (id, year) pairs.
        Note: There are many missing values
        """
        for idx, year in self._rows("SELECT * FROM years"):
            yield idx, year

    def paras(self):
        """
        Yields the text stored in each row returned by texts().

        texts() yields one-element rows, so each "paragraph" is the whole
        content value of a review; no paragraph splitting is performed.
        """
        for row in self.texts():
            for paragraph in row:
                yield paragraph

    def sents(self):
        """
        Yields sentences, produced by nltk.sent_tokenize on each paragraph.
        """
        # Imported here because `nltk` is not bound at module level in this
        # file; an import inside __init__ would only create a local name.
        import nltk
        for para in self.paras():
            for sentence in nltk.sent_tokenize(para):
                yield sentence

    def words(self):
        """
        Yields words, produced by nltk.wordpunct_tokenize on each sentence.
        """
        import nltk
        for sent in self.sents():
            for word in nltk.wordpunct_tokenize(sent):
                yield word

    def tagged_tokens(self):
        """
        Yields (token, tag) tuples for every token of every sentence.

        The whole token list of a sentence is handed to nltk.pos_tag: pos_tag
        expects a list of tokens, and a single string would be tagged
        character by character.
        """
        import nltk
        for sent in self.sents():
            for tagged in nltk.pos_tag(nltk.wordpunct_tokenize(sent)):
                yield tagged


In [2]:
# PickledReviewsReader, copied from GitHub
class PickledReviewsReader(CorpusReader):
    """
    Corpus reader for reviews stored as one pickle file per document.

    Each pickle is expected to unpickle to a two-item (text, score) pair,
    since reviews() and scores() unpack it that way.
    """

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Set up the reader on ``root``; ``fileids`` defaults to PKL_PATTERN.
        """
        CorpusReader.__init__(self, root, fileids, **kwargs)

    def texts_scores(self, fileids=None):
        """
        Yields the unpickled object of every selected corpus file, opening
        and loading one file at a time.
        """
        # NOTE: pickle.load can execute arbitrary code; only point this reader
        # at pickle files from a trusted source.
        for pickle_path, _encoding, _fileid in self.abspaths(fileids, True, True):
            with open(pickle_path, 'rb') as handle:
                yield pickle.load(handle)

    def reviews(self, fileids=None):
        """
        Yields the text half of every (text, score) document.
        """
        yield from (text for text, _score in self.texts_scores(fileids))

    def scores(self, fileids=None):
        """
        Yields the score half of every (text, score) document.
        """
        yield from (score for _text, score in self.texts_scores(fileids))

    def paras(self, fileids=None):
        """
        Yields every element of every review. Per the original documentation
        each element is a paragraph: a list of sentences, each of which is a
        list of (token, tag) tuples.
        """
        for review in self.reviews(fileids):
            yield from review

    def sents(self, fileids=None):
        """
        Yields every element of every paragraph. Per the original
        documentation each element is a sentence: a list of (token, tag)
        tuples.
        """
        for paragraph in self.paras(fileids):
            yield from paragraph

    def tagged(self, fileids=None):
        """
        Yields every element of every sentence, i.e. the individual tagged
        tokens.
        """
        for sentence in self.sents(fileids):
            yield from sentence

    def words(self, fileids=None):
        """
        Yields the first item of each tagged token (the token without its
        tag).
        """
        yield from (tagged_token[0] for tagged_token in self.tagged(fileids))


In [3]:
# import libraries
import numpy as np
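# import joblib directly because sklearn.externals.joblib is deprecated
# NOTE: the `from sklearn.externals import joblib` line below rebinds `joblib` to the deprecated module and should be removed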
import joblib 
from sklearn.externals import joblib



# function to return list from corpus reviews 
def documents(corpus):
    """
    Collect everything yielded by ``corpus.reviews()`` into a list.
    """
    return [review for review in corpus.reviews()]

# function to return scores for continuous value
def continuous(corpus):
    """
    Collect everything yielded by ``corpus.scores()`` into a list.
    """
    return [score for score in corpus.scores()]

# function to return scores for categorical value
def make_categories(corpus):
    """
    Bin the corpus scores into integer categories with np.digitize.

    The bin edges are [0.0, 3.0, 5.0, 7.0, 10.1]. np.digitize is called with
    its default right=False, so each bin includes its lower edge and excludes
    its upper edge:

    1 (terrible) : 0.0 <= y < 3.0
    2 (okay)     : 3.0 <= y < 5.0
    3 (great)    : 5.0 <= y < 7.0
    4 (amazing)  : 7.0 <= y < 10.1

    Scores below 0.0 map to 0 and scores of 10.1 or more map to 5.
    """
    bin_edges = [0.0, 3.0, 5.0, 7.0, 10.1]
    scores = continuous(corpus)
    return np.digitize(scores, bin_edges)






In [4]:
def train_model(path, model, continuous=True, saveto=None, cv=12):
    '''
    Trains a model on the pickled corpus at ``path``.

    Cross-validation scores are computed with ``cv`` folds, then the model is
    fitted on the full data and, if ``saveto`` is given, the fitted model is
    written there with joblib.

    path       : corpus root handed to PickledReviewsReader
    model      : estimator/pipeline exposing fit()
    continuous : True  -> targets are the raw scores, scored with 'r2'
                 False -> targets are the make_categories() bins, scored
                          with 'f1_macro'
    saveto     : optional file path for the fitted model
    cv         : cross-validation setting passed to cross_val_score

    Returns the array of cross-validation scores.

    NOTE: a later cell in this notebook defines another train_model with a
    different signature; once that cell has run, this version is shadowed.
    NOTE: cross_val_score is imported in a later cell; that import must have
    run before this function is called.
    '''

    # load the corpus data and labels
    corpus = PickledReviewsReader(path)

    # X (training data) is the list of review documents
    X = documents(corpus)

    if continuous:
        # Inside this function the name `continuous` is the boolean flag, not
        # the module-level continuous() helper, so calling it would fail.
        # Read the scores straight from the corpus instead.
        y = list(corpus.scores())
        # 'r2' is scikit-learn's scorer name for the R^2 metric
        scoring = 'r2'
    else:
        # the binning helper in this notebook is named make_categories
        y = make_categories(corpus)
        # plain 'f1' only handles binary targets; the bins are multi-class,
        # so an averaged variant is required
        scoring = 'f1_macro'

    # compute cross-validation scores
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)

    # fit the model on entire dataset
    model.fit(X, y)

    # write to disk if specified (after fitting, so the saved model is trained)
    if saveto:
        joblib.dump(model, saveto)

    # return scores
    return scores
    


In [5]:
# import libraries
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score
import pandas as pd

if __name__ == '__main__':
    # Builds an MLP regression pipeline and an MLP classification pipeline and
    # hands each one to train_model.

    # TextNormalizer from `transformer` and `transformers` was attempted, but it does not exist there
    # from transformers import TextNormalizer
    # NOTE(review): Normalizer is used in place of TextNormalizer; presumably it
    # expects numeric input and cannot process raw documents ahead of
    # TfidfVectorizer - verify.
    from sklearn.preprocessing import Normalizer

    # the `reader` module for PickledReviewsReader no longer seems to exist, so the class is defined in this notebook
    # from reader import PickledReviewsReader
    # also tried importing pickle and keras as alternatives to above
    import pickle
    # import keras
    
    # import sklearn libraries
    from sklearn.pipeline import Pipeline
    from sklearn.neural_network import MLPRegressor, MLPClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    # path to categorized comments
    # NOTE(review): read_json returns a DataFrame, not a path. train_model passes
    # this value to PickledReviewsReader as the corpus root, which matches the
    # "CorpusReader: expected a string or a PathPointer" TypeError recorded
    # below this cell. The reader presumably needs a directory of .pickle
    # files - confirm the intended corpus location.
    cpath = pd.read_json('categorized-comments.jsonl', lines=True, encoding='utf8')
    
    
    
    regressor = Pipeline([
        ('norm', Normalizer()),
        ('tfidf', TfidfVectorizer()),
        ('ann', MLPRegressor(hidden_layer_sizes=[500, 150], verbose=True))
    ])
    # run train model using regressor
    regression_scores = train_model(cpath, regressor, continuous=True)
        
   
    classifier = Pipeline([
        ('norm', Normalizer()),
        ('tfidf', TfidfVectorizer()),
        ('ann', MLPClassifier(hidden_layer_sizes=[500, 150], verbose=True))
    ])     
    
    # run train model using classifier   
    classifier_scores = train_model(cpath, classifier, continuous=False)
         
    

TypeError: CorpusReader: expected a string or a PathPointer

In [None]:
# import libraries
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC

# Confusion matrix for a logistic regression model, drawn with yellowbrick.

# Instantiate the classification model 
model = LogisticRegression()

# define classes for confusion matrix
# NOTE(review): no visible cell builds a two-class Approved / Not Approved
# target - confirm which dataset this cell is meant for.
classes = ['Approved','Not Approved']

# define confusion matrix with logistic regression model and classes
cm = ConfusionMatrix(model, classes=classes, percent=False)

# fit() fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
# NOTE(review): X and y are not assigned at notebook level in any visible cell
# (they only exist as locals inside train_model), so this raises NameError on
# a fresh kernel.
cm.fit(X, y)

# To create the ConfusionMatrix, we need some test data. score() runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
# NOTE(review): this scores on the same X, y that were used for fitting.
cm.score(X, y)

# change fontsize of the labels in the figure
for label in cm.ax.texts:
    label.set_size(20)

# How did we do?
cm.poof()

In [None]:
# import libraries
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC
import matplotlib.pyplot as plt

# Precision, Recall, and F1 Score
# Using classification report
# Reuses `model` and `classes` from the confusion-matrix cell above.
#%matplotlib inline

# set the size of the figure and the font size 
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['font.size'] = 20

# Instantiate the visualizer
visualizer = ClassificationReport(model, classes=classes)

# Fit the training data to the visualizer
# NOTE(review): features_train / target_train / features_test / target_test
# are first assigned in the MNIST cell further down the notebook, so on a
# top-to-bottom run they are undefined here.
visualizer.fit(features_train, target_train)  
# Evaluate the model on the test data
visualizer.score(features_test, target_test)  

# print results
# NOTE(review): this prints whatever poof() returns - presumably None or an
# Axes object depending on the yellowbrick version; verify it is wanted.
g = visualizer.poof()
print(g)


In [None]:
from keras.layers import Dense
from keras.models import Sequential
from keras import utils


# Size passed as the first layer's input shape in build_network; also used as
# max_features for the TfidfVectorizer in the pipeline cell.
N_FEATURES = 5000
# Number of units in the softmax output layer of build_network.
N_CLASSES = 4


def build_network():
    '''
    Build and compile the feed-forward classification network.

    Architecture: Dense(500, relu) -> Dense(150, relu) ->
    Dense(N_CLASSES, softmax), taking N_FEATURES inputs per sample.
    Compiled with categorical cross-entropy loss, the adam optimizer and
    accuracy as the reported metric.

    Returns the compiled Sequential model.
    '''
    nn = Sequential()
    # input_shape must be a shape tuple (excluding the batch dimension),
    # not a bare integer
    nn.add(Dense(500, activation='relu', input_shape=(N_FEATURES,)))
    nn.add(Dense(150, activation='relu'))
    nn.add(Dense(N_CLASSES, activation='softmax'))
    nn.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return nn



In [None]:
def train_model(path, model, saveto=None, cv=12):
    '''
    Trains a Keras-backed pipeline on the pickled corpus at ``path``.

    Cross-validation accuracy is computed with ``cv`` folds, then the
    pipeline is fitted on the full data.

    path   : corpus root handed to PickledReviewsReader
    model  : sklearn Pipeline whose last step wraps a Keras model (the step
             must expose the fitted network as ``.model``)
    saveto : optional dict with keys 'keras_model' and 'sklearn_pipe'; when
             given, the Keras network and the remaining sklearn steps are
             written to those two locations separately
    cv     : cross-validation setting passed to cross_val_score

    Returns the array of cross-validation scores.

    NOTE: this definition replaces the earlier train_model in this notebook,
    which additionally accepted a ``continuous`` argument.
    '''

    corpus = PickledReviewsReader(path)
    X = documents(corpus)
    # Integer class labels from the score bins. keras' to_categorical expects
    # a vector of class indices, not a corpus reader, and the 'accuracy'
    # scorer compares against 1-D labels.
    # NOTE(review): assumes the Keras wrapper one-hot encodes integer labels
    # itself for the categorical_crossentropy loss - confirm.
    y = make_categories(corpus)

    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    model.fit(X, y)

    if saveto:
        # save the Keras network through its own save() ...
        model.steps[-1][1].model.save(saveto['keras_model'])
        # ... then drop that final step and dump the remaining sklearn steps.
        # `steps` is a list, so the step is removed with steps.pop(-1).
        model.steps.pop(-1)
        joblib.dump(model, saveto['sklearn_pipe'])

    return scores
    
    
    

In [None]:
if __name__ == '__main__':
    # Builds `pipeline`: Normalizer -> TfidfVectorizer (max N_FEATURES terms)
    # -> KerasClassifier wrapping build_network.
    from sklearn.pipeline import Pipeline
    # NOTE(review): Normalizer stands in for the commented-out TextNormalizer;
    # presumably it expects numeric input and cannot process raw documents
    # ahead of TfidfVectorizer - verify.
    from sklearn.preprocessing import Normalizer
    # from transformer import TextNormalizer
    from keras.wrappers.scikit_learn import KerasClassifier
    from keras.utils import to_categorical

    from sklearn.feature_extraction.text import TfidfVectorizer
    # `utils` is not referenced anywhere in this cell
    from keras import utils

    
    pipeline = Pipeline([
        ('norm', Normalizer()),
        ('vect', TfidfVectorizer(max_features=N_FEATURES)),
        ('nn', KerasClassifier(build_fn=build_network,
                              epochs=200,
                              batch_size=128))
    ])
    
    

In [None]:
# Train the Keras pipeline and save its two parts to disk.
# NOTE(review): train_model passes cpath to PickledReviewsReader as the corpus
# root, whose files are matched against PKL_PATTERN (*.pickle); a CSV file
# name presumably will not work here - confirm the intended corpus location.
cpath = 'categorized-comments.csv'
# Output locations read by train_model: the Keras network and the remaining
# sklearn pipeline steps are saved separately.
# NOTE(review): 'keras_nn_h5' may have been meant as 'keras_nn.h5' - confirm.
mpath = {
    'keras_model' : 'keras_nn_h5', 
    'sklearn_pipe' : 'pipeline.pkl'
}
scores = train_model(cpath, pipeline, saveto=mpath, cv=12)


In [None]:
# import libraries
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC

# Confusion matrix for a logistic regression model, drawn with yellowbrick.
# This cell repeats an earlier confusion-matrix cell, but fits on
# features_train / target_train instead of X / y.

# Instantiate the classification model 
model = LogisticRegression()

# define classes for confusion matrix
# NOTE(review): no visible cell builds a two-class Approved / Not Approved
# target - confirm which dataset this cell is meant for.
classes = ['Approved','Not Approved']

# define confusion matrix with logistic regression model and classes
cm = ConfusionMatrix(model, classes=classes, percent=False)

# fit() fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
# NOTE(review): features_train / target_train / features_test / target_test
# are first assigned in the MNIST cell further down the notebook, so on a
# top-to-bottom run they are undefined here.
cm.fit(features_train, target_train)

# To create the ConfusionMatrix, we need some test data. score() runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(features_test, target_test)

# change fontsize of the labels in the figure
for label in cm.ax.texts:
    label.set_size(20)

# How did we do?
cm.poof()

In [None]:
# import libraries
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC
import matplotlib.pyplot as plt

# Precision, Recall, and F1 Score
# Using classification report
# Reuses `model` and `classes` from the confusion-matrix cell above; this cell
# repeats an earlier classification-report cell verbatim.
#%matplotlib inline

# set the size of the figure and the font size 
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['font.size'] = 20

# Instantiate the visualizer
visualizer = ClassificationReport(model, classes=classes)

# Fit the training data to the visualizer
# NOTE(review): features_train / target_train / features_test / target_test
# are first assigned in the MNIST cell further down the notebook, so on a
# top-to-bottom run they are undefined here.
visualizer.fit(features_train, target_train)  
# Evaluate the model on the test data
visualizer.score(features_test, target_test)  

# print results
# NOTE(review): this prints whatever poof() returns - presumably None or an
# Axes object depending on the yellowbrick version; verify it is wanted.
g = visualizer.poof()
print(g)


In [None]:
# import libraries
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K


# Convolutional network trained on the MNIST digit images.

# put the color channel first: image tensors are (samples, channels, height, width)
K.set_image_data_format('channels_first')

# set seed
np.random.seed(0)

# set image information
channels = 1
height = 28
width = 28

# load data and target from MNIST data
(data_train, target_train), (data_test, target_test) = mnist.load_data()

# reshape training image data into features
data_train = data_train.reshape(data_train.shape[0], channels, height, width)

# reshape test image data into features
data_test = data_test.reshape(data_test.shape[0], channels, height, width)

# rescale pixel intensity to between 0 and 1
features_train = data_train/255
features_test = data_test/255

# one-hot encode target
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]

# start neural network
network = Sequential()

# add convolutional layer with 64 filters, a 5x5 window, and ReLU activation function
# input_shape follows the same (channels, height, width) order used by the
# reshape calls above, so the two stay consistent for non-square images
network.add(Conv2D(filters=64,
                   kernel_size=(5,5),
                   input_shape=(channels, height, width),
                   activation='relu'))

# add max pooling layer with 2x2 window
network.add(MaxPooling2D(pool_size=(2, 2)))

# add dropout layer
network.add(Dropout(0.5))

# add layer to flatten input
network.add(Flatten())

# add fully connected layer of 128 units with a ReLU activation function
network.add(Dense(128, activation = 'relu'))

# add dropout layer
network.add(Dropout(0.5))

# add fully connected layer with a softmax activation function
network.add(Dense(number_of_classes, activation='softmax'))

# compile neural network
network.compile(loss='categorical_crossentropy',  # cross-entropy
                optimizer='rmsprop',  # root mean square propagation
                metrics=['accuracy'])  # accuracy performance metric

# train neural network
network.fit(features_train,  # features
           target_train,  # target
           epochs=2,  # number of epochs
           verbose=0,  # don't print description after each epoch
           batch_size=1000,  # number of observations per batch
           validation_data=(features_test, target_test))  # data for evaluation



In [None]:
# import libraries
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC

# Confusion matrix for a logistic regression model, drawn with yellowbrick.
# This cell repeats an earlier confusion-matrix cell verbatim.

# Instantiate the classification model 
model = LogisticRegression()

# define classes for confusion matrix
# NOTE(review): two class names are given, while the MNIST target prepared in
# the preceding cell is one-hot encoded with number_of_classes columns -
# confirm which dataset this cell is meant for.
classes = ['Approved','Not Approved']

# define confusion matrix with logistic regression model and classes
cm = ConfusionMatrix(model, classes=classes, percent=False)

# fit() fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
# NOTE(review): at this point features_train is the 4-D MNIST tensor
# (samples, channels, height, width) and target_train is one-hot encoded;
# LogisticRegression presumably needs 2-D features and 1-D labels, so this
# likely fails as written - confirm the intended inputs.
cm.fit(features_train, target_train)

# To create the ConfusionMatrix, we need some test data. score() runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(features_test, target_test)

# change fontsize of the labels in the figure
for label in cm.ax.texts:
    label.set_size(20)

# How did we do?
cm.poof()

In [None]:
# import libraries
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ROCAUC
import matplotlib.pyplot as plt

# Precision, Recall, and F1 Score
# Using classification report
# Reuses `model` and `classes` from the confusion-matrix cell above; this cell
# repeats an earlier classification-report cell verbatim.
#%matplotlib inline

# set the size of the figure and the font size 
plt.rcParams['figure.figsize'] = (15, 7)
plt.rcParams['font.size'] = 20

# Instantiate the visualizer
visualizer = ClassificationReport(model, classes=classes)

# Fit the training data to the visualizer
# NOTE(review): features_train is the 4-D MNIST tensor and target_train is
# one-hot encoded (see the MNIST cell above); a LogisticRegression model
# presumably needs 2-D features and 1-D labels - confirm the intended inputs.
visualizer.fit(features_train, target_train)  
# Evaluate the model on the test data
visualizer.score(features_test, target_test)  

# print results
# NOTE(review): this prints whatever poof() returns - presumably None or an
# Axes object depending on the yellowbrick version; verify it is wanted.
g = visualizer.poof()
print(g)