In [None]:
# Standard library
import json
import logging
import html
from time import time
from pprint import pprint

# Third-party
import spacy
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from tabulate import tabulate
from sklearn.externals import joblib
from sklearn.preprocessing import label_binarize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score

# spaCy's small English pipeline, loaded with the NER and parser components
# disabled.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [None]:
# NOTE: Make sure to remove non-printable ascii characters using:
# tr -cd '\11\12\15\40-\176' < file-with-binary-chars > clean-file

def load_data(train_file, test_file):
    """Load the raw JSON-lines review files and derive 3-way targets.

    Every non-blank line of each file is parsed as one JSON object that must
    provide the keys 'reviewText', 'summary' and 'overall'.

    Args:
        train_file: path of the training JSON-lines file.
        test_file: path of the evaluation JSON-lines file.

    Returns:
        A tuple ``(train, test, classes)``. ``train`` and ``test`` are dicts
        with the keys 'review' and 'summary' (lists of HTML-unescaped
        strings), 'rating' (array of the 'overall' values) and 'target'
        (array of labels taken from ``classes``). ``classes`` is
        ``np.array([-1, 0, 1])``.

    Raises:
        KeyError: if a record lacks one of the required keys.
        ValueError: if a non-blank line is not valid JSON.
    """
    classes = np.array([-1, 0, 1])

    def _to_target(rating):
        # rating <= 2 -> classes[0], rating == 3 -> classes[1],
        # everything else -> classes[2].
        if rating <= 2:
            return classes[0]
        if rating == 3:
            return classes[1]
        return classes[2]

    def _read_split(path):
        # The context manager closes the handle even if parsing fails.
        # Blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        with open(path) as handle:
            records = [json.loads(line) for line in handle if line.strip()]

        split = {
            'review': [html.unescape(rec['reviewText']) for rec in records],
            'summary': [html.unescape(rec['summary']) for rec in records],
            'rating': np.array([rec['overall'] for rec in records]),
        }
        split['target'] = np.array([_to_target(r) for r in split['rating']])
        return split

    train = _read_split(train_file)
    test = _read_split(test_file)
    return train, test, classes

def load_preprocessed_data(datafile):
    """Read a tab-separated file of pre-tokenised samples.

    Each usable line has exactly three tab-separated fields: review,
    summary and an integer label. Lines with any other number of fields
    are dropped silently.

    Args:
        datafile: path of the tab-separated text file.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is a DataFrame with the columns
        'review' and 'summary', and ``Y`` is a list of int labels in the
        same order.

    Raises:
        ValueError: if the third field of a kept line is not an integer.
    """
    # The context manager guarantees the handle is closed once all lines
    # have been split.
    with open(datafile) as handle:
        rows = [line.split('\t') for line in handle]

    rows = [row for row in rows if len(row) == 3]
    X = pd.DataFrame({
        'review': [row[0] for row in rows],
        'summary': [row[1] for row in rows]})
    # The label is the last field, so it still carries the line terminator.
    Y = [int(row[2].strip()) for row in rows]

    return X, Y

In [None]:
# Raw JSON reviews plus the label set derived from their ratings.
train, test, classes = load_data(
    train_file='audio_train.json',
    test_file='audio_dev.json',
)

# Pre-tokenised, tab-separated samples and their integer labels.
trainX, trainY = load_preprocessed_data(datafile='train_tok_clean.txt')
testX, testY = load_preprocessed_data(datafile='test_tok_clean.txt')

In [None]:
# Define vectorizers
class SpacyCountVectorizer(CountVectorizer):
    """CountVectorizer for documents that are already tokenised.

    A document is a string of tokens separated by two spaces. With
    ``pos=True`` every token is used verbatim; with ``pos=False`` each token
    is cut at its first ':|:' and only the leading part is kept
    (presumably the token format is ``word:|:POS`` -- confirm against the
    preprocessing script).

    All other constructor arguments are forwarded unchanged to
    ``CountVectorizer``.
    """

    def __init__(self, lowercase=True, ngram_range=(1,1), binary=False, vocabulary=None,
                 max_features=None, max_df=1.0, min_df=1, pos=True):
        super(SpacyCountVectorizer, self).__init__(lowercase=lowercase, ngram_range=ngram_range, binary=binary, vocabulary=vocabulary,
                                                   max_features=max_features, max_df=max_df, min_df=min_df)
        self.pos = pos

    def tokenize(self, doc):
        """Split ``doc`` into its tokens; an empty document yields ``[]``."""
        # ''.split(...) would return [''], i.e. one bogus empty token.
        if doc == '':
            return []
        tokens = doc.split('  ')
        if self.pos:
            return tokens
        # Keep only the part of each token that precedes the ':|:' marker.
        return [tok.split(':|:')[0] for tok in tokens]

    def build_tokenizer(self):
        """Return the callable CountVectorizer uses to tokenise a document."""
        return self.tokenize

class ReviewExtractor(object):
    """Stateless selector that picks the 'review' entry out of its input."""

    def fit(self, X, y=None):
        """Nothing to learn; return the extractor itself."""
        return self

    def transform(self, X):
        """Return ``X['review']``."""
        return X['review']

class SummaryExtractor(object):
    """Stateless selector that picks the 'summary' entry out of its input."""

    def transform(self, X):
        """Return ``X['summary']``.

        The key matches the 'summary' column/key that the loaders in this
        notebook produce.
        """
        return X['summary']

    def fit(self, X, y=None):
        """Nothing to learn; return the extractor itself."""
        return self

In [None]:
# Review text: binary uni-/bi-gram indicators. The vocabulary is learned
# on the training reviews only and then applied to the test reviews.
rev_vectorizer = SpacyCountVectorizer(
    ngram_range=(1, 2),
    binary=True,
    max_df=0.8,
    min_df=5e-6,
)
train_rev_feat = rev_vectorizer.fit_transform(trainX['review'])
test_rev_feat = rev_vectorizer.transform(testX['review'])

# Summary text: same configuration, with its own separate vocabulary.
summ_vectorizer = SpacyCountVectorizer(
    ngram_range=(1, 2),
    binary=True,
    max_df=0.8,
    min_df=5e-6,
)
train_summ_feat = summ_vectorizer.fit_transform(trainX['summary'])
test_summ_feat = summ_vectorizer.transform(testX['summary'])

In [None]:
# Place review and summary features side by side; the summary block is
# scaled by a factor of 3 before stacking.
train_feat1 = hstack((train_rev_feat, train_summ_feat * 3))
test_feat1 = hstack((test_rev_feat, test_summ_feat * 3))

In [None]:
# set features
# Choose the feature set used by the experiments below and show its size.
train_feat, test_feat = train_feat1, test_feat1
print(train_feat.shape, test_feat.shape, sep='\n')

In [None]:
def display_results(preds, dev_target):
    """Print accuracy, confusion matrix and macro-F1 for a set of predictions.

    Args:
        preds: predicted labels; must expose ``.shape`` and support
            element-wise ``==`` against ``dev_target``.
        dev_target: gold labels aligned with ``preds``.

    Both metrics are computed over the fixed label set ``[-1, 0, 1]``.
    """
    label_set = [-1, 0, 1]
    total = preds.shape[0]
    hits = (preds == dev_target).sum()

    print("accuracy={:4.2f} ({}/{})".format(hits / total * 100, hits, total))
    print(confusion_matrix(dev_target, preds, labels=label_set))

    macro_f1 = f1_score(dev_target, preds, labels=label_set, average='macro')
    print("macro-F1={:4.4f}".format(macro_f1))
    
def evaluate_feat_MNB(train_feat, train_target, dev_feat, dev_target, class_prior=None):
    """Fit a multinomial Naive Bayes model and report its dev-set scores.

    Args:
        train_feat: training feature matrix.
        train_target: training labels.
        dev_feat: evaluation feature matrix.
        dev_target: evaluation labels.
        class_prior: optional class priors handed to ``MultinomialNB``;
            ``None`` keeps the estimator's default behaviour.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    # Forward class_prior so that the argument actually takes effect.
    nb = MultinomialNB(class_prior=class_prior)
    nb.fit(train_feat, train_target)
    preds = nb.predict(dev_feat)
    display_results(preds, dev_target)
    return nb

def evaluate_feat_LR(train_feat, train_target, dev_feat, dev_target):
    """Fit an L2-regularised logistic regression and report dev-set scores.

    The estimator configuration is fixed: liblinear solver, one-vs-rest,
    C=1, at most 50 iterations, balanced class weights, random_state 2324.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    settings = dict(
        penalty="l2",
        C=1,
        solver='liblinear',
        multi_class='ovr',
        n_jobs=3,
        random_state=2324,
        max_iter=50,
        class_weight="balanced",
    )
    model = LogisticRegression(**settings)
    model.fit(train_feat, train_target)
    display_results(model.predict(dev_feat), dev_target)
    return model

def evaluate_feat_SGD(train_feat, train_target, dev_feat, dev_target, loss='hinge',
                      penalty='l2', max_iter=None, average=True, alpha=0.0001, tol=1e-3):
    """Fit an ``SGDClassifier`` and report its dev-set scores.

    ``loss``, ``penalty``, ``max_iter``, ``average``, ``alpha`` and ``tol``
    are forwarded unchanged to the estimator, which always uses balanced
    class weights and random_state 2324.

    NOTE(review): ``max_iter=None`` is passed through as-is; how the
    estimator treats ``None`` depends on the installed scikit-learn
    version -- confirm.

    Returns:
        The fitted ``SGDClassifier`` instance.
    """
    settings = dict(
        loss=loss,
        penalty=penalty,
        random_state=2324,
        max_iter=max_iter,
        class_weight="balanced",
        average=average,
        alpha=alpha,
        tol=tol,
    )
    model = SGDClassifier(**settings)
    model.fit(train_feat, train_target)
    display_results(model.predict(dev_feat), dev_target)
    return model

def evaluate_feat_SVM(train_feat, train_target, dev_feat, dev_target):
    """Fit an L2-penalised ``LinearSVC`` and report its dev-set scores.

    The estimator configuration is fixed: at most 50 iterations,
    tolerance 1e-3, balanced class weights, random_state 2324.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    settings = dict(
        penalty="l2",
        max_iter=50,
        random_state=2324,
        class_weight="balanced",
        tol=1e-3,
    )
    model = LinearSVC(**settings)
    model.fit(train_feat, train_target)
    display_results(model.predict(dev_feat), dev_target)
    return model

In [None]:
# Alternative classifiers on the same features (disabled, kept for reference):
# nb = evaluate_feat_MNB(train_feat, trainY, test_feat, testY)
# lr = evaluate_feat_LR(train_feat, trainY, test_feat, testY)
# lr = evaluate_feat_SGD(train_feat, trainY, test_feat, testY, loss='log')
# svm2 = evaluate_feat_SVM(train_feat, trainY, test_feat, testY)

# Active run: hinge-loss SGD classifier with L2 penalty and averaging.
svm1 = evaluate_feat_SGD(
    train_feat, trainY, test_feat, testY,
    loss='hinge',
    penalty='l2',
    average=True,
    alpha=1e-4,
    tol=1e-5,
)

In [None]:
# Persist the fitted classifier.
# NOTE(review): the destination is a hard-coded absolute path on one
# machine; consider making it configurable.
joblib.dump(value=svm1, filename='/home/cse/dual/cs5130298/scratch/NLP/1/models/5.pkl')