# Anomaly Detection

In [1]:
# NLTK
import nltk
from collections import Counter
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('brown')
# nltk.download('names')

# Tokeenization
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.corpus import wordnet
from nltk import pos_tag, pos_tag_sents
from nltk.stem import WordNetLemmatizer

# Vectorizer
from io import StringIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Utilities
from collections import OrderedDict
import pickle
from tqdm import tqdm
import operator 

# Plotting
from matplotlib import pyplot as plt

# Classifier
from sklearn.naive_bayes import MultinomialNB

# Word Embedding
from gensim.models.keyedvectors import KeyedVectors

# Spell check
from spellchecker import SpellChecker

import re
import gc

# Sklearn
from sklearn.model_selection import train_test_split, KFold,StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from gensim.matutils import unitvec

import logging
import normalise

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)




In [2]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

Using TensorFlow backend.


In [3]:
tqdm.pandas("Progress")

# 1. load dataset and split training set

In [4]:
dataset = pd.read_csv("dataset/train.csv")
# testing_set = pd.read_csv("dataset/test.csv")

In [5]:
len(dataset)

1306122

In [6]:
pos = dataset[dataset['target'] == 0]
neg = dataset[dataset['target'] == 1]
len(pos)/len(neg)

15.16287588169781

In [7]:
# stratify split 
train_set, test_set = train_test_split(dataset, test_size = 0.1, train_size = 0.2,
                     stratify=dataset['target'], random_state=42)

In [8]:
pos1 = train_set[train_set['target'] == 0]
neg1 = train_set[train_set['target'] == 1]
len(pos1)/len(neg1)

15.162851132285608

In [9]:
len(train_set)

261224

In [10]:
len(test_set)

130613

In [11]:
stopwords_en = set(stopwords.words('english'))
stopwords_en_withpunct = stopwords_en.union(set(punctuation))
#stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}
#stopwords_combined = set.union(set(stopwords_json['en']), stopwords_en_withpunct)
stopwords = stopwords_en_withpunct

In [12]:
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n' 
    
def lemmatize_sent(text): 
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(text_to_word_sequence(text))]

In [13]:
def preprocess_text(text):
    # Input: str, i.e. document/sentence
    # Output: list(str) , i.e. list of lemmas
    
    tokens =  [word for word in lemmatize_sent(text) 
            if word not in stopwords
            and not word.isdigit()]
    try: 
        tokens = normalise.normalise(text=tokens, user_abbrevs=custom_dictionary, verbose=False)
    except:
        result = []
        for text in tokens:
            try:
                result.append(normalise.normalise(texts, verbose=False))
            except:
                result.append(text)
        tokens = result
    tokens = [word for word in tokens
          if word not in stopwords]
    return tokens;

In [14]:
class CustomVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        stop_words = self.get_stop_words()
    
        def analyser(doc):
            if (self.lowercase == True):
                doc = doc.lower()
            tokens = preprocess_text(doc)
            
            return(self._word_ngrams(tokens, stop_words))
        return (analyser)

In [15]:
# initialize
max_features_TFIDF = 1000
ngram_range_TFIDF = (1, 1)

tfidf_vectorizer = CustomVectorizer(stop_words=stopwords,
                                    ngram_range=ngram_range_TFIDF,
                                    max_features=max_features_TFIDF,
                                    encoding='utf-8',
                                    decode_error='strict',
                                    strip_accents = None,
                                    lowercase=True)

In [16]:
tfidf_train = tfidf_vectorizer.fit_transform(train_set['question_text'])
tfidf_test = tfidf_vectorizer.fit_transform(test_set['question_text'])

In [17]:
train_set['target'] = train_set['target'].replace(1,-1)
train_set['target'] = train_set['target'].replace(0,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
test_set['target'] = test_set['target'].replace(1,-1)
test_set['target'] = test_set['target'].replace(0,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
# save
import pickle
# pickle.dump(testing, open("testing.pickle", "wb"))

# 1.1. oneclassSVM   

In [19]:
from sklearn import svm

In [20]:
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(tfidf_train)

#tiny training set
y_pred_train = clf.predict(tfidf_train)

y_pred_test = clf.predict(tfidf_test) 

# n_error_train = y_pred_train[y_pred_train == -1].size

In [21]:
y_pred_train

array([ 1,  1,  1, ..., -1,  1,  1])

In [22]:
print("Accuracy score: %f"%accuracy_score(y_true=test_set['target'], y_pred=y_pred_test))
print("Recall score: %f"%recall_score(y_true=test_set['target'], y_pred=y_pred_test, average='weighted'))
print("F1 score: %f"%f1_score(y_true=test_set['target'], y_pred=y_pred_test,average='weighted'))

Accuracy score: 0.751686
Recall score: 0.751686
F1 score: 0.807525
