In [341]:
from csv import DictReader, DictWriter

import numpy as np
from numpy import array
import random
import nltk
import re

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, Normalizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import SGDClassifier

# Column names used to index the rows that DictReader yields from the
# spoiler CSV files.
kTARGET_FIELD = 'spoiler'   # label column (the class to predict)
kTEXT_FIELD = 'sentence'    # sentence text, fed to the bag-of-words feature
kTROPE_FIELD = 'trope'      # trope name, fed to the trope count feature
kPAGE_FIELD = 'page'        # page title, compared against the sentence words


class Featurizer:
    """Combine the sentence, page and trope features into one matrix.

    The examples handed to ``train_feature`` / ``test_feature`` must be
    indexable by position: index 0 holds the sentences, index 1 the tropes
    and index 2 the pages. Each branch of the union selects what it needs.
    """

    def __init__(self):
        # Feature 1: TF-IDF weighted word frequencies of the sentence,
        # with English stop words dropped.
        sentence_words = Pipeline([
            ('extract_field',
             FunctionTransformer(lambda fields: fields[0], validate=False)),
            ('count', TfidfVectorizer(stop_words='english')),
        ])

        # Feature 2: overlap between the words of the page title and the
        # words of the sentence (needs both the sentence and page lists).
        page_overlap = Pipeline([
            ('extract_field',
             FunctionTransformer(lambda fields: [fields[0], fields[2]],
                                 validate=False)),
            ('page', PageTransformer()),
        ])

        # Feature 3: which trope the sentence belongs to; some tropes
        # indicate spoilers.
        trope_counts = Pipeline([
            ('extract_field',
             FunctionTransformer(lambda fields: fields[1], validate=False)),
            ('count', CountVectorizer()),
        ])

        self.vectorizer = FeatureUnion([
            ('bag of words', sentence_words),
            ('page in words', page_overlap),
            ('type of trope', trope_counts),
        ])

    def train_feature(self, examples):
        """Fit every branch on ``examples`` and return their features."""
        features = self.vectorizer.fit_transform(examples)
        return features

    def test_feature(self, examples):
        """Return features for ``examples`` using the already fitted branches."""
        features = self.vectorizer.transform(examples)
        return features


# Figure out the frequency of page words appearing in the sentence, like "Nikita"
class PageTransformer(BaseEstimator, TransformerMixin):
    """Flag sentences that mention a word from their page title.

    The input is a pair ``[sentences, pages]`` of parallel sequences of
    strings. The output is a sparse matrix with one row per sentence and a
    single column.
    """

    def __init__(self):
        pass

    def fit(self, examples, y=None):
        """Nothing is learned; return self so the transformer can be chained.

        ``y`` is accepted and ignored so the transformer also works inside
        pipelines that are fitted with labels.
        """
        return self

    def transform(self, examples):
        """Return the page-word overlap feature for every sentence.

        Parameters
        ----------
        examples : sequence
            ``examples[0]`` holds the sentences and ``examples[1]`` the
            page title for the sentence at the same position.

        Returns
        -------
        scipy.sparse.csr_matrix of shape (n_sentences, 1)
        """
        from scipy.sparse import csr_matrix

        # Initialize matrix
        X = np.zeros((len(examples[0]), 1))

        # Loop over sentences and count the words shared with the page title
        for ii, sentence in enumerate(examples[0]):
            sentence_word = nltk.word_tokenize(sentence)
            # Page titles are CamelCase (e.g. "DreamHigh"); put a space in
            # front of every capital so they tokenize into separate words.
            page = re.sub(r'([A-Z])', r' \1', examples[1][ii])
            page_word = nltk.word_tokenize(page)
            # Distinct tokens present in both the sentence and the title
            common_word = set(sentence_word).intersection(page_word)

            # "The" is common in page titles and says nothing about the page
            common_word.discard('The')

            X[ii, 0] = len(common_word)

        # normalize() works row by row by default; with a single column
        # every non-zero count becomes 1.0, so the resulting feature is a
        # 0/1 "page word appears in sentence" indicator.
        X = preprocessing.normalize(X, norm='l2')
        return csr_matrix(X)


if __name__ == "__main__":

    # Cast to list to keep it all in memory; the with-blocks make sure the
    # files are closed again once the rows have been read.
    with open("/Users/yawen/Desktop/CSCI 5622 Machine Learning/Homework 6/feature_engineering/data/spoilers/train.csv", 'r') as train_file:
        train = list(DictReader(train_file))
    # NOTE: the test rows are loaded but not used by the validation loop below.
    with open("/Users/yawen/Desktop/CSCI 5622 Machine Learning/Homework 6/feature_engineering/data/spoilers/test.csv", 'r') as test_file:
        test = list(DictReader(test_file))

    # Repeated random hold-out validation: reshuffle the training rows on
    # every pass, train on the first 70% and score on the remaining 30%.
    npass = 5
    train_ratio = 0.7  # test_ratio = 0.3
    ntrain = int(len(train) * train_ratio)
    ntest = len(train) - ntrain
    scores = np.zeros(npass)

    for ipass in range(npass):

        print('-----ipass='+str(ipass)+'----------')
        random.shuffle(train)
        this_train = train[0:ntrain]
        this_test = train[ntrain:]

        feat = Featurizer()

        # Collect the distinct labels in order of first appearance; a
        # label's position in this list is its integer class id. The order
        # can differ between passes because of the shuffle.
        labels = []
        for line in this_train:
            if not line[kTARGET_FIELD] in labels:
                labels.append(line[kTARGET_FIELD])

        print("Label set: %s" % str(labels))

        # Each example set is [sentences, tropes, pages], the layout
        # Featurizer indexes by position.
        x_train = feat.train_feature([[x[kTEXT_FIELD] for x in this_train],
                                      [x[kTROPE_FIELD] for x in this_train],
                                      [x[kPAGE_FIELD] for x in this_train]])
        x_test = feat.test_feature([[x[kTEXT_FIELD] for x in this_test],
                                    [x[kTROPE_FIELD] for x in this_test],
                                    [x[kPAGE_FIELD] for x in this_test]])

        y_train = array(list(labels.index(x[kTARGET_FIELD]) for x in this_train))

        # Train classifier: logistic regression fitted with SGD
        lr = SGDClassifier(loss='log', penalty='l2', shuffle=True)
        lr.fit(x_train, y_train)

        predictions = lr.predict(x_test)

        # Verify score. With two labels the class ids are 0 and 1, so the
        # summed absolute difference is the number of wrong predictions and
        # the score is the accuracy on the held-out rows.
        reference = array(list(labels.index(x[kTARGET_FIELD]) for x in this_test))
        diff = np.sum(np.absolute(reference - predictions))
        scores[ipass] = 1.0 - diff * 1.0 / ntest
        print(scores[ipass])

    # Mean accuracy over all passes
    print(np.mean(scores))


-----ipass=0----------
Label set: ['False', 'True']
0.74526600541
-----ipass=1----------
Label set: ['False', 'True']
0.734445446348
-----ipass=2----------
Label set: ['False', 'True']
0.740982867448
-----ipass=3----------
Label set: ['True', 'False']
0.738052299369
-----ipass=4----------
Label set: ['True', 'False']
0.73219116321
0.738187556357


In [None]:
# calculate frequency of some tags in a sentence
class TagTransformer(BaseEstimator, TransformerMixin):
    """Count selected part-of-speech tags in each sentence.

    Parameters
    ----------
    tags : sequence of str, default ('CD',)
        Part-of-speech tags (as produced by ``nltk.pos_tag``) to count.
        The output has one column per tag, in this order. The default
        counts only 'CD' (numbers).
    """

    def __init__(self, tags=('CD',)):
        self.tags = tags

    def fit(self, examples, y=None):
        """Nothing is learned; return self so the transformer can be chained.

        ``y`` is accepted and ignored so the transformer also works inside
        pipelines that are fitted with labels.
        """
        return self

    def transform(self, examples):
        """Return the normalized tag counts for every sentence.

        Parameters
        ----------
        examples : sequence of str
            The sentences to tag.

        Returns
        -------
        scipy.sparse.csr_matrix of shape (n_sentences, n_tags)
        """
        from scipy.sparse import csr_matrix

        wanted_tags = list(self.tags)

        # Initialize matrix
        X = np.zeros((len(examples), len(wanted_tags)))

        # Loop over sentences and count how often each wanted tag occurs
        for ii, x in enumerate(examples):
            sentence_tags = [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(x))]
            for jj, wanted in enumerate(wanted_tags):
                X[ii, jj] = sentence_tags.count(wanted)

        # normalize() scales each row to unit length by default. With a
        # single tag this turns every non-zero count into 1.0; with several
        # tags each row keeps only the relative proportions of the tags.
        X = preprocessing.normalize(X, norm='l2')
        return csr_matrix(X)

# get name_entity
def get_continuous_chunks(text):
    """Return the number of named-entity mentions found in ``text``.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk``. In the
    chunked result named entities appear as ``Tree`` nodes and every other
    token as a plain item. A run of adjacent entity nodes is treated as one
    mention, and repeated mentions of the same entity are each counted.

    Parameters
    ----------
    text : str
        The sentence to analyze.

    Returns
    -------
    int
        Number of contiguous runs of named-entity nodes.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))

    num_name_entity = 0
    inside_entity = False
    for node in chunked:
        is_entity = isinstance(node, Tree)
        # Count a mention where a run of entity nodes starts, so a mention
        # that ends the text is counted too and ordinary tokens never add
        # to the total.
        if is_entity and not inside_entity:
            num_name_entity += 1
        inside_entity = is_entity
    return num_name_entity

# get the frequency of name_entity in each sentence
class NameTransformer(BaseEstimator, TransformerMixin):
    """Turn each sentence into its named-entity count.

    The count for a sentence is whatever ``get_continuous_chunks`` reports
    for it. Unlike the other transformers in this file, the counts are not
    normalized.
    """

    def __init__(self):
        pass

    def fit(self, examples, y=None):
        """Nothing is learned; return self so the transformer can be chained.

        ``y`` is accepted and ignored so the transformer also works inside
        pipelines that are fitted with labels.
        """
        return self

    def transform(self, examples):
        """Return the named-entity count for every sentence.

        Parameters
        ----------
        examples : sequence of str
            The sentences to analyze.

        Returns
        -------
        scipy.sparse.csr_matrix of shape (n_sentences, 1)
        """
        from scipy.sparse import csr_matrix

        # Initialize matrix
        X = np.zeros((len(examples), 1))

        # Loop over sentences and store the entity count of each one
        for ii, x in enumerate(examples):
            X[ii, 0] = get_continuous_chunks(x)

        return csr_matrix(X)

# Additionally, a topic model (LDA) could be tried as a further feature:
# https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html