In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score
from nltk.corpus import stopwords
import string
import pickle
import re
from spacy.en import English
parser = English()


In [16]:
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

# Every step in a pipeline needs to be a "transformer". 
# Define a custom transformer to clean text using spaCy
class CleanTextTransformer(TransformerMixin):
    """
    Convert text to cleaned text
    """

    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
    
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    
    # replace HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()

    return text

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

def printNMostInformative(vectorizer, clf, N):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) 
#     print coefs_with_fns
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)
    


In [17]:
# # data

# train = ["I love space. Space is great.", "Planets are cool. I am glad they exist in space", 
#         "lol @twitterdude that is gr8", "twitter &amp; reddit are fun.", 
#         "Mars is a planet. It is red.", "@Microsoft: y u skip windows 9?", 
#         "Rockets launch from Earth and go to other planets.", "twitter social media &gt; &lt;", 
#         "@someguy @somegirl @twitter #hashtag", "Orbiting the sun is a little blue-green planet."]
# labelsTrain = ["space", "space", "twitter", "twitter", "space", "twitter", "space", "twitter", "twitter", "space"]

# test = ["i h8 riting comprehensibly #skoolsux", "planets and stars and rockets and stuff"]
# labelsTest = ["twitter", "space"]

In [73]:
# Prepare data
label_pair =  pickle.load( open( "../data/label_pair.p", "rb" ) )



def prepare_data(label_pair,group = ''):
    clean_p=[p.values() for p in label_pair.values()]
    clean_p = [pp for p in clean_p for pp in p]
    clean_p= [p for p in clean_p if p[2]!=['ambiguous']]
    docs,sentiments,relations = zip(*clean_p)
    relations = [r[0] for r in relations]
    #filter class
    if len(group)>0:
        relations = [r if group==r else 'other' for r in relations ]
    docs = list(docs)
    train, test, labelsTrain, labelsTest = train_test_split(docs, relations, test_size=0.3, random_state=66)
    return train, test, labelsTrain, labelsTest



In [76]:
def get_model():
    # the vectorizer and classifer to use
    # note that I changed the tokenizer in CountVectorizer to use a custom function using spaCy's tokenizer
    vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
    clf = LinearSVC()
    # clf = svm.SVC(decision_function_shape='ovo')

    # the pipeline to clean, tokenize, vectorize, and classify
    pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])
    return clf,pipe,vectorizer

def model_main(label_pair,group=''):
    # train
    clf,pipe,vectorizer = get_model()
    train, test, labelsTrain, labelsTest = prepare_data(label_pair,group)
    pipe.fit(train, labelsTrain)

    # test
    preds = pipe.predict(test)
    print("----------------------------------------------------------------------------------------------")
    print("results:")
    cnt = 0
    for (sample, pred) in zip(test, preds):
        print(sample, ":", pred)
        cnt += 1
        if cnt>15000:
            break
    for (sample, pred) in zip(train, pipe.predict(train)):
        print(sample, ":", pred)
    relation_code = dict([('family',0),('friend',1),('romance',2),('enemy',3),
                          ('acquaintance',4),('service',5),('other',6)])
    tocode = lambda x:relation_code[x]
    labelsTest, preds = map(tocode,labelsTest), map(tocode,preds)
    print('accuracy:{}, recall_score:{}, precision_score:{}'.format(accuracy_score(labelsTest, preds),
                        precision_score(labelsTest, preds,average='macro'),recall_score(labelsTest, preds,average='macro') ))
#     print("accuracy:", accuracy_score(labelsTest, preds))

    print("----------------------------------------------------------------------------------------------")
    print("Top 10 features used to predict: ")
    # show the top features
    printNMostInformative(vectorizer, clf, 10)

    print("----------------------------------------------------------------------------------------------")
    print("The original data as it appeared to the classifier after tokenizing, lemmatizing, stoplisting, etc")
    # let's see what the pipeline was transforming the data into
    pipe0 = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])
    transform = pipe0.fit_transform(train, labelsTrain)

    # get the features that the vectorizer learned (its vocabulary)
    vocab = vectorizer.get_feature_names()
    return pipe
    # the values from the vectorizer transformed data (each item is a row,column index with value as # times occuring in the sample, stored as a sparse matrix)
    # cnt = 0
    # for i in range(len(train)):
    #     s = ""
    #     indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]
    #     numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]
    #     for idx, num in zip(indexIntoVocab, numOccurences):
    #         s += str((vocab[idx], num))
    #     print("Sample {}: {}".format(i, s))
    #     cnt += 1
    #     if cnt == 5:
    #         break

In [77]:
trained_model  =model_main(label_pair)



----------------------------------------------------------------------------------------------
results:
(u'Candy believes that Mathu killed Beau , but as Mathu is virtually Candy foster father Candy wants to make every effort to protect father . The Sheriff still believes that Mathu murdered Beau , but can do nothing so everyone just waits to see if Fix Bauton and a lynch mob will show up .', ':', 'friend')
(u'Clarissa Vaughn leaves Vaughn New Yorkapartment to buy flowers . Clarissa Vaughn bumps into Vaughn live-in lover Sally onthe way back to their apartment . Clarissa Vaughn receives a visit from Vaughn old friend Louisat Vaughn apartment .', ':', 'friend')
(u"When Bilbo steals a golden cupfrom the dragon 's hoard , Smaug is furious and flies out of the mountainto burn Lake Town in Bilbo rage .", ':', 'service')
(u"Time passes , but the narrator never writes to Sonny in prison until the narrator 's young daughter , Grace , dies .", ':', 'family')
(u"Iola rebuffs Dr. Gresham 's propo

In [35]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(trained_model, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model.predict(['Before Sheriff Mapes can take Charlie in , Luke Will and Will crew arrive . They demand that Mapes hand Charlie over .'])




array(['friend'], 
      dtype='|S12')

In [151]:
model_main(label_pair,'family')



----------------------------------------------------------------------------------------------
results:
(u"Embarrassed , the representative stresses that White is simply obeying Maw and Meggins 's orders .", ':', 'other')
(u'Before Sheriff Mapes can take Charlie in , Luke Will and Will crew arrive . They demand that Mapes hand Charlie over .', ':', 'other')
(u"Macbeth writesahead to Duncan wife , Lady Macbeth , telling Macbeth all that has happened . husband and Lady Macbeth plan to get Duncan 's two chamberlainsdrunk so they will black out ; the next morning they will blame themurder on the chamberlains , who will be defenseless , as they willremember nothing .", ':', 'family')
(u'One couple , Chrissie and Rodney , are especially interested in Hailsham . In Norfolk , Chrissie and Rodney ask about a rumored exception allowing Hailsham couples in love to defer their donations .', ':', 'family')
(u'The evening ends when Earle clubs the flirtatious Miguel on the head and Tod futilely chas

In [152]:
model_main(label_pair,'romance')



----------------------------------------------------------------------------------------------
results:
(u"Embarrassed , the representative stresses that White is simply obeying Maw and Meggins 's orders .", ':', 'other')
(u'Before Sheriff Mapes can take Charlie in , Luke Will and Will crew arrive . They demand that Mapes hand Charlie over .', ':', 'other')
(u"Macbeth writesahead to Duncan wife , Lady Macbeth , telling Macbeth all that has happened . husband and Lady Macbeth plan to get Duncan 's two chamberlainsdrunk so they will black out ; the next morning they will blame themurder on the chamberlains , who will be defenseless , as they willremember nothing .", ':', 'other')
(u'One couple , Chrissie and Rodney , are especially interested in Hailsham . In Norfolk , Chrissie and Rodney ask about a rumored exception allowing Hailsham couples in love to defer their donations .', ':', 'other')
(u'The evening ends when Earle clubs the flirtatious Miguel on the head and Tod futilely chases

In [153]:
model_main(label_pair,'enemy')



----------------------------------------------------------------------------------------------
results:
(u"Embarrassed , the representative stresses that White is simply obeying Maw and Meggins 's orders .", ':', 'other')
(u'Before Sheriff Mapes can take Charlie in , Luke Will and Will crew arrive . They demand that Mapes hand Charlie over .', ':', 'other')
(u"Macbeth writesahead to Duncan wife , Lady Macbeth , telling Macbeth all that has happened . husband and Lady Macbeth plan to get Duncan 's two chamberlainsdrunk so they will black out ; the next morning they will blame themurder on the chamberlains , who will be defenseless , as they willremember nothing .", ':', 'other')
(u'One couple , Chrissie and Rodney , are especially interested in Hailsham . In Norfolk , Chrissie and Rodney ask about a rumored exception allowing Hailsham couples in love to defer their donations .', ':', 'other')
(u'The evening ends when Earle clubs the flirtatious Miguel on the head and Tod futilely chases

In [154]:
model_main(label_pair,'service')



----------------------------------------------------------------------------------------------
results:
(u"Embarrassed , the representative stresses that White is simply obeying Maw and Meggins 's orders .", ':', 'other')
(u'Before Sheriff Mapes can take Charlie in , Luke Will and Will crew arrive . They demand that Mapes hand Charlie over .', ':', 'other')
(u"Macbeth writesahead to Duncan wife , Lady Macbeth , telling Macbeth all that has happened . husband and Lady Macbeth plan to get Duncan 's two chamberlainsdrunk so they will black out ; the next morning they will blame themurder on the chamberlains , who will be defenseless , as they willremember nothing .", ':', 'other')
(u'One couple , Chrissie and Rodney , are especially interested in Hailsham . In Norfolk , Chrissie and Rodney ask about a rumored exception allowing Hailsham couples in love to defer their donations .', ':', 'other')
(u'The evening ends when Earle clubs the flirtatious Miguel on the head and Tod futilely chases

In [155]:
model_main(label_pair,'acquaintance')



----------------------------------------------------------------------------------------------
results:
(u"Embarrassed , the representative stresses that White is simply obeying Maw and Meggins 's orders .", ':', 'other')
(u'Before Sheriff Mapes can take Charlie in , Luke Will and Will crew arrive . They demand that Mapes hand Charlie over .', ':', 'other')
(u"Macbeth writesahead to Duncan wife , Lady Macbeth , telling Macbeth all that has happened . husband and Lady Macbeth plan to get Duncan 's two chamberlainsdrunk so they will black out ; the next morning they will blame themurder on the chamberlains , who will be defenseless , as they willremember nothing .", ':', 'other')
(u'One couple , Chrissie and Rodney , are especially interested in Hailsham . In Norfolk , Chrissie and Rodney ask about a rumored exception allowing Hailsham couples in love to defer their donations .', ':', 'other')
(u'The evening ends when Earle clubs the flirtatious Miguel on the head and Tod futilely chases

In [156]:
model_main(label_pair,'friend')



----------------------------------------------------------------------------------------------
results:
(u"Embarrassed , the representative stresses that White is simply obeying Maw and Meggins 's orders .", ':', 'other')
(u'Before Sheriff Mapes can take Charlie in , Luke Will and Will crew arrive . They demand that Mapes hand Charlie over .', ':', 'friend')
(u"Macbeth writesahead to Duncan wife , Lady Macbeth , telling Macbeth all that has happened . husband and Lady Macbeth plan to get Duncan 's two chamberlainsdrunk so they will black out ; the next morning they will blame themurder on the chamberlains , who will be defenseless , as they willremember nothing .", ':', 'other')
(u'One couple , Chrissie and Rodney , are especially interested in Hailsham . In Norfolk , Chrissie and Rodney ask about a rumored exception allowing Hailsham couples in love to defer their donations .', ':', 'friend')
(u'The evening ends when Earle clubs the flirtatious Miguel on the head and Tod futilely chas

In [50]:
def model_test(label_pair,name,clf = LinearSVC(), group=''):
    # train
    vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
    clf = clf
    # the pipeline to clean, tokenize, vectorize, and classify
    pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])
    train, test, labelsTrain, labelsTest = prepare_data(label_pair,group)
    pipe.fit(train, labelsTrain)

    # test
    preds = pipe.predict(test)
    print("----------------------------------------------------------------------------------------------")
    print("results:{}".format(name))
    relation_code = dict([('family',0),('friend',1),('romance',2),('enemy',3),
                          ('acquaintance',4),('service',5),('other',6)])
    tocode = lambda x:relation_code[x]
    labelsTest, preds = map(tocode,labelsTest), map(tocode,preds)
    print('accuracy:{}, recall_score:{}, precision_score:{}'.format(accuracy_score(labelsTest, preds),
                        precision_score(labelsTest, preds,average='macro'),recall_score(labelsTest, preds,average='macro') ))


In [54]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "AdaBoost",'Gradient Boost', "Neural Net",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier(alpha=1),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for name,model in zip(names,classifiers):
    try:
        model_test(label_pair,name,model)
    except:
        print("----------------------------------------------------------------------------------------------")
        print('DOES NOT WORK:{}'.format(name))



----------------------------------------------------------------------------------------------
results:Nearest Neighbors
accuracy:0.613636363636, recall_score:0.544642857143, precision_score:0.370005199049
----------------------------------------------------------------------------------------------
results:Linear SVM
accuracy:0.568181818182, recall_score:0.688293650794, precision_score:0.251589423648
----------------------------------------------------------------------------------------------
results:RBF SVM
accuracy:0.55303030303, recall_score:0.507452574526, precision_score:0.227283868093
----------------------------------------------------------------------------------------------
DOES NOT WORK:Gaussian Process
----------------------------------------------------------------------------------------------
results:Decision Tree
accuracy:0.530303030303, recall_score:0.421188630491, precision_score:0.195707070707
------------------------------------------------------------------------