In [6]:
import nltk
# Fetch the NLTK data packages this notebook relies on; packages that are
# already installed are simply reported as up-to-date.
# 'stopwords' backs the nltk.corpus.stopwords lookup used during preprocessing.
nltk.download('stopwords')
# NOTE(review): presumably the data files MosesTokenizer needs -- confirm.
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /Users/jfuentes/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


True

In [2]:
from ourDb import events_collection, total_events_collection, events_ml_collection
import re
import nltk
import string
from nltk.tokenize.moses import MosesTokenizer

from gensim.models import Phrases
from nltk.corpus import wordnet as wn

In [15]:
class PreprocessText:
    """Tokenize, clean and phrase-merge the text of categorized events.

    Attributes set by the constructor:
        X        -- list of token lists (one per event) after cleaning and after
                    the phrase model has been applied
        Y        -- list of the events' categories, aligned index-for-index with X
        phraseMl -- the Phrases model learned from the cleaned tokens; it can be
                    trained further and reused
        phrases  -- set of every token in X that contains '_' (the tokens this
                    class treats as detected phrases)
    """

    def __init__(self, categorizedEvents):
        """Build X, Y, phraseMl and phrases from categorizedEvents.

        categorizedEvents should be an iterable of dictionaries, each describing
        one event and providing the keys 'name', 'description' and 'category'.
        """
        self.X = []
        self.Y = []
        tokenizer = MosesTokenizer()  # splits raw text into word/punctuation tokens
        for e in categorizedEvents:
            text = e[u'name'] + " " + e[u'description']
            self.X.append(self.preprocess(tokenizer.tokenize(text)))
            self.Y.append(e[u'category'])
        self.phraseMl = Phrases(self.X, min_count=3)  # learn the phrase model
        self.X = list(self.phraseMl[self.X])  # rewrite the documents with phrases merged
        # NOTE(review): a second phrase pass (to obtain triples) was sketched in
        # the original code but left disabled; it is still not applied here.
        self.phrases = set(w for doc in self.X for w in doc if '_' in w)

    def _getStoplist(self):
        """Return the stop-word set, building it once and caching it on the instance.

        The set is the union of the NLTK English stop words, a few hand-picked
        extras and every single character of string.punctuation.
        """
        stoplist = getattr(self, '_stoplist', None)
        if stoplist is None:
            extras = set('for a of the and to in . / '.split())
            stoplist = set(nltk.corpus.stopwords.words('english')) | extras | set(string.punctuation)
            self._stoplist = stoplist
        return stoplist

    def matchNotX(self, strg, search=re.compile(r'[^!#$%&()*+,-./:;<=>?@\\^_}|{~0123456789]').search):
        """Return True when strg contains at least one character that is neither
        a digit nor one of the punctuation characters listed in the pattern."""
        return bool(search(strg))

    def preprocess(self, text):
        """Drop stop words and punctuation/digit-only tokens; strip and lowercase the rest.

        text is a list of tokens. Returns a new list in which every kept token
        has had leading/trailing punctuation stripped and has been lowercased.
        Tokens are compared against the stop list both as given and after
        normalisation, so capitalised stop words ("The") are removed too, and
        tokens that become empty once stripped are discarded.
        """
        stoplist = self._getStoplist()
        cleaned = []
        for word in text:
            if word in stoplist or not self.matchNotX(word):
                continue
            token = word.strip(string.punctuation).lower()
            # Re-check after normalising: "The" -> "the", "''" -> "".
            if token and token not in stoplist:
                cleaned.append(token)
        return cleaned

    def topBigrams(self, texts, n, tri=False, minFreq=7):
        """Return the n best collocations in texts, ranked by the PMI measure.

        Alternative way of finding phrases; currently unused because the Phrases
        model can be trained further (online) and saved.

        texts   -- iterable of token lists; they are flattened into one stream
        n       -- how many collocations to return
        tri     -- look for trigrams instead of bigrams when True
        minFreq -- candidates occurring fewer than this many times are ignored
        """
        flatTexts = [word for text in texts for word in text]
        if tri:
            measures = nltk.collocations.TrigramAssocMeasures()
            finder = nltk.collocations.TrigramCollocationFinder.from_words(flatTexts)
        else:
            measures = nltk.collocations.BigramAssocMeasures()
            finder = nltk.collocations.BigramCollocationFinder.from_words(flatTexts)
        finder.apply_freq_filter(minFreq)
        return finder.nbest(measures.pmi, n)

In [19]:
def gatherCategorizedEvents():
    """Return every event that has a category, a description and a name.

    Reads all documents from total_events_collection (projected down to the
    three fields of interest, without '_id'), keeps those in which all three
    fields are present, and prints how many events were seen versus kept.
    """
    requiredFields = ('category', 'description', 'name')
    allEvents = total_events_collection.find({}, {"category": 1, "description": 1, "name": 1, "_id": 0})
    allCategorizedEvents = []
    count = 0
    for e in allEvents:
        count += 1
        if all(field in e for field in requiredFields):
            allCategorizedEvents.append(e)
    # A single formatted argument prints the same text under Python 2 and 3.
    print("%s total events, learning from the %s categorized events"
          % (count, len(allCategorizedEvents)))
    return allCategorizedEvents

In [20]:
# Pull the categorized events, then build two parallel lists:
# skText   -- "name description" string for each event
# skTarget -- that event's category
X = gatherCategorizedEvents()
skText, skTarget = [], []
for e in X:
    skText.append(e['name'] + ' ' + e['description'])
    skTarget.append(e['category'])

1041 total events, learning from the 411 categorized events


In [68]:
from sklearn.cross_validation import train_test_split
 
def train(classifier, X, y, test_size=0.25, random_state=30):
    """Fit classifier on a train split of (X, y) and print its held-out accuracy.

    classifier   -- object exposing fit(X, y) and score(X, y)
    X, y         -- samples and their labels, passed to train_test_split
    test_size    -- share of the data held out for scoring (default 0.25)
    random_state -- seed for the split, so runs are repeatable (default 30)

    Returns the same classifier object, now fitted on the training split only.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    classifier.fit(X_train, y_train)
    # A single formatted argument prints the same text under Python 2 and 3.
    print("Accuracy: %s" % classifier.score(X_test, y_test))
    return classifier
 

# Model Making

In [69]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
 
# Baseline: TF-IDF features fed into a multinomial naive Bayes classifier
# with its default settings.
trial1 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

nbModel = train(trial1, skText, skTarget)

Accuracy: 0.49514563106796117


In [70]:
# Same pipeline as the baseline, but with the classifier's alpha set to 0.05.
trial2 = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB(alpha=0.05)),
])

nbModel = train(trial2, skText, skTarget)

Accuracy: 0.7475728155339806


In [71]:
# NOTE(review): stoplist is assembled here but never handed to the vectorizer
# below, so it has no effect on the features -- confirm whether it was meant
# to be used.
stoplist = (set(nltk.corpus.stopwords.words('english'))
            | set('for a of the and to in . / '.split())
            | set(string.punctuation))

# Vectorize the whole corpus up front (tokenize, build vocabulary, encode).
# NOTE(review): the vectorizer is fitted on every document before train()
# splits off the test set, unlike the Pipeline trials where it is fitted on
# the training split only; accuracies are therefore not directly comparable.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(skText)

# Handy when debugging the fitted vectorizer:
# print(vectorizer.vocabulary_)
# print(vectorizer.idf_)

model = train(MultinomialNB(alpha=0.05), X, skTarget)

Accuracy: 0.7669902912621359


In [None]:
# Inspect the `classes_` attribute of the model returned by the last train() call.
# NOTE(review): presumably the category labels seen during fitting -- confirm.
nbModel.classes_


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus for a quick look at what TfidfVectorizer produces.
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
]

# Learn the vocabulary and the idf weights from the toy corpus.
vectorizer = TfidfVectorizer()
vectorizer.fit(text)

# Show the learned term -> column mapping and the per-term idf values.
print(vectorizer.vocabulary_)
print(vectorizer.idf_)

# Encode only the first document and show the resulting row.
vector = vectorizer.transform(text[:1])
print(vector.shape)
print(vector.toarray())

{u'brown': 0, u'lazy': 4, u'jumped': 3, u'over': 5, u'fox': 2, u'dog': 1, u'quick': 6, u'the': 7}
[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]
(1, 8)
[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


In [22]:

# Demonstration: in the recorded run, building a Pipeline whose last step has
# no fit() raised a TypeError. Catch and show the message so the notebook
# still runs top to bottom instead of stopping at this cell.
try:
    trail = Pipeline([('int', int)])
except TypeError as err:
    print(err)

TypeError: Last step of Pipeline should implement fit. '<type 'int'>' (type <type 'type'>) doesn't

[{u'category': u'WELLNESS',
  u'description': u'[Free event] This one-day workshop at UCLA Covel Common will present you why and how meditation could bring us emotion wellness, lower stress, more presence in personal interactions and a healthier approach to life! Instructors studied in the largest Buddhist academy in Tibet for many years. This 3-session-courses are friendly to beginners and enthusiasts Come and enjoy!',
  u'name': u'One-day Meditation Workshop'},
 {u'category': u'ART',
  u'description': u"The UCLA Art History Undergraduate Student Association (AHUSA) is organizing an exhibition in the Powell Library rotunda from Thursday, January 11, 2018 to Thursday, February 1, 2017.\nCuratorial Statement: Weather at UCLA and Los Angeles is perpetually temperate. While most places cycle through seasons, LA is perpetually the same year-round. The cyclical nature of seasons, political climates, and cultures necessitates adaptation.\nPowell Library, as an exhibition space, presents imme