In [2]:
#http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

categories_training_solutions = []
categories_list = []
questions_list  = []
with open("corpora/QuestoesConhecidas.txt") as corpus:
        for line in corpus:
            (category, question) = line.split(maxsplit = 1)
            categories_training_solutions.append(category)
            questions_list.append(question.rstrip())

In [3]:
from nltk.stem.lancaster import LancasterStemmer
def tokenizePhrase(question):
    """Given a question, stem the words into lexemes ignoring stopwords (of, the, etc...)

    Args:
        question: A string corresponding to the question to stem.

    Returns:
        A list of lexemes.
        Example question: "What is my name?"
        Example returned list: ['what', 'nam']
    """

    stemmer = LancasterStemmer()
    tokenizedWords=[]
    for word in nltk.word_tokenize(question):
        # if the word is one of the stopwords (generic question articulators) ignore it
        if (word not in ('?', ':', '.', ',' "'s")) and (word not in stopwords.words('english')):
            tokenizedWords.append(stemmer.stem(word))
    return tokenizedWords

In [4]:
categories_list = list(set(categories_training_solutions))
categories_list

['original_language',
 'person_name',
 'revenue',
 'spoken_language',
 'runtime',
 'actor_name',
 'genre',
 'release_date',
 'production_country',
 'original_title',
 'vote_avg',
 'production_company',
 'overview',
 'budget',
 'keyword',
 'character_name']

In [5]:
#Text preprocessing, tokenizing and filtering of stopwords are all included in CountVectorizer, which builds a dictionary of features and transforms documents to feature vectors:

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_question_counts = count_vect.fit_transform(questions_list)
X_question_counts.shape

(208, 392)

In [6]:
#Term Frequency times Inverse Document Frequency

from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_question_counts)
X_question_tf = tf_transformer.transform(X_question_counts)
X_question_tf.shape

(208, 392)

In [7]:
#Prepare data to fit the estimator
tfidf_transformer = TfidfTransformer()
X_question_tfidf = tfidf_transformer.fit_transform(X_question_counts)
X_question_tfidf.shape

(208, 392)

In [8]:
### TRAINING CLASSIFIER ###
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_question_tfidf, categories_training_solutions)

In [9]:
#Predict test
docs_new = ['What budget do I own?']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
predicted

array(['original_title'], dtype='<U18')

In [10]:
from nltk.corpus import stopwords
tokenizePhrase("What is my role in this project?")

NameError: name 'nltk' is not defined

In [11]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [12]:
text_clf.fit(questions_list, categories_training_solutions)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [13]:
import numpy as np
docs_test = questions_list
predicted = text_clf.predict(docs_test)
np.mean(predicted == categories_training_solutions) 

0.4807692307692308

In [14]:
#Lets load the testing sets
testing_questions_list = []
with open("corpora/NovasQuestoes.txt") as testing_corpus:
        for line in testing_corpus:
            question = line.rstrip()
            testing_questions_list.append(question)

testing_solutions_list = []
with open("corpora/NovasQuestoesResultados.txt") as testing_solutions_corpus:
        for line in testing_solutions_corpus:
            category = line.rstrip()
            testing_solutions_list.append(category)

In [15]:
#Let’s see if we can do better with a linear support vector machine (SVM), which is widely regarded as one of 
#the best text classification algorithms (although it’s also a bit slower than naïve Bayes).
#We can change the learner by simply plugging a different classifier object into our pipeline:
#original hinge

from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
])
text_clf.fit(questions_list, categories_training_solutions)  

predicted = text_clf.predict(testing_questions_list)
np.mean(predicted == testing_solutions_list)

0.6666666666666666

In [16]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='epsilon_insensitive', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
])
text_clf.fit(questions_list, categories_training_solutions)  

predicted = text_clf.predict(testing_questions_list)
np.mean(predicted == testing_solutions_list)

0.9523809523809523

In [21]:
from sklearn.feature_extraction.text import HashingVectorizer
text_clf = Pipeline([('vect', HashingVectorizer(analyzer='word')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='epsilon_insensitive', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
])
text_clf.fit(questions_list, categories_training_solutions)  

predicted = text_clf.predict(testing_questions_list)
np.mean(predicted == testing_solutions_list)

0.9523809523809523

In [18]:
manual_test = ["How long does Regateiro teaches?"]
text_clf.predict(manual_test)[0]

'runtime'

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#CLASSIFIERS TEST
classifiers = []
classifiers.append(("KNeighborsClassifier", KNeighborsClassifier(3)))
classifiers.append(("SVC linear", SVC(kernel="linear", C=0.025)))
classifiers.append(("SVC gamma", SVC(gamma=2, C=1)))
classifiers.append(("DecisionTreeClassifier", DecisionTreeClassifier()))
classifiers.append(("RandomForestClassifier", RandomForestClassifier()))


for (name, classifier) in classifiers:
    text_clf2 = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', classifier),])
    text_clf2.fit(questions_list, categories_training_solutions)  

    predicted = text_clf2.predict(testing_questions_list)
    print("{}: {}".format(name, np.mean(predicted == testing_solutions_list)))

KNeighborsClassifier: 0.8333333333333334
SVC linear: 0.0
SVC gamma: 0.16666666666666666
DecisionTreeClassifier: 0.9047619047619048
RandomForestClassifier: 0.7857142857142857


  from numpy.core.umath_tests import inner1d


In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor
#CLASSIFIERS TEST for SGD
classifiers = []
classifiers.append(("EI l2", SGDClassifier(loss='epsilon_insensitive', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)))
classifiers.append(("EI l1", SGDClassifier(loss='epsilon_insensitive', penalty='l1',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)))
classifiers.append(("EI learningrate", SGDClassifier(loss='epsilon_insensitive', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None, learning_rate="invscaling", eta0=6)))
classifiers.append(("EI constant", SGDClassifier(loss='epsilon_insensitive', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None, learning_rate="constant", eta0=2)))


for (name, classifier) in classifiers:
    text_clf2 = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', classifier),])
    text_clf2.fit(questions_list, categories_training_solutions)  

    predicted = text_clf2.predict(testing_questions_list)
    print("{}: {}".format(name, np.mean(predicted == testing_solutions_list)))

EI l2: 0.9523809523809523
EI l1: 0.7857142857142857
EI learningrate: 0.8809523809523809
EI constant: 0.7857142857142857
