# NPS classification

In [None]:
import pandas as pd
# Load the raw NPS export (tab-separated).
# NOTE(review): decoding as ASCII with errors='ignore' silently drops every
# non-ASCII character, which presumably includes the accented letters of the
# Portuguese comments -- confirm the file's real encoding.
with open('/Users/yurii/Downloads/nps.tsv',
          mode='r',
          encoding='ascii',
          errors='ignore',
          ) as csvfile:
  nps_comments = pd.read_csv(csvfile, delimiter='\t')

# Keep only the Portuguese rows and the two columns used downstream.
# NOTE(review): the bare `category` term in the query is presumably meant to
# drop rows without a category -- verify that it filters as intended.
# .copy() gives pt_comments its own data, so the columns added to it in later
# cells are not written to a selection of nps_comments (this avoids pandas'
# SettingWithCopyWarning).
pt_comments = nps_comments.query(
    "language == 'pt' and category")[['category', 'comment']].copy()
pt_comments.head()

In [None]:
# Imports
# Note: following nltk packages should be downloaded
import nltk
nltk.download('punkt')
from nltk import (
    sent_tokenize as splitter,
    wordpunct_tokenize as tokenizer
)

In [None]:
# NOTE(review): the 'wordnet' resource is downloaded here, but nothing visible
# in this notebook reads it -- confirm whether it is still needed.
nltk.download('wordnet')
# Snowball stemmer configured for Portuguese; stem() below applies it.
stemmer = nltk.SnowballStemmer(u'portuguese')

def stem(tokens_list):
    """Return a list with every token of `tokens_list` passed through `stemmer.stem`."""
    stemmed_tokens = []
    for word in tokens_list:
        stemmed_tokens.append(stemmer.stem(word))
    return stemmed_tokens

In [None]:
# Breaks a text into sentences and each sentence into tokens.
def tokenize(text):
  """Return one list of tokens per sentence of `text`.

  Sentences come from `splitter` (nltk's sent_tokenize) and tokens from
  `tokenizer` (nltk's wordpunct_tokenize).
  """
  sentences = splitter(text)
  tokens_per_sentence = []
  for sentence in sentences:
    tokens_per_sentence.append(tokenizer(sentence))
  return tokens_per_sentence

# Sentence boundaries do not matter for this exercise, so the per-sentence
# lists are merged into a single list.
def flatten(nested_list):
  """Concatenate the sub-lists of `nested_list` into one flat list, keeping order."""
  flat = []
  for sublist in nested_list:
    flat.extend(sublist)
  return flat

def tokenize_flatten_df(row, field):
  """Tokenize the text stored in `row[field]` and return its tokens as one flat list."""
  tokens_per_sentence = tokenize(row[field])
  return flatten(tokens_per_sentence)

import re

# strip urls
def remove_urls(text):
  """Return `text` without http:// or https:// URLs.

  A URL is taken to run from the scheme up to the next whitespace character.
  """
  url_pattern = re.compile(r"(https?\://)\S+")
  return url_pattern.sub("", text)

# strip mentions (@name), including an optional trailing ':' and one space
def remove_mentions(text):
  """Return `text` with every @mention deleted."""
  mention_pattern = re.compile(r"@[^:| ]+:? ?")
  return mention_pattern.sub("", text)

# drop a leading "RT:" (any letter case), if the tweet starts with it.
def remove_rt(text):
  """Return `text` without a leading "RT:" marker.

  When the marker is present, the remainder is also stripped of surrounding
  whitespace; otherwise `text` is returned untouched.
  """
  has_rt_prefix = text.lower().startswith("rt:")
  return text[3:].strip() if has_rt_prefix else text
def remove_urls_mentions_rt_df(row, field):
  """Return `row[field]` with URLs, then @mentions, then a leading "RT:" removed."""
  text = remove_urls(row[field])
  text = remove_mentions(text)
  return remove_rt(text)

def replace_hashtags_from_text(text):
  """Return `text` with each run of '#' (plus one following space, if any) deleted."""
  hash_pattern = re.compile(r"#+ ?")
  return hash_pattern.sub("", text)
# drop stand-alone "#" tokens
def replace_hashtags_from_list(tokens_list):
  """Return the tokens of `tokens_list` that are not exactly "#"."""
  kept = []
  for token in tokens_list:
    if token != "#":
      kept.append(token)
  return kept

# drop numeric tokens
def remove_digits(tokens_list):
  """Return the tokens of `tokens_list` that are not numbers.

  A token counts as a number when the whole of it is an optionally signed
  run of digits, optionally followed by '.' and more digits.
  """
  number_pattern = re.compile(r"[-+]?\d+(\.[0-9]*)?$")
  kept = []
  for token in tokens_list:
    if number_pattern.match(token) is None:
      kept.append(token)
  return kept

# keep only purely alphabetic tokens: anything containing a digit,
# punctuation or another non-letter character is dropped
def remove_containing_non_alphanum(tokens_list):
  """Return the tokens of `tokens_list` for which `isalpha()` is true."""
  kept = []
  for token in tokens_list:
    if token.isalpha():
      kept.append(token)
  return kept
# convert every token to lower case
def lowercase_list(tokens_list):
  """Return a new list holding the lower-cased form of each token."""
  lowered = []
  for token in tokens_list:
    lowered.append(token.lower())
  return lowered

from nltk.corpus import stopwords
nltk.download('stopwords')
# remove stopwords
def remove_stopwords(tokens_list):
  """Return the tokens of `tokens_list` that are not Portuguese stopwords.

  Tokens are compared to the NLTK stopword entries exactly as given, so the
  match is case-sensitive.

  Args:
    tokens_list: iterable of string tokens.

  Returns:
    A new list with the stopword tokens left out, order preserved.
  """
  # Fetch the stopword list once per call instead of once per token, and keep
  # it in a set so each membership test is a hash lookup.
  portuguese_stopwords = set(stopwords.words(u'portuguese'))
  return [token for token in tokens_list
          if token not in portuguese_stopwords]

import spacy
# Load spaCy's "pt_core_news_sm" pipeline with the "tagger", "parser" and
# "ner" components disabled; lemmatize() below runs single tokens through it.
# NOTE(review): whether lemmas are still produced with "tagger" disabled
# depends on the installed spaCy/model version -- verify.
nlp = spacy.load("pt_core_news_sm", disable=["tagger", "parser", "ner"])

def lemmatize(token):
    """Run `token` through `nlp` and return the lemma of the first resulting token."""
    doc = nlp(token)
    first_token = doc[0]
    return first_token.lemma_

def lemmatize_words(tokens_list):
    """Return a list with the lemma of every token in `tokens_list`."""
    lemmas = []
    for word in tokens_list:
        lemmas.append(lemmatize(word))
    return lemmas

# Runs the list of tokens stored in row[field] through the clean-up pipeline.
def clean_tokens(row, field):
  """Return the cleaned, stemmed tokens for `row[field]`.

  Steps, in order: lemmatize, lowercase, drop stopwords, drop tokens that
  are not purely alphabetic, drop numeric tokens, drop "#" tokens, stem.

  Args:
    row: mapping-like object (e.g. a DataFrame row) holding a token list.
    field: key of the token list inside `row`.

  Returns:
    A new list of stemmed tokens.
  """
  tokens = lemmatize_words(row[field])
  # Lowercase BEFORE the stopword filter: remove_stopwords compares tokens
  # as-is, so a capitalised stopword (e.g. at the start of a sentence) would
  # otherwise slip through. Presumably the stopword list is all lower-case.
  tokens = lowercase_list(tokens)
  tokens = remove_stopwords(tokens)
  tokens = remove_containing_non_alphanum(tokens)
  # The two filters below are kept for safety; after the alphabetic-only
  # filter they should have nothing left to remove.
  tokens = remove_digits(tokens)
  tokens = replace_hashtags_from_list(tokens)
  return stem(tokens)



In [None]:
# Strip URLs, @mentions and a leading "RT:" from each raw comment.
pt_comments['cleaned_comment'] = pt_comments.apply(
    lambda row: remove_urls_mentions_rt_df(row, 'comment'), axis=1)

# Tokenize the cleaned text rather than the raw comment, so the clean-up
# step above actually reaches the tokens.
pt_comments['text_tokenized'] = pt_comments.apply(
    lambda row: tokenize_flatten_df(row, 'cleaned_comment'), axis=1)

# Lemmatize, filter and stem the tokens.
pt_comments['tokens'] = pt_comments.apply(
    lambda row: clean_tokens(row, 'text_tokenized'), axis=1)

pt_comments.head()

## Running Naive Bayes classifiers

In [None]:
# Categories with few examples are merged into one 'other' bucket.
# The printout shows the categories that have more than 100 comments.
print(pt_comments.groupby(['category']).count().query("comment > 100"))
dominant_labels = [
    "boleto-payin", "borderless", "cost", "customer-support",
    "happy-customer", "interface", "limits", "payin-methods",
    "product-comms", "pt-content", "speed", "ted-payin",
    "verification",
]

# One label per comment, in row order: the category itself when it is one of
# the dominant ones, otherwise the catch-all label.
labels = [
    category if category in dominant_labels else "other_categories"
    for category in pt_comments["category"].values
]

In [None]:
# Generating corpus of texts with corresponding golden labels.
import numpy as np

# One document per comment, in row order: its cleaned tokens joined by single
# spaces. Reading the 'tokens' column directly avoids building a Series for
# every row and the unused loop index / row id.
corpus = [" ".join(tokens) for tokens in pt_comments['tokens']]

In [None]:
# We split the data into train/test sets so that the classifiers are scored on
# comments they were not trained on (this exposes overfitting). Another
# strategy would be to do cross-validation, as below.
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing; the fixed random_state keeps the
# split the same from run to run.
corpus_train, corpus_test, labels_train, labels_test = train_test_split(
   corpus, labels, test_size=0.20, random_state=1024)

In [None]:
#@title Get feature representation of documents
#    You can do it manually, just for fun, or we can already use some libs.
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Learn the vocabulary from the training corpus only, then encode both
# corpora against it as dense document-term count matrices.
vectorizer = CountVectorizer()
vectorizer.fit(corpus_train)

document_term_matrix_train = vectorizer.transform(corpus_train).toarray()
document_term_matrix_test = vectorizer.transform(corpus_test).toarray()

In [None]:
# Configuring evaluation function.
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def evaluate_classifier(classifier, X_train, X_test, y_train, y_test):
  """Fit `classifier` on the training data and print its test-set scores.

  Prints the accuracy and the classification report computed from the
  predictions on `X_test`.

  Args:
    classifier: estimator object exposing `fit(X, y)` and `predict(X)`.
    X_train: training feature matrix.
    X_test: test feature matrix.
    y_train: training labels.
    y_test: test labels.

  Returns:
    The same `classifier` object, after fitting, so callers can keep using
    the trained model.
  """
  classifier.fit(X_train, y_train)
  predicted_y_test = classifier.predict(X_test)
  print("Accuracy:", accuracy_score(y_test, predicted_y_test))
  print(classification_report(y_test, predicted_y_test))
  return classifier

In [None]:
#@title Run evaluation with MultinomialNB:
from sklearn.naive_bayes import MultinomialNB

# Bind the estimator before evaluating it: evaluate_classifier fits the
# object it is given, so `classifier` is the trained model whatever that
# function returns.
classifier = MultinomialNB()
evaluate_classifier(classifier,
                    document_term_matrix_train, document_term_matrix_test,
                    labels_train, labels_test)
classifier

In [None]:
#@title Run evaluation with Perceptron classifier:
from sklearn.linear_model import Perceptron
# Fit a Perceptron with default settings on the count features and print its
# accuracy and classification report on the test split.
evaluate_classifier(Perceptron(),
                    document_term_matrix_train, document_term_matrix_test,
                    labels_train, labels_test)

In [None]:
#@title Run evaluation with Logistic regression classifier:
from sklearn.linear_model import LogisticRegression
# Fit a LogisticRegression with default settings on the count features and
# print its accuracy and classification report on the test split.
# NOTE(review): the default iteration limit may be too low for this many
# features -- watch for a convergence warning.
evaluate_classifier(LogisticRegression(),
                    document_term_matrix_train, document_term_matrix_test,
                    labels_train, labels_test)

In [None]:
#@title Run evaluation with Linear SVM classifier:
from sklearn.svm import LinearSVC
# Fit a LinearSVC with default settings on the count features and print its
# accuracy and classification report on the test split.
evaluate_classifier(LinearSVC(),
                    document_term_matrix_train, document_term_matrix_test,
                    labels_train, labels_test)

Note: You can change the meta-parameters (hyperparameters) of the classifiers, e.g. avoid using the prior for MNB, change the regularization penalty for logistic regression, etc.
<br>The best meta-parameters are usually found on a separate tuning dataset using cross-validation
<br>(see [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) in scikit-learn or [Vizier](http://go/vizier) for Google internal optimization).

### Trying n-grams

In [None]:
# Check the results with n-grams.
# ngram_range=(1,2) makes the vectorizer count unigrams and bigrams.
ngram_vectorizer = CountVectorizer(ngram_range=(1,2))
# As with the unigram features: the vocabulary is learned from the training
# corpus only and then reused to encode the test corpus.
document_ngram_matrix_train = ngram_vectorizer.fit_transform(corpus_train).toarray()
document_ngram_matrix_test = ngram_vectorizer.transform(corpus_test).toarray()

# Same evaluation as the other cells, now on the unigram+bigram features.
evaluate_classifier(LogisticRegression(), 
                    document_ngram_matrix_train, document_ngram_matrix_test,
                    labels_train, labels_test)