# BUSS6002 Week 11 Python Material

In [None]:
# Import and setup
%pylab notebook
import plotly.offline as py
import plotly.graph_objs as go
import numpy as np
import pandas as pd
py.init_notebook_mode()

# Python Interlude I: Rules Based Text Analytics
Here, we'll cover the practical aspects of text analytics that *don't* involve statistics, machine learning or models.

In [None]:
# Text analysis in SQL is possible, but not pretty

import sqlite3
import pandas as pd
# Widen displayed text columns so long tweets are not truncated.
pd.set_option('display.max_colwidth', 160)

# Keyword search done entirely in SQL: keep tweets whose text contains any of
# the three substrings.
my_query = 'SELECT tweet_id, name, text FROM Tweets WHERE text LIKE "%wait%" OR text like "%delay%" OR text like "%late%"'
con = sqlite3.connect('data/kaggle_airline_twitter.sqlite')
df_tweets_sql = pd.read_sql_query(sql=my_query, con=con)

n_found = len(df_tweets_sql)
print("Found {} matching tweets".format(n_found))
df_tweets_sql.head(5)

### Very simple text search in python
- We can treat a dataframe column as a string by using "str", and use the "contains" function. This gives us a true/false vector indicating which rows have a match in that column.
- Using "loc", we can select only those rows which are "True".

In [None]:
# Boolean mask: True for each tweet whose text contains the substring "bags".
matches_bag = df_tweets_sql['text'].str.contains('bags')
# Keep only the rows flagged by the mask (all columns).
df_tweets_sql[matches_bag]

## A better way for text analysis: Python's NLTK package

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.lancaster import LancasterStemmer

# One instance of each stemming algorithm to compare.
porter_stemmer = PorterStemmer()
snowball_stemmer = EnglishStemmer()
lancaster_stemmer = LancasterStemmer()

# Sample vocabulary built around the three search keywords.
words = pd.Series(["wait", "waiter", "waited", "delay", "delays", "delayed", "delaying", "late", "later"])

# Column label -> stemmer, in the order the columns should appear.
stemmers = {
    "Porter": porter_stemmer,
    "Snowball (Porter 2)": snowball_stemmer,
    "Lancaster": lancaster_stemmer,
}

# Side-by-side table: the original word followed by each stemmer's output.
pd.DataFrame({
    "Original": words,
    **{label: words.apply(stemmer.stem) for label, stemmer in stemmers.items()},
})

### Repeat the "delayed flight" search, with NLTK
- First stem all the words in the tweets, then search for stemmed versions of the words used in the SQL query.
- Then merge the sql search approach with the stemmed approach, and identify where different decisions were made.

In [None]:
import nltk
# Tokeniser data for nltk.word_tokenize (used by stem_tweet below).
# Older NLTK releases load the 'punkt' models; recent releases look for the
# 'punkt_tab' resource instead, so fetch both to keep the notebook runnable
# on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Load every tweet (not only the keyword matches) into a pandas dataframe.
con = sqlite3.connect('data/kaggle_airline_twitter.sqlite')
df_tweets = pd.read_sql_query(
    sql='SELECT tweet_id, tweet_created, name, text FROM Tweets',
    con=con,
)

def stem_tweet(tweet):
    """Return `tweet` with every token replaced by its stem.

    The text is split into tokens with ``nltk.word_tokenize``, each token is
    passed through the module-level ``snowball_stemmer``, and the resulting
    stems are joined back together with single spaces.
    """
    stems = map(snowball_stemmer.stem, nltk.word_tokenize(tweet))
    return ' '.join(stems)
    
# Add a column holding the stemmed version of each tweet's text.
df_tweets['stemmed_text'] = [stem_tweet(tweet) for tweet in df_tweets['text']]
df_tweets.head(5)

In [None]:
# Stem each search term, then run the search.
search_terms = ['wait', 'delay', 'late']
search_terms = [snowball_stemmer.stem(term) for term in search_terms]
# Anchor the alternation on word boundaries so a search stem only matches a
# whole stemmed token. A bare 'wait|delay|late' is a substring search and
# would still hit fragments of longer tokens (e.g. "late" inside "plate").
search_pattern = r'\b(?:' + '|'.join(search_terms) + r')\b'
df_tweets['is_match'] = df_tweets.stemmed_text.str.contains(search_pattern)
# Number of tweets matching at least one search stem.
print(df_tweets.is_match.sum())
df_tweets.head()

# Stemming vs Simple Search
- The simple search would match words like "chocolate" for search term "late", whereas stemming does not.
- But we discover an awkward, yet very commonly occurring, problem: the data set has been tampered with but is presented as clean / raw data (clearly every instance of "late" has been replaced by "Late Flight").

In [None]:
# Left-join the SQL hits onto the full table. Overlapping column names get
# _x (full table) and _y (SQL result) suffixes; rows without a SQL hit have
# nulls in the _y columns.
merged = pd.merge(df_tweets, df_tweets_sql, how='left', on='tweet_id')
merged = merged.assign(is_match_sql=merged['name_y'].notnull())
df_tweets_merged = merged.filter(['text_x', 'is_match', 'is_match_sql'])

# Show tweets where the stemmed search and the SQL search disagree.
disagreements = df_tweets_merged[
    df_tweets_merged['is_match'] != df_tweets_merged['is_match_sql']
]
display(disagreements.head(10))

## Rules Based Search III: Regex
- Can represent highly complex deterministic queries.
- Is very difficult to learn/interpret (easiest if you can see results in real time, e.g. http://www.regexr.com/)

      '.*[ ]happy.*' # Simple regex searching for "happy" preceded by a space (to avoid "unhappy")
    

Example: Functional date validator in format dd/mm/yyyy, dd-mm-yyyy or dd.mm.yyyy. It allows leading zeros but does not require them.
      
      ^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$

In [None]:
# "happy" preceded by a space, so that "unhappy" is not matched.
regexp = '.*[ ]happy.*'
has_happy = df_tweets['text'].str.contains(regexp)
# First ten matching tweets, showing only the author and the text.
df_tweets.loc[has_happy, ['name', 'text']].head(10)

In [None]:
# Loose date check for dd/mm/yyyy or dd-mm-yyyy: day 1-31, month 1-12,
# four-digit year, leading zeros optional. It knows nothing about month
# lengths, so impossible dates such as 30/02/1988 are still accepted.
# Written as a raw string so the regex backslashes (\/ \- \d) are not parsed
# as invalid Python string escapes.
regexp_1 = r'^(0?[1-9]|[12][0-9]|3[01])[\/\-](0?[1-9]|1[012])[\/\-]\d{4}$'
# Test inputs: valid dates, a non-date, and two impossible February dates.
possible_dates = pd.Series(['01/01/2015', '1/1/2000', 'raspberry', '15-05-2017', '30/02/1988', '31/02/1988'])
# Show each candidate next to whether regexp_1 accepts it.
(possible_dates
 .to_frame()
 .assign(regexp_1 = possible_dates.str.contains(regexp_1))
)

In [None]:
# Stricter dd/mm/yyyy check: '/' separators only, two-digit day and month
# (leading zeros required), years 1900-9999, and day ranges that depend on the
# month. 29/02 is accepted only when the last two year digits are a multiple
# of 4, so century years such as 1900 are still treated as leap years.
# Raw string: keeps the regex backslashes from being parsed as Python escapes.
# The 30-day-month class is [469]; writing it as [4,6,9] would also match a
# literal comma.
regexp_2 = r'(^(((0[1-9]|1[0-9]|2[0-8])[\/](0[1-9]|1[012]))|((29|30|31)[\/](0[13578]|1[02]))|((29|30)[\/](0[469]|11)))[\/](19|[2-9][0-9])\d\d$)|(^29[\/]02[\/](19|[2-9][0-9])(00|04|08|12|16|20|24|28|32|36|40|44|48|52|56|60|64|68|72|76|80|84|88|92|96)$)'
# Compare both patterns on the same candidates.
(possible_dates
 .to_frame()
 .assign(regexp_1 = possible_dates.str.contains(regexp_1), regexp_2 = possible_dates.str.contains(regexp_2))
)

## Rules Based Text Analysis: Sentiment Analysis
> "VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media."

- Be very careful with rules based sentiment (typically assigning positive/negative points for each word, and adding them up). If a meal in a restaurant is "to die for", that's a good thing!
- VADER is specifically tuned for Twitter (which is a very distinctive data set), so we're pretty safe here...
- VADER has approx 600 lines of code, and a "lexicon" (dictionary) of over 7,000 terms.

https://github.com/cjhutto/vaderSentiment

In [None]:
import nltk
# Fetch the 'vader_lexicon' resource with NLTK's downloader
# (presumably required by the SentimentIntensityAnalyzer created in the next cell).
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single analyser instance; the scoring cell calls sent.polarity_scores() on each tweet.
sent = SentimentIntensityAnalyzer()

In [None]:
# Score each tweet with the analyser's 'compound' polarity value, then order
# the frame from the highest score to the lowest.
compound_scores = df_tweets['text'].apply(lambda tweet: sent.polarity_scores(tweet)['compound'])
df_tweets = (
    df_tweets
    .assign(sentiment=compound_scores)
    .sort_values('sentiment', ascending=False)
)
# Every 1000th row of the sorted frame: a quick sample across the score range.
df_tweets[['text', 'sentiment']].iloc[::1000]

# Model Based Text Analytics

## Supervised N-Gram Models

In [None]:
## Split into training and test set by dates

figure(figsize=(10,3))
df_tweets['date'] = pd.to_datetime(df_tweets['tweet_created'])

# Tweets before the cut-off date are training data; the rest are test data.
split_date = '2015-02-23'
is_train = df_tweets['date'] < split_date
is_test = df_tweets['date'] >= split_date
df_tweets_train = df_tweets.loc[is_train]
df_tweets_test = df_tweets.loc[is_test]

# Both histograms are drawn after the single figure() call above.
df_tweets_train.date.hist(bins=15)
df_tweets_test.date.hist(bins=5)

In [None]:
from sklearn.pipeline import Pipeline                         
#Pipeline of transforms with a final estimator
from sklearn.feature_extraction.text import CountVectorizer   
# Convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import TfidfTransformer  
# Transform count matrix to a normalized tf-idf representation
from sklearn.linear_model import SGDClassifier                
# regularized linear models with stochastic gradient descent 
from sklearn.model_selection import RandomizedSearchCV

# Token counts -> tf-idf weighting -> linear classifier trained with SGD.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Search space for the randomized search. Keys follow scikit-learn's
# '<step name>__<parameter name>' convention for pipeline parameters.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1),),  # unigrams only; add (1, 2) to also try bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # SGDClassifier's number of passes over the data is `max_iter`; the old
    # `n_iter` name no longer exists in scikit-learn, and an unknown
    # parameter name makes the search fail when it is fitted.
    'clf__max_iter': (10, 50, 80),
}

# Try 10 random parameter combinations, scored by ROC AUC, using all cores.
# (This n_iter is RandomizedSearchCV's own "number of candidates" argument.)
pipeline_search = RandomizedSearchCV(pipeline,
                                     parameters,
                                     n_iter=10,
                                     scoring='roc_auc',
                                     n_jobs=-1,
                                     verbose=1)

In [None]:
# Assume that sentiment classifier above is the true label:
# a tweet counts as positive when its sentiment score exceeds 0.5.
sent_train = df_tweets_train['sentiment'] > 0.5
pipeline_search.fit(df_tweets_train['text'], sent_train)

# Predictions of the fitted search on both splits.
sent_pred_train, sent_pred_test = (
    pipeline_search.predict(split['text'])
    for split in (df_tweets_train, df_tweets_test)
)

In [None]:
from sklearn import metrics
# Reference labels for the test split, using the same 0.5 sentiment threshold.
sent_test = df_tweets_test.sentiment.gt(0.5)
test_report = metrics.classification_report(sent_test, sent_pred_test)
test_confusion = metrics.confusion_matrix(sent_test, sent_pred_test)

print("Test set classification report")
print(test_report)

print("Test set confusion matrix")
print(test_confusion)

In [None]:
# Fitted vectoriser (the pipeline step named 'vect') from the best pipeline found.
vect = pipeline_search.best_estimator_.named_steps['vect']
vect

In [None]:
# Fitted classifier (the pipeline step named 'clf') from the best pipeline found.
clf = pipeline_search.best_estimator_.named_steps['clf']
clf

In [None]:
# Investigate results

# vocabulary_ maps term -> feature index. Turn it into a frame ordered by
# feature index so that row i lines up with the i-th flattened coefficient.
vocab = vect.vocabulary_
df_vocab = pd.DataFrame({
    'feature_index': list(vocab.values()),
    'term': list(vocab.keys()),
})
df_vocab = df_vocab.set_index('feature_index').sort_index()
df_vocab['coefficient'] = clf.coef_.flatten()
df_vocab = df_vocab.sort_values('coefficient')

# Number of space-separated words in each term.
df_vocab['ngram'] = [len(term.split(' ')) for term in df_vocab['term']]

# Sorted ascending, so the head holds the smallest coefficients.
display(df_vocab.head(10))


In [None]:
# Last ten rows of df_vocab (sorted ascending by coefficient, so these are the largest).
display(df_vocab.iloc[-10:])

In [None]:
# Twenty randomly chosen vocabulary rows.
df_vocab.sample(n=20)

## Word Embedding
- In n-gram models we counted the number of occurrences of each word (n-gram) and identified each feature by an arbitrary index number that carries no meaning.
- Word embedding aims to represent meaning with the assigned numbers.

> What if we could assign words a feature number (or vector of numbers), where similar words got similar numbers?

**Word2Vec** takes a text corpus as input and produces the word vectors as output. 

It first constructs a vocabulary from the training text data and then learns vector representation of words.
The resulting word vector file can be used as features in many natural language processing and machine learning applications.

> What if we could do maths with those vectors? 
> e.g. King - Man + Woman = Queen

In [None]:
# Train a Word2Vec model on the tweets to learn a vector for each word

from gensim.models import Word2Vec
# Each tweet becomes a list of tokens, split on single spaces.
tokenised_tweets = df_tweets['text'].apply(lambda tweet: tweet.split(' '))
wv_model = Word2Vec(sentences=tokenised_tweets)
wv_model

In [None]:
# Investigate the learned representations by comparing hand-picked word pairs.

df_words = pd.DataFrame({'word_a': ['delta', 'amazing', 'ridiculous', 'transfer', 'transfer', 'reimburse', 'reimburse'],
                         'word_b': ['united', 'wonderful', 'horrible', 'plane', 'change', 'claim', 'dog']})

# The trained vectors live on the model's `wv` attribute. Calling
# similarity() or indexing on the Word2Vec model object itself is deprecated
# and no longer works in gensim 4, whereas `wv` works in old and new releases.
word_vectors = wv_model.wv

# Similarity score for each pair, plus the raw vector of each word.
df_words['similarity'] = df_words.apply(lambda r: word_vectors.similarity(r.word_a, r.word_b), axis=1)
df_words['word_a_vec'] = df_words.word_a.apply(lambda w: word_vectors[w])
df_words['word_b_vec'] = df_words.word_b.apply(lambda w: word_vectors[w])

In [None]:
# Results: one row per word pair, with its similarity score and both word vectors

df_words