In [33]:
import re
import string
import operator
import pickle
import time
import os
import tensorflow as tf

from collections import Counter
from itertools import accumulate
from functools import reduce

import nltk
import numpy as np
import pandas as pd
import ipyparallel as ipp
import matplotlib.pyplot as plt

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.backend import clear_session
from tensorflow.keras import backend as K
from IPython.display import display, HTML

from ipy_logger import create_logger

In [34]:
# Process-level environment flags (presumably to tolerate a duplicate OpenMP runtime
# and to quieten TensorFlow's native logging — confirm).
# NOTE(review): tensorflow is already imported in the imports cell before these are
# set, so they may be applied too late to take effect — TODO confirm / move earlier.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Number of cleaned tweets sampled per company by find_average_sentiment().
N_SENT_SAMPLES = 1000
# Columns kept from each raw tweets CSV by preprocess().
WANTED_COLS = ['Date', 'Hour', 'User Name', 'Tweet content', 'Is a RT', 'RTs']

In [35]:
@ipp.require('os')
def read_tweets(company):
    """Load the raw tweets CSV for one company.

    Reads at most the first 30000 rows of ``tweets/<company>.csv``; the path is
    relative to the working directory of the process that runs the function.

    Parameters
    ----------
    company : str
        Name used to build the CSV file name (e.g. a ticker symbol).

    Returns
    -------
    pandas.DataFrame
        The rows read from the file, unmodified.
    """
    csv_path = f'tweets/{company}.csv'
    return pd.read_csv(csv_path, nrows=30000)

In [36]:
@ipp.require(read_tweets, 're', 'string')
def preprocess(company):
    """Load a company's tweets, drop retweets and add a normalised text column.

    The returned frame contains the columns listed in ``WANTED_COLS`` (rows where
    ``'Is a RT'`` is False only) plus a ``'Clean Tweet'`` column in which:

    * @mentions, http(s) links and $cashtags are removed,
    * ``#`` characters are removed and the text is lower-cased,
    * runs of whitespace are collapsed to a single space,
    * a standalone leading-style ``rt`` marker is removed,
    * all ``string.punctuation`` characters are removed.

    Parameters
    ----------
    company : str
        Identifier passed to ``read_tweets`` to locate the CSV file.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of the tweets with the extra ``'Clean Tweet'`` column.
    """
    # Twitter handles may contain underscores; without '_' in the class a handle
    # such as '@john_doe' would leave '_doe' behind in the text.
    mention_regex = re.compile(r'@[A-Za-z0-9_]+')
    link_regex = re.compile(r'https?://[A-Za-z0-9./]+')
    symbol_regex = re.compile(r'\$[A-Za-z]+')
    punctuation_table = str.maketrans('', '', string.punctuation)

    def remove_patterns(tweet, rgxs):
        """Return ``tweet`` with every match of each compiled pattern removed."""
        cleaned = tweet
        for rgx in rgxs:
            cleaned = rgx.sub('', cleaned)
        return cleaned

    def clean_tweet(tweet):
        """Strip mentions/links/cashtags and '#', then lower-case the text."""
        clean = remove_patterns(tweet, [mention_regex, link_regex, symbol_regex])
        return clean.replace('#', '').lower()

    df = read_tweets(company)
    df = df[WANTED_COLS]
    # .copy() so that adding a column below writes to an independent frame rather
    # than to a slice of the frame returned by read_tweets (SettingWithCopyWarning).
    df = df[df['Is a RT'] == False].copy()

    # Missing tweet text is read by pandas as NaN (a float), which would make the
    # regex substitutions raise; treat it as an empty tweet instead.
    clean_text = df['Tweet content'].fillna('').apply(clean_tweet)

    # regex=True is explicit because pandas 2.0 changed the default of
    # Series.str.replace to literal matching, which would silently turn these
    # substitutions into no-ops. '\b' keeps the retweet-marker removal from
    # eating the end of ordinary words such as "start " or "short ".
    clean_text = (
        clean_text
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace(r'\brt\s+', '', regex=True)
        .str.translate(punctuation_table)
    )
    df['Clean Tweet'] = clean_text

    return df

### Analysis Tasks

1. Aggregate stats - number of tweets, average length, series of number of tweets/hour
2. Histogram of lemmas
3. Classification of the sentiment of each tweet with a model fitted on sentiment140, then summing up the negative & positive tweets (see FitSentModel.ipynb for the Keras code that fits the model)

## Task 1

In [37]:
def get_stats(df):
    """Summarise a tweets frame as a small labelled Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'Tweet content'``, ``'Date'`` and ``'RTs'``.

    Returns
    -------
    pandas.Series
        Named ``'Tweet Stats'`` with, in order: the row count, the total number
        of characters over all tweets, the ``min -> max`` of ``'Date'`` as a
        string, the mean tweet length and the mean of ``'RTs'`` (both rounded
        to 0 decimals).
    """
    tweet_lengths = df['Tweet content'].str.len()
    date_bounds = (df['Date'].min(), df['Date'].max())

    summary = pd.Series(
        {
            'Count': len(df),
            'Total Character Count': tweet_lengths.sum(),
            'Time Period': ' -> '.join(str(bound) for bound in date_bounds),
            'Average Tweet Length': round(tweet_lengths.mean(), 0),
            'Average Number of RTs': round(df['RTs'].mean(), 0),
        },
        name='Tweet Stats',
    )
    return summary

## Task 2

In [38]:
@ipp.require(Counter)
def get_top_lemmas(df, n):
    """Count lemmas over all cleaned tweets and return the most frequent ones.

    Each entry of ``df['Clean Tweet']`` is tokenised with ``TweetTokenizer`` and
    every token is passed through ``WordNetLemmatizer.lemmatize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Clean Tweet'`` text column.
    n : int
        Number of entries to keep (applied as a slice ``[:n]``).

    Returns
    -------
    dict
        Lemma -> count, ordered from most to least frequent; ties keep the
        order in which the lemmas were first seen.
    """
    wordnet = WordNetLemmatizer()
    tweet_tokenizer = TweetTokenizer()

    lemma_counts = Counter()
    for text in df['Clean Tweet']:
        lemma_counts.update(
            wordnet.lemmatize(token) for token in tweet_tokenizer.tokenize(text)
        )

    # Stable descending sort on the count, then keep the first n pairs.
    ranked = sorted(lemma_counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:n])


## Task 3

De-serialize the tokenizer used by the sentiment classifier (the classifier itself is loaded from `sentmodel.h5` further down)

In [39]:
# Tokenizer used to turn cleaned tweets into integer sequences for the sentiment
# model (presumably a fitted Keras Tokenizer saved by FitSentModel.ipynb — confirm).
# NOTE(review): pickle.load can execute arbitrary code while loading; only use a
# tokenizer.pkl that comes from a trusted source.
with open('tokenizer.pkl', 'rb') as f:
    keras_tokenizer = pickle.load(f)

# Passed as `maxlen` to pad_sequences: every token sequence is padded/truncated to
# this length before prediction.
MODEL_MAXCHARS = 40

In [40]:
def find_average_sentiment(df, sent_model=None, random_state=None):
    """Estimate the mean predicted sentiment class of a random sample of tweets.

    Up to ``N_SENT_SAMPLES`` entries of ``df['Clean Tweet']`` are sampled without
    replacement, converted to padded integer sequences with ``keras_tokenizer``
    and classified by ``sent_model``; the mean of the predicted class labels is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Clean Tweet'`` text column.
    sent_model : keras model, optional
        Already-loaded classifier. When omitted, ``sentmodel.h5`` is loaded from
        the current working directory.
    random_state : int or numpy random generator, optional
        Forwarded to ``Series.sample`` to make the sample reproducible. The
        default (None) keeps the sampling random, as before.

    Returns
    -------
    float
        Mean of the predicted class labels, or NaN when ``df`` has no rows.
    """
    if sent_model is None:
        # Imported locally so the function is self-contained when shipped to an
        # ipyparallel engine, whose namespace does not get the notebook's
        # top-level `load_model` import.
        from tensorflow.keras.models import load_model

        ## Make sure to launch ipcluster in same directory
        ## otherwise the root path will be different.
        sent_model = load_model('sentmodel.h5')

    # Series.sample raises if asked for more rows than exist, so cap the size.
    n_samples = min(N_SENT_SAMPLES, len(df))
    if n_samples == 0:
        return np.nan

    tweets = df['Clean Tweet'].sample(n_samples, random_state=random_state).values
    processed_test_data = pad_sequences(keras_tokenizer.texts_to_sequences(tweets),
                                        maxlen=MODEL_MAXCHARS)

    # `predict_classes` was removed from Keras in TensorFlow 2.6. Reproduce its
    # rule on top of `predict`: argmax for multi-unit outputs, 0.5 threshold for
    # a single-unit output.
    probabilities = np.asarray(sent_model.predict(processed_test_data))
    if probabilities.shape[-1] > 1:
        classified_sentiment = probabilities.argmax(axis=-1)
    else:
        classified_sentiment = (probabilities > 0.5).astype('int32')

    return np.mean(classified_sentiment)

#print(f'Average sentiment of {N_SAMPLES} tweets about {company} : {avg_sent}')

## Functions for Displaying Results

In [41]:
def visualise_top_lemmas(top_lemmas, company):
    %matplotlib inline
    hist = plt.figure(figsize=(10,5))
    plt.bar(top_lemmas.keys(), top_lemmas.values())
    plt.xticks(rotation='vertical')
    plt.xlabel('Lemmas')
    plt.ylabel('Frequency')
    plt.title(company.capitalize())
    plt.show()

def show_results(company, stats, top_lemmas, avg_sent):
    """Print and plot the analysis results for a single company.

    Parameters
    ----------
    company : str
        Company identifier; shown upper-cased in the heading.
    stats : pandas.Series
        Summary statistics, rendered as an HTML table.
    top_lemmas : dict
        Lemma -> count, passed to ``visualise_top_lemmas``.
    avg_sent : float
        Average sentiment value, printed last.
    """
    # Blank lines to separate this company's output from the previous one.
    print('\n' * 4)
    print(f'Results for {company.upper()}')
    print('Tweet Stats: ')

    stats_table_html = stats.to_frame().to_html()
    display(HTML(stats_table_html))

    visualise_top_lemmas(top_lemmas, company)
    print(f'Average sentiment of {N_SENT_SAMPLES} tweets: {avg_sent}')
    
def show_all(results):
    """Display the results of every company in ``results``.

    Parameters
    ----------
    results : dict
        Company -> sequence of positional arguments for ``show_results`` after
        the company name (stats, top lemmas, average sentiment).
    """
    for company in results:
        show_results(company, *results[company])

## Running tasks sequentially

In [42]:
# Companies to analyse; each entry is passed to preprocess(), which reads
# tweets/<ticker>.csv via read_tweets().
tickers = ['amzn', 'msft', 'nflx', 'googl']

In [None]:
# Time the whole pipeline when the tickers are processed one after another in
# this kernel.
start = time.time()

# ticker -> (stats, top_lemmas, avg_sent); the tuple layout is what show_all() unpacks.
results = {}
# Load the classifier once and reuse it for every ticker.
sent_model = load_model('sentmodel.h5')

for tick in tickers:
    print(tick.upper())
    df = preprocess(tick)
    print('Running Task 1')
    stats = get_stats(df)
    print('Running Task 2')
    top_lemmas = get_top_lemmas(df, 20)  # keep the 20 most frequent lemmas
    print('Running Task 3')
    avg_sent = find_average_sentiment(df, sent_model=sent_model)
    results[tick] = (stats, top_lemmas, avg_sent)

end = time.time()

print(f'Running Tasks sequentially took {end - start}')

In [None]:
# Render the per-company results collected by the sequential run.
show_all(results)

## Running in parallel

In [45]:
@ipp.require('os', preprocess, get_stats, get_top_lemmas, find_average_sentiment, create_logger)
def run_analysis(tick):
    """Run the three analysis tasks for one ticker.

    Written to be mapped over tickers on ipyparallel engines: the helper
    functions it calls are listed in ``@ipp.require``. No model is passed to
    ``find_average_sentiment``, so that function loads ``sentmodel.h5`` itself.

    Parameters
    ----------
    tick : str
        Ticker passed to ``preprocess``.

    Returns
    -------
    tuple
        ``(tick, (stats, top_lemmas, avg_sent))`` — a key/value pair, so a list
        of these can be turned into a dict.
    """
    tweets = preprocess(tick)

    # Per-ticker logging is currently disabled; it was:
    #   logger = create_logger(tick)
    #   logger.info(f'Created stats for {tick}')
    analysis = (
        get_stats(tweets),
        get_top_lemmas(tweets, 20),
        find_average_sentiment(tweets),
    )
    return tick, analysis

In [None]:
# Connect to the running ipyparallel cluster and take a view over all engines.
client = ipp.Client()
dview = client[:]
# Run the imports that the shipped functions rely on in every engine.
# block=True waits for the engines to finish and re-raises any remote error
# (e.g. a missing package) here, instead of discarding it with the async result.
dview.execute('''
              import pandas as pd
              import numpy as np
              
              from nltk.stem.wordnet import WordNetLemmatizer
              from nltk.tokenize import TweetTokenizer
              from tensorflow.keras.preprocessing.sequence import pad_sequences
              ''', block=True)

In [47]:
# Make the notebook-level names that preprocess() and find_average_sentiment()
# reference available through the view (presumably item assignment pushes each
# value into every engine's namespace — confirm against the ipyparallel docs).
dview['keras_tokenizer'] = keras_tokenizer
dview['N_SENT_SAMPLES'] = N_SENT_SAMPLES
dview['WANTED_COLS'] = WANTED_COLS
dview['MODEL_MAXCHARS'] = MODEL_MAXCHARS

In [None]:
# Time the same pipeline when run_analysis is mapped over the tickers through
# the ipyparallel view.
start = time.time()

# map_sync returns once all results are available; each result is a
# (tick, (stats, top_lemmas, avg_sent)) pair as returned by run_analysis.
ipp_results = dview.map_sync(run_analysis, tickers)

end = time.time()

print(f'Running Tasks in parallel took {end - start}')

In [None]:
# run_analysis returns (ticker, results) pairs, so dict() yields the
# ticker -> results mapping that show_all() expects.
show_all(dict(ipp_results))