In [0]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import json
from keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
%tensorflow_version 1.x

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by
# later cells (prompts for an authorization code when run).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Show the current working directory; relative paths in later cells resolve against it.
!pwd

/content


In [6]:
# Extract the zipped tokenizer from Drive into the working directory (produces tokenizer.json).
!unzip '/content/drive/My Drive/Colab Notebooks/tokenizer_json.zip'

Archive:  /content/drive/My Drive/Colab Notebooks/tokenizer_json.zip
  inflating: tokenizer.json          


In [0]:
url_tok = re.compile(r'https?://\S+\b|www\.[^ ]+') # sub with '<URL>'
mention_tok = re.compile(r'@\w+') #sub with '<USER>'
neg_tok = re.compile(r"n't\b") # sub with " not"
# Every character that is not an ASCII letter or one of < > : ; ( ) is replaced
# by a space as the final cleaning step.
non_alpha_tok = re.compile(r"[^a-zA-Z<>:;\(\)]")

def clean_tweet(tweet):
  """Normalise one raw tweet string before tokenization.

  Steps, in order:
    1. Decode HTML entities / strip markup with BeautifulSoup (e.g. &amp; -> &).
    2. Best-effort repair of text that round-trips latin-1 -> UTF-8; the text is
       kept as-is when that round trip is not possible.
    3. Lower-case the text.
    4. Replace "n't" with " not", URLs with '<URL>' and @mentions with '<USER>'.
    5. Replace every remaining character outside [a-zA-Z<>:;()] with a space.

  Args:
    tweet: the raw tweet text (str).

  Returns:
    The cleaned tweet as a str.
  """
  # cleans up html encoding, ex. &amp; -> &
  cleaned = BeautifulSoup(tweet, 'lxml').get_text()

  try:
    cleaned = bytes(cleaned, encoding='latin_1').decode('utf-8-sig').replace(u"\ufffd", "?")
  except UnicodeError:
    # Not representable in latin-1 or not valid UTF-8 afterwards: keep the
    # text unchanged (deliberate best-effort, only encoding errors are ignored).
    pass

  # Lower-case AFTER the byte-level repair (changing case first would alter the
  # bytes being re-decoded) but BEFORE the substitutions: the URL and "n't"
  # patterns only match lower-case text, and the upper-case placeholders
  # inserted below must survive. Previously the lower-cased value was
  # overwritten immediately, so no lower-casing took place at all.
  cleaned = cleaned.lower()

  cleaned = neg_tok.sub(" not", cleaned)
  cleaned = url_tok.sub('<URL>', cleaned)
  cleaned = mention_tok.sub('<USER>', cleaned)
  cleaned = non_alpha_tok.sub(" ", cleaned)

  return cleaned


In [0]:
# Padded sequence length for tokenized tweets
# (presumably the input length the model was trained with — TODO confirm).
max_len = 250

In [0]:
def clean_tweets_batch(tweets):
  """Run clean_tweet over every tweet and return the results as a list, in input order."""
  cleaned_batch = []
  for raw_tweet in tweets:
    cleaned_batch.append(clean_tweet(raw_tweet))
  return cleaned_batch

In [0]:
def load_tokenizer(json_file):
  """Rebuild a Keras Tokenizer from a JSON file on disk.

  The value parsed from the file is handed directly to tokenizer_from_json,
  so the file is expected to hold a JSON-encoded *string* containing the
  tokenizer configuration (i.e. the output of Tokenizer.to_json() written
  with json.dump / json.dumps) — NOTE(review): confirm against the code that
  saved the file.

  Args:
    json_file: path of the tokenizer JSON file.

  Returns:
    The object returned by tokenizer_from_json.
  """
  # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
  with open(json_file, encoding='utf-8') as j:
    return tokenizer_from_json(json.load(j))

In [0]:
# Notebook-level tokenizer, loaded from the tokenizer.json extracted into the working directory.
tokenizer = load_tokenizer('tokenizer.json')

In [0]:
def load_twitter_model(h5_file):
  """Load and return the saved model at `h5_file`; the path is passed straight to keras load_model."""
  twitter_model = load_model(h5_file)
  return twitter_model

In [0]:
# Notebook-level model, loaded from the .h5 file stored on the mounted Drive.
model = load_twitter_model('/content/drive/My Drive/Colab Notebooks/twit-sent-model-orig-glove-embed-lstm-dense-20-epochs.h5')

In [0]:
def predict_with_threshold(probs, threshold):
  """Bucket probabilities into positive / negative / ignored using a confidence band.

  A probability p is counted as positive when p >= threshold, as negative when
  p <= 1 - threshold, and is ignored otherwise. With threshold == 0.5 a
  probability of exactly 0.5 is counted as positive (the first test wins).

  Args:
    probs: probabilities to classify; must support len() and iteration.
    threshold: confidence threshold, at least 0.5.

  Returns:
    [percent_positive, percent_ignored], both as fractions:
      percent_positive: positives / (positives + negatives); NaN when nothing
        was classified (every value fell in the ignored band, or probs is empty).
      percent_ignored: ignored / len(probs); 0.0 when probs is empty.

  Raises:
    ValueError: if threshold is below 0.5.
  """
  if threshold < 0.5:
    raise ValueError("Threshold must be 0.5 or greater")

  # Lower edge of the ignored band; computed once instead of per element.
  negative_cutoff = 1 - threshold

  positive = 0
  negative = 0

  for prob in probs:
    if prob >= threshold:
      positive += 1
    elif prob <= negative_cutoff:
      negative += 1

  length = len(probs)
  total = positive + negative

  # Guard both divisions: an undefined ratio is reported as NaN instead of
  # raising ZeroDivisionError.
  percent_positive = positive / total if total else float("nan")
  percent_ignored = (length - total) / length if length else 0.0

  return [percent_positive, percent_ignored]

In [0]:
def predict_tweets_batch(tweets, metric='weighted', threshold=0.5):
  """Score a batch of tweets with the notebook-level tokenizer and model.

  Args:
    tweets: iterable of raw tweet strings. It is never modified.
    metric: 'weighted' returns the mean predicted probability;
      'category' buckets predictions via predict_with_threshold.
    threshold: confidence threshold, only used when metric == 'category'.

  Returns:
    For 'weighted': [mean probability, 0.0].
    For 'category': the list returned by predict_with_threshold(probs, threshold).

  Raises:
    ValueError: if metric is not 'weighted' or 'category' (checked before any
      prediction work is done).
  """
  if metric not in ('weighted', 'category'):
    raise ValueError("Metric types are ['weighted', 'category']")

  # Work on a copy so the padding tweet added below never leaks into the
  # caller's list.
  batch = list(tweets)

  # A single-tweet batch is padded with a throw-away second tweet whose
  # prediction is dropped again below.
  # NOTE(review): the reason the model needs at least two inputs is not
  # visible in this notebook.
  added_tweet = len(batch) == 1
  if added_tweet:
    batch.append("Append tweet")

  sequences = tokenizer.texts_to_sequences(clean_tweets_batch(batch))
  # Use the notebook-level max_len instead of repeating the literal 250.
  data = pad_sequences(sequences, maxlen=max_len)

  # predict() is used rather than Sequential-only predict_proba(), which is
  # absent from non-Sequential models and from newer Keras releases.
  probs = model.predict(data)

  if added_tweet:
    probs = probs[:-1]

  if metric == 'weighted':
    return [np.mean(probs, dtype=np.float64), 0.0]
  return predict_with_threshold(probs, threshold)

In [28]:
# 'category' metric at the default threshold of 0.5: result is [fraction positive, fraction ignored].
predict_tweets_batch(["This is a single, well-written tweet", "This is a bad tweet!"], metric='category')

[0.5, 0.0]

In [29]:
# 'weighted' metric: result is [mean predicted probability, 0.0].
predict_tweets_batch(["This is a single, well-written tweet", "This is a bad tweet!"], metric='weighted')

[0.46066431887447834, 0.0]