In [0]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import json
from keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import datetime
import time
import math
import matplotlib.pyplot as plt
import os
%matplotlib inline
%tensorflow_version 1.x

In [2]:
# Mount Google Drive so the project files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
!unzip '/content/drive/My Drive/Colab Notebooks/project/tokenizer_json.zip'

Archive:  /content/drive/My Drive/Colab Notebooks/project/tokenizer_json.zip
  inflating: tokenizer.json          


In [0]:
def create_output_dir(path):
    """Create the directory ``path`` (including missing parents) if needed.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of testing
    # os.path.exists() first, and is a no-op when the directory is present.
    os.makedirs(path, exist_ok=True)

In [0]:
url_tok = re.compile(r'https?://\S+\b|www\.[^ ]+') # sub with '<URL>'
mention_tok = re.compile(r'@\w+') #sub with '<USER>'
neg_tok = re.compile(r"n't\b") # sub with " not"
space_tok = re.compile(r" {2,}") # sub with " "
# Raw string on purpose: "\(" and "\)" are invalid escapes in a plain literal.
char_tok = re.compile(r"[^a-zA-Z<>:;\(\)]") # sub with " "

def clean_tweet(tweet):
  """Normalise a raw tweet string into the text form fed to the tokenizer.

  Steps, in order: lower-case, strip HTML markup/entities, attempt to repair
  UTF-8 text that was mis-decoded as Latin-1, expand "n't" to " not",
  replace URLs with '<URL>' and @mentions with '<USER>', replace every
  character other than letters and ``< > : ; ( )`` with a space, and
  collapse runs of spaces.

  Args:
    tweet: The raw tweet text (str).

  Returns:
    The cleaned tweet text (str).
  """
  cleaned = tweet.lower()

  # cleans up html encoding, ex. &amp; -> &
  cleaned = BeautifulSoup(cleaned, 'lxml').get_text()

  try:
    cleaned = bytes(cleaned, encoding='latin_1').decode('utf-8-sig').replace(u"\ufffd", "?")
  except UnicodeError:
    # The text is not Latin-1 mojibake (it cannot be encoded as Latin-1 or
    # the bytes are not valid UTF-8): keep it unchanged. Only encoding
    # errors are swallowed; anything else propagates.
    pass

  cleaned = neg_tok.sub(" not", cleaned)
  cleaned = url_tok.sub('<URL>', cleaned)
  cleaned = mention_tok.sub('<USER>', cleaned)
  cleaned = char_tok.sub(" ", cleaned)
  cleaned = space_tok.sub(" ", cleaned)

  return cleaned


In [0]:
# Presumably the padded token-sequence length the model expects.
# This must be 250 for our model
max_len = 250

In [0]:
def clean_tweets_batch(tweets):
  """Run ``clean_tweet`` over every tweet and return the results as a list."""
  return list(map(clean_tweet, tweets))

In [0]:
def load_tokenizer(json_file):
  """Rebuild a Keras ``Tokenizer`` from a JSON file.

  Args:
    json_file: Path of the JSON file to read.

  Returns:
    The tokenizer produced by ``tokenizer_from_json`` from the parsed file
    content.
  """
  with open(json_file) as handle:
    payload = json.load(handle)
  return tokenizer_from_json(payload)

In [0]:
# Module-level tokenizer used by predict_tweets_batch; read from the
# tokenizer.json extracted by the unzip cell.
tokenizer = load_tokenizer('tokenizer.json')

In [0]:
def load_twitter_model(h5_file):
  """Load the saved Keras model stored at ``h5_file`` and return it."""
  loaded = load_model(h5_file)
  return loaded

In [10]:
# Module-level model used by predict_tweets_batch, loaded from the .h5 file on Drive.
model = load_twitter_model('/content/drive/My Drive/Colab Notebooks/project/twit-sent-model-orig-glove-embed-lstm-dense-20-epochs.h5')





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [0]:
def predict_with_threshold(probs, threshold):
  """Summarise probabilities as a positive share using a confidence threshold.

  A probability counts as positive when it is ``>= threshold`` and as
  negative when it is ``<= 1 - threshold``; everything in between is
  ignored. Positive takes precedence when both conditions hold (only
  possible for a value of exactly 0.5 with ``threshold == 0.5``).

  Args:
    probs: Sequence or array of probabilities; it is flattened, so an
      ``(n, 1)`` array is treated as ``n`` values.
    threshold: Confidence threshold, 0.5 or greater.

  Returns:
    ``[percent_positive, percent_ignored]``. ``percent_positive`` is the
    share of positives among the counted values and is NaN when no value
    was counted. ``percent_ignored`` is the share of values that were
    neither positive nor negative (0.0 for empty input).

  Raises:
    ValueError: If ``threshold`` is below 0.5.
  """
  if threshold < 0.5:
    raise ValueError("Threshold must be 0.5 or greater")

  flat = np.asarray(probs, dtype=np.float64).ravel()
  length = flat.size
  if length == 0:
    # Nothing to classify: avoid dividing by zero.
    return [float('nan'), 0.0]

  is_positive = flat >= threshold
  is_negative = ~is_positive & (flat <= 1 - threshold)

  positive = int(np.count_nonzero(is_positive))
  total = positive + int(np.count_nonzero(is_negative))

  percent_ignored = (length - total) / length
  if total == 0:
    # Every value fell inside the ignored band.
    return [float('nan'), percent_ignored]

  return [positive / total, percent_ignored]

In [0]:
def predict_tweets_batch(tweets, metric='weighted', threshold=0.5):
  """Score a batch of tweets with the module-level ``tokenizer`` and ``model``.

  Args:
    tweets: Iterable of raw tweet strings (list, NumPy array, ...). It is
      copied, so the caller's object is never modified.
    metric: 'weighted' returns the mean predicted probability; 'category'
      delegates to ``predict_with_threshold``.
    threshold: Confidence threshold, used only for the 'category' metric.

  Returns:
    A two-element list ``[positivity, percent_ignored]``. For 'weighted'
    the second element is always 0.0.

  Raises:
    ValueError: If ``metric`` is not 'weighted' or 'category'.
  """
  # Fail fast, before tokenizing and running the model.
  if metric not in ('weighted', 'category'):
    raise ValueError("Metric types are ['weighted', 'category']")

  # Work on a private list: appending to the argument would mutate the
  # caller's list, and would fail outright for a NumPy array.
  batch = list(tweets)

  # A single-tweet batch is padded with a dummy tweet whose probability is
  # dropped again below.
  # NOTE(review): presumably a workaround for single-sample prediction - confirm.
  added_tweet = False
  if len(batch) == 1:
    batch.append("Append tweet")
    added_tweet = True

  sequences = tokenizer.texts_to_sequences(clean_tweets_batch(batch))
  data = pad_sequences(sequences, maxlen=max_len)

  probs = model.predict_proba(data)

  if added_tweet:
    probs = probs[:-1]

  if metric == 'weighted':
    return [np.mean(probs, dtype=np.float64), 0.0]
  return predict_with_threshold(probs, threshold)

In [13]:
predict_tweets_batch(["damn ☹️ @packers lol *sigh*", "Well the Packers lose this one by 29 but we’ll"], metric='category')

[0.0, 0.0]

In [118]:
predict_tweets_batch(["I have having a great day"], metric='weighted')

[0.9984006285667419, 0.0]

In [0]:
def get_tweets_in_df(ff):
  """Read a comma-separated UTF-32 tweet dump into a DataFrame.

  The file content is split on ',' and the final piece is discarded. For
  each remaining piece the first 19 characters become 'time' and everything
  from character 20 onwards becomes 'text'.

  Args:
    ff: Path of the UTF-32 encoded file.

  Returns:
    A DataFrame with 'time' and 'text' columns, one row per piece.
  """
  with open(ff, encoding='utf-32') as handle:
    chunks = handle.read().split(',')

  records = []
  for chunk in chunks[:-1]:
    records.append({'time': chunk[:19], 'text': chunk[20:]})
  return pd.DataFrame(records)

In [15]:
!unzip week-12-13-tweets.zip
!unzip week-12-13-gametimes.zip

Archive:  week-12-13-tweets.zip
  inflating: week-12_LAR_BAL/week-12_LARvBAL_BAL-tweets.csv  
  inflating: week-12_LAR_BAL/week-12_LARvBAL_LAR+BAL-tweets.csv  
  inflating: week-12_LAR_BAL/week-12_LARvBAL_LAR-tweets.csv  
  inflating: week-12_NE_DAL/week-12_NEvDAL_DAL-tweets.csv  
  inflating: week-12_NE_DAL/week-12_NEvDAL_NE+DAL-tweets.csv  
  inflating: week-12_NE_DAL/week-12_NEvDAL_NE-tweets.csv  
  inflating: week-12_SF_GB/week-12_SFvGB_GB-tweets.csv  
  inflating: week-12_SF_GB/week-12_SFvGB_SF+GB-tweets.csv  
  inflating: week-12_SF_GB/week-12_SFvGB_SF-tweets.csv  
  inflating: week-13_DAL_BUF/week-13_DALvBUF_BUF-tweets.csv  
  inflating: week-13_DAL_BUF/week-13_DALvBUF_DAL+BUF-tweets.csv  
  inflating: week-13_DAL_BUF/week-13_DALvBUF_DAL-tweets.csv  
  inflating: week-13_DET_CHI/week-13_DETvCHI_CHI-tweets.csv  
  inflating: week-13_DET_CHI/week-13_DETvCHI_DET+CHI-tweets.csv  
  inflating: week-13_DET_CHI/week-13_DETvCHI_DET-tweets.csv  
  inflating: week-13_NO_ATL/week-13_NOvATL

In [0]:
def read_csv_tweets_to_df(ff):
  """Read a UTF-32 encoded tweet CSV into a DataFrame.

  The first line is treated as a header and skipped. Each remaining line is
  split on ','; only lines with exactly three fields are kept.
  NOTE(review): lines whose text itself contains a comma are therefore
  dropped - confirm this is intended.

  Args:
    ff: Path of the UTF-32 encoded CSV file.

  Returns:
    A DataFrame with string columns 'time', 'text' and 'subject'. The
    columns are present even when no line qualifies.
  """
  columns = ['time', 'text', 'subject']

  with open(ff, encoding='utf-32') as ff_:
    rows = ff_.read().split('\n')

  records = []
  for row in rows[1:]:
    fields = row.split(',')
    if len(fields) != len(columns):
      continue
    records.append(dict(zip(columns, fields)))

  # Explicit columns keep the schema stable for an empty result.
  return pd.DataFrame(records, columns=columns)

In [0]:
# Load the SF tweets for the week-12 SF/GB game (per the file name).
sf_tweets_df = read_csv_tweets_to_df("/content/week-12_SF_GB/week-12_SFvGB_SF-tweets.csv")

In [22]:
sf_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19825 entries, 0 to 19824
Data columns (total 3 columns):
time       19825 non-null object
text       19825 non-null object
subject    19825 non-null object
dtypes: object(3)
memory usage: 464.8+ KB


In [0]:
# Load the game-clock to real-time table for the week-12 SF/GB game (per the file name).
sf_gb_time_df = pd.read_csv("/content/58073_SF_GB_week-12.csv")

In [24]:
sf_gb_time_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693 entries, 0 to 692
Data columns (total 4 columns):
quarter        693 non-null object
clock          693 non-null object
real_time      693 non-null object
is_halftime    693 non-null bool
dtypes: bool(1), object(3)
memory usage: 17.0+ KB


In [0]:
def convert_datetime_str_to_datetime(dt_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS[.fraction]' string into a datetime.

    The string is split on a single space into a date and a time part; any
    fractional seconds after the first '.' in the time part are discarded.

    Args:
        dt_str: The date-time string.

    Returns:
        A ``datetime.datetime`` with second resolution.
    """
    pieces = dt_str.split(" ")
    date_part = pieces[0]
    clock_part = pieces[1].split(".")[0]

    date_fields = date_part.split("-")
    clock_fields = clock_part.split(":")

    # Year, month, day followed by hour, minute, second.
    numbers = [int(date_fields[i]) for i in range(3)]
    numbers += [int(clock_fields[i]) for i in range(3)]

    return datetime.datetime(*numbers)

In [0]:
def convert_datetime_str_to_float(dt_str):
  """Convert a date-time string to a float timestamp.

  The string is parsed with ``convert_datetime_str_to_datetime`` and passed
  through ``time.mktime``, which interprets the value as local time.
  """
  parsed = convert_datetime_str_to_datetime(dt_str)
  return time.mktime(parsed.timetuple())

In [0]:
def get_seconds_remaining_list(df):
  """Compute game seconds remaining for every non-halftime row of ``df``.

  The first column is read as the quarter and the second as an 'MM:SS'
  clock. Rows whose first column equals "Halftime" are skipped. Each quarter
  is counted as 900 seconds, with four quarters in total.

  Args:
    df: DataFrame whose first two columns are quarter and clock.

  Returns:
    A list of integer seconds remaining, in row order.
  """
  def to_seconds(quarter, clock):
    qtr = int(quarter)
    parts = clock.split(':')
    return (4 - qtr) * 900 + int(parts[0]) * 60 + int(parts[1])

  return [
    to_seconds(record[0], record[1])
    for record in df.itertuples(index=False)
    if record[0] != "Halftime"
  ]

In [0]:
def get_float_time_list(df):
  """Return float timestamps (third column) for every non-halftime row.

  Rows whose first column equals "Halftime" are skipped; the third column
  is converted with ``convert_datetime_str_to_float``.
  """
  float_times = []
  for record in df.itertuples(index=False):
    if record[0] == "Halftime":
      continue
    float_times.append(convert_datetime_str_to_float(record[2]))
  return float_times

In [0]:
def create_seconds_remaining_float_time_df(game_time_df):
  """Map each distinct seconds-remaining value to its mean float timestamp.

  Returns:
    A DataFrame indexed by 'seconds_remaining' with one 'float_time'
    column (the mean per group), sorted by 'float_time' ascending.
  """
  seconds = get_seconds_remaining_list(game_time_df)
  stamps = get_float_time_list(game_time_df)

  paired = pd.DataFrame(list(zip(seconds, stamps)), columns=['seconds_remaining', 'float_time'])
  averaged = paired.groupby('seconds_remaining').mean()
  return averaged.sort_values('float_time')

In [0]:
def get_seconds_remaining_from_timestamp(fl_times, sec_rems, fl_time):
  """Linearly interpolate game seconds remaining for a float timestamp.

  Args:
    fl_times: Float timestamps sorted in ascending order.
    sec_rems: Seconds-remaining values aligned with ``fl_times``.
    fl_time: The timestamp to look up.

  Returns:
    The interpolated seconds remaining as a float, or None when
    ``fl_time`` is NaN, lies outside ``[fl_times[0], fl_times[-1]]``, or
    ``fl_times`` is empty.
  """
  # NaN fails every comparison, so it would slip past the range check
  # below; reject it (and empty input) explicitly.
  if len(fl_times) == 0 or np.isnan(fl_time):
    return None

  if fl_times[0] > fl_time or fl_times[-1] < fl_time:
    return None

  # np.interp does the bracketing search and the linear interpolation
  # between the two neighbouring samples.
  xs = np.asarray(fl_times, dtype=np.float64)
  ys = np.asarray(sec_rems, dtype=np.float64)
  return float(np.interp(fl_time, xs, ys))
    

In [0]:
def get_avg_timestamp_for_tweet_batch(times):
  """Return the mean float timestamp of a batch of date-time strings.

  Each entry is converted with ``convert_datetime_str_to_float`` and the
  sum is divided by the number of entries.
  """
  count = len(times)
  total = sum(convert_datetime_str_to_float(stamp) for stamp in times)
  return total / count

In [0]:
def get_key_timestamps_from_gametime_df(gametime_df):
  """Extract the four boundary timestamps of a game.

  Halftime start and end are the first and last 'real_time' values of the
  rows whose 'quarter' is "Halftime". Game start and end are the first and
  last 'float_time' values from ``create_seconds_remaining_float_time_df``.

  Returns:
    ``(game_start, htime_start, htime_end, game_end)`` as float timestamps.
  """
  is_halftime = gametime_df['quarter'] == "Halftime"
  halftime_stamps = gametime_df[is_halftime]['real_time'].values
  htime_start = convert_datetime_str_to_float(halftime_stamps[0])
  htime_end = convert_datetime_str_to_float(halftime_stamps[-1])

  play_times = create_seconds_remaining_float_time_df(gametime_df)['float_time'].values

  return play_times[0], htime_start, htime_end, play_times[-1]

In [0]:
def get_desired_tweets(tweets_df, gs, hs, he, ge):
  """Select tweets posted during play, i.e. in either half of the game.

  Args:
    tweets_df: DataFrame whose first column is a date-time string and whose
      second column is the tweet text.
    gs: Game start timestamp (float).
    hs: Halftime start timestamp (float).
    he: Halftime end timestamp (float).
    ge: Game end timestamp (float).

  Returns:
    A DataFrame with columns 'time' (float timestamp) and 'text', holding
    the tweets strictly inside ``(gs, hs)`` or ``(he, ge)``, sorted by
    'time'. It is empty, with the same columns, when nothing matches.
  """
  records = []
  for rec in tweets_df.itertuples(index=False):
    ftime = convert_datetime_str_to_float(rec[0])
    if gs < ftime < hs or he < ftime < ge:
      records.append({'time': ftime, 'text': rec[1]})

  # Explicit columns: without them an empty result has no 'time' column
  # and sort_values raises KeyError.
  return pd.DataFrame(records, columns=['time', 'text']).sort_values('time')

In [0]:
def get_tweets_in_range(tweets_df, start, end):
  """Select tweets whose timestamp lies strictly between ``start`` and ``end``.

  Args:
    tweets_df: DataFrame whose first column is a date-time string and whose
      second column is the tweet text.
    start: Exclusive lower bound (float timestamp).
    end: Exclusive upper bound (float timestamp).

  Returns:
    A DataFrame with columns 'time' (float timestamp) and 'text', sorted by
    'time'. It is empty, with the same columns, when nothing matches.
  """
  records = []
  for rec in tweets_df.itertuples(index=False):
    ftime = convert_datetime_str_to_float(rec[0])
    if start < ftime < end:
      records.append({'time': ftime, 'text': rec[1]})

  # Explicit columns: without them an empty result has no 'time' column
  # and sort_values raises KeyError.
  return pd.DataFrame(records, columns=['time', 'text']).sort_values('time')

In [0]:
def get_ranges(df_length, batch_size):
  """Split ``df_length`` rows into ``(low, high)`` slice bounds per batch.

  Batches hold ``batch_size`` rows each. A trailing remainder smaller than
  half a batch is merged into the last full batch; otherwise it forms its
  own batch. Ranges are returned last batch first.

  Args:
    df_length: Number of rows to split.
    batch_size: Desired batch size; must be positive.

  Returns:
    A list of ``(low, high)`` tuples with ``0 <= low < high <= df_length``;
    an empty list when ``df_length`` is 0.

  Raises:
    ValueError: If ``batch_size`` is not positive.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be positive")
  if df_length <= 0:
    return []

  rem = df_length % batch_size
  low = df_length - rem

  if rem / batch_size < 0.5:
    # Small (or empty) remainder: fold it into the preceding full batch.
    low = low - batch_size

  # Fewer rows than one batch: clamp instead of emitting a negative bound.
  low = max(low, 0)

  ranges = [(low, df_length)]

  # `low` is now a multiple of batch_size; walk the full batches down to 0.
  for high in range(low, 0, -batch_size):
    ranges.append((high - batch_size, high))

  return ranges

In [0]:
def get_tweet_positivity_by_batch(tweets_df, gametime_df, batch_size=250, metric='weighted', threshold=0.5):
  """Compute tweet positivity per batch of tweets across both halves.

  Tweets from the first half (game start to halftime start) and the second
  half (halftime end to game end) are batched separately with
  ``get_ranges``. Each batch is scored with ``predict_tweets_batch`` and
  placed on the game clock at the seconds remaining that correspond to the
  batch's mean timestamp.

  Args:
    tweets_df: Tweets as returned by ``read_csv_tweets_to_df``.
    gametime_df: Game-clock table passed to
      ``get_key_timestamps_from_gametime_df``.
    batch_size: Number of tweets per batch.
    metric: Passed through to ``predict_tweets_batch``.
    threshold: Passed through to ``predict_tweets_batch``.

  Returns:
    A DataFrame with columns 'seconds_remaining', 'positivity' and
    'percent_ignored', sorted by 'seconds_remaining' descending.
  """
  gs, hs, he, ge = get_key_timestamps_from_gametime_df(gametime_df)

  fl_sr_df = create_seconds_remaining_float_time_df(gametime_df)
  fl_times = fl_sr_df['float_time'].values
  sec_rems = fl_sr_df.index

  columns = ['seconds_remaining', 'positivity', 'percent_ignored']
  records = []

  # Both halves get identical treatment; halftime tweets are excluded.
  for start, end in ((gs, hs), (he, ge)):
    half_tweets = get_tweets_in_range(tweets_df, start, end)

    for low, high in get_ranges(len(half_tweets), batch_size):
      data = half_tweets.iloc[low:high]
      if len(data) == 0:
        # An empty batch would only contribute NaN values.
        continue

      avg_time = data['time'].values.mean()
      # Pass a plain list: predict_tweets_batch may append to its input.
      positivity, pct_ignored = predict_tweets_batch(data['text'].tolist(), metric=metric, threshold=threshold)
      game_sec_rem = get_seconds_remaining_from_timestamp(fl_times, sec_rems, avg_time)
      records.append({'seconds_remaining': game_sec_rem, 'positivity': positivity, 'percent_ignored': pct_ignored})

  # Explicit columns keep the sort valid when there are no records.
  return pd.DataFrame(records, columns=columns).sort_values('seconds_remaining', ascending=False)

In [0]:
def save_plot(df_wp, team, df_pos, nickname, week, root_dir):
    """Plot win probability against tweet positivity and save it as a PNG.

    Args:
        df_wp: DataFrame with 'game_seconds_remaining' and '<team>_wp' columns.
        team: Either 'home' or 'away'; selects the win-probability column.
        df_pos: DataFrame with 'seconds_remaining' and 'positivity' columns.
        nickname: Team label used in the title and the file name.
        week: Week label used in the title and the file name.
        root_dir: Directory the PNG is written to.

    Returns:
        None. Nothing is plotted or saved when ``team`` is neither 'home'
        nor 'away'.
    """
    if team not in ['home', 'away']:
        return None

    # Start from a clean current figure.
    plt.clf()

    wp_column = f"{team}_wp"
    game_clock = df_wp['game_seconds_remaining']
    win_prob = df_wp[wp_column]

    tweet_clock = df_pos['seconds_remaining']
    positivity = df_pos['positivity']

    plt.plot(game_clock, win_prob, 'b', label='Win Probability')
    plt.plot(tweet_clock, positivity, 'r', label='Tweet Positivity')
    plt.title(f'Week {week} {nickname} Win Probability and Tweet Positivity')
    # Reversed x axis: the clock counts down from 3600 to 0.
    plt.xlim(3600, 0)
    plt.ylim(0, 1)
    plt.xlabel('Game Seconds Remaining')
    plt.ylabel('Tweet Positivity/Win Probability')
    plt.legend()

    out_path = f"{root_dir}/week-{week}-{nickname}-wp-tp.png"
    plt.savefig(out_path, bbox_inches='tight')

In [90]:
!unzip wp_data.zip

Archive:  wp_data.zip
  inflating: wp_data/week-12_LA_BAL_wp.csv  
  inflating: wp_data/week-12_NE_DAL_wp.csv  
  inflating: wp_data/week-12_SF_GB_wp.csv  
  inflating: wp_data/week-13_ATL_NO_wp.csv  
  inflating: wp_data/week-13_DAL_BUF_wp.csv  
  inflating: wp_data/week-13_DET_CHI_wp.csv  


In [0]:
def get_fname_from_path(file_path):
  """Return the part of ``file_path`` after the last ``os.sep``.

  The whole string is returned when it contains no separator; an empty
  string is returned when it ends with one.
  """
  # rfind gives -1 when there is no separator, so the slice starts at 0.
  start = file_path.rfind(os.sep) + 1
  return file_path[start:]

In [0]:
def create_tweets_csv_fname(week, home, away, team):
  """Build the tweets CSV file name for one team of a game.

  Returns:
    A name of the form 'week-<week>_<home>v<away>_<team>-tweets.csv'.
  """
  fname = f"week-{week}_{home}v{away}_{team}-tweets.csv"
  return fname

In [0]:
tweets_dirname_tok = re.compile(r"week-(\d{1,2})_([A-Z]{2,3})_([A-Z]{2,3})")

def create_wp_tp_plots_for_game(gametime_csv_path, wp_csv_path, tweets_dir, output_dir, metric='weighted', threshold=0.5):
  """Create and save the win-probability / tweet-positivity plots for a game.

  The week and the two team abbreviations are parsed from the last component
  of ``tweets_dir`` ('week-<N>_<TEAM>_<TEAM>'). The first team is plotted
  against the 'home' win probability and the second against 'away'.

  Args:
    gametime_csv_path: CSV mapping the game clock to real time.
    wp_csv_path: CSV with the win-probability data.
    tweets_dir: Directory holding the per-team tweet CSVs.
    output_dir: Directory the plots are saved to; created if missing.
    metric: Passed through to ``get_tweet_positivity_by_batch``.
    threshold: Passed through to ``get_tweet_positivity_by_batch``.

  Raises:
    ValueError: If the directory name does not match the expected pattern.
  """
  # Validate the directory name before doing any I/O. Trailing separators
  # are stripped so 'dir/' resolves to the same name as 'dir'.
  dir_name = get_fname_from_path(tweets_dir.rstrip(os.sep))
  match = tweets_dirname_tok.search(dir_name)
  if match is None:
    raise ValueError(f"Cannot parse week and teams from tweets directory name: {dir_name!r}")
  week, home, away = match.groups()

  create_output_dir(output_dir)
  gametime_df = pd.read_csv(gametime_csv_path)
  wp_df = pd.read_csv(wp_csv_path)

  for abbrev, side in ((home, 'home'), (away, 'away')):
    tweets_file = f"{tweets_dir}/{create_tweets_csv_fname(week, home, away, abbrev)}"
    tweets_df = read_csv_tweets_to_df(tweets_file)
    pos_df = get_tweet_positivity_by_batch(tweets_df, gametime_df, metric=metric, threshold=threshold)

    save_plot(wp_df, side, pos_df, abbrev, week, output_dir)

In [0]:
# One entry per game: [gametime CSV path, win-probability CSV path, tweets directory].
game_files = [['/content/58078_ATL_NO_week-13.csv', '/content/wp_data/week-13_ATL_NO_wp.csv', '/content/week-13_ATL_NO']]

In [0]:
# Generate the plots for every configured game and save them to the Drive plots folder.
for g in game_files:
  create_wp_tp_plots_for_game(g[0], g[1], g[2], '/content/drive/My Drive/Colab Notebooks/project/plots')