# Introduction
Define the problem \
Preprocess the data \
Extract features \
Prepare the data for NLP tasks

#### 1. Import neccesary libraries

In [26]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import numpy as np
import tensorflow as tf
import keras

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

from keras.layers import Embedding, Input, Dense, Lambda
from keras.models import Model

import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pandas as pd

In [27]:
!pip install pydot --quiet
!pip install gensim --quiet
!pip install tensorflow-datasets --quiet
!pip install tensorflow-text --quiet
!pip install gensim==4.3.2
!pip install scipy==1.11.4



In [28]:
data = pd.read_json("../dataset/tagged_transcripts.json")

#### 2. Exploratory Data Analysis

In [29]:
data.head(5)

Unnamed: 0,1962-houston_oilers-dallas_texans.txt,1969-chicago_bears-green_bay_packers.txt,1969-cleveland_browns-minnesota_vikings-1.txt,1969-cleveland_browns-minnesota_vikings.txt,1969-new_york_jets-baltimore_colts.txt,1970-baltimore_colts-kansas_city_chiefs.txt,1970-cleveland_browns-new_york_jets.txt,1970-dallas_cowboys-detroit_lions.txt,1970-kansas_city_chiefs-baltimore_colts.txt,1970-los_angeles_rams-minnesota_vikings-1.txt,...,2018-tampa_bay_buccaneers-dallas_cowboys.txt,2018-tampa_bay_buccaneers-detroit_lions.txt,2018-tennessee_titans-green_bay_packers.txt,2018-tennessee_titans-minnesota_vikings.txt,2018-tennessee_titans-pittsburgh_steelers.txt,2018-tennessee_titans-tampa_bay_buccaneers.txt,2018-washington_redskins-new_england_patriots.txt,2018-washington_redskins-new_york_jets.txt,2018-washington_redskins-philadelphia_eagles-1.txt,2018-washington_redskins-philadelphia_eagles.txt
teams,"[houston_oilers, dallas_texans]","[chicago_bears, green_bay_packers]","[cleveland_browns, minnesota_vikings]","[cleveland_browns, minnesota_vikings]","[new_york_jets, baltimore_colts]","[baltimore_colts, kansas_city_chiefs]","[cleveland_browns, new_york_jets]","[dallas_cowboys, detroit_lions]","[kansas_city_chiefs, baltimore_colts]","[los_angeles_rams, minnesota_vikings]",...,"[tampa_bay_buccaneers, dallas_cowboys]","[tampa_bay_buccaneers, detroit_lions]","[tennessee_titans, green_bay_packers]","[tennessee_titans, minnesota_vikings]","[tennessee_titans, pittsburgh_steelers]","[tennessee_titans, tampa_bay_buccaneers]","[washington_redskins, new_england_patriots]","[washington_redskins, new_york_jets]","[washington_redskins, philadelphia_eagles]","[washington_redskins, philadelphia_eagles]"
transcript,gilson well defend the goal on your left theyl...,cbs television sports presents the national fo...,the nfl today brought to you by the foundation...,the nfl today brought to you by the foundation...,&gt;&gt; nbc sports presents the third nflafl ...,biochemistry was almost an that i doing it cam...,from municipal stadium in cleveland ohio to po...,a long time ago ford motor company had a bette...,from memorial stadium in baltimore maryland na...,from metropolitan stadium in bloomington minne...,...,you welcomes you to the following presentation...,well the rain continues to fall but we have fo...,so the first preseason game a couple weeks at ...,time is running out for some opportunity has c...,heinz field and the new head coach of the tita...,tennessee titans preseason football is brought...,the patriots take the field and football retur...,espn welcomes you to the following presentatio...,and there is nick falls the eagle fans at atte...,skins and eagles theyve been division rivals d...
year,1962,1969,1969,1969,1969,1970,1970,1970,1970,1970,...,2018,2018,2018,2018,2018,2018,2018,2018,2018,2018


In [30]:
nltk.download('word2vec_sample')

word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

wvmodel = KeyedVectors.load_word2vec_format(datapath(word2vec_sample), binary=False)

[nltk_data] Downloading package word2vec_sample to
[nltk_data]     /Users/tommayer/nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


In [31]:
len(wvmodel)

43981

So this dataset has 43,981 games over the years.

#### 3. Data Preprocessing

##### 3.1: Preprocess text:
- remove punctuation
- replace with space 
- also lowercase 
- transpose it too (with a few simple pandas operations to keep order)

In [32]:
data_transposed = data.T.reset_index().rename(columns={'index': 'game_id'}) # pd operation
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = text.split()
    return tokens

# Apply preprocessing to each transcript
data_transposed['tokens'] = data_transposed['transcript'].apply(preprocess_text)

In [33]:
data_transposed.head()

Unnamed: 0,game_id,teams,transcript,year,tokens
0,1962-houston_oilers-dallas_texans.txt,"[houston_oilers, dallas_texans]",gilson well defend the goal on your left theyl...,1962,"[gilson, well, defend, the, goal, on, your, le..."
1,1969-chicago_bears-green_bay_packers.txt,"[chicago_bears, green_bay_packers]",cbs television sports presents the national fo...,1969,"[cbs, television, sports, presents, the, natio..."
2,1969-cleveland_browns-minnesota_vikings-1.txt,"[cleveland_browns, minnesota_vikings]",the nfl today brought to you by the foundation...,1969,"[the, nfl, today, brought, to, you, by, the, f..."
3,1969-cleveland_browns-minnesota_vikings.txt,"[cleveland_browns, minnesota_vikings]",the nfl today brought to you by the foundation...,1969,"[the, nfl, today, brought, to, you, by, the, f..."
4,1969-new_york_jets-baltimore_colts.txt,"[new_york_jets, baltimore_colts]",&gt;&gt; nbc sports presents the third nflafl ...,1969,"[gtgt, nbc, sports, presents, the, third, nfla..."


Now, we have a column of tokens that we can use to get the game commentary embeddings.  We also have each game as a different row making it easier to work with our data.

Let's get the game commentary embeddings. Essentially, computers can do math much more easily with numbers than with text.  So, we'll convert the text into numbers saving compute with a pretrained model (word2vec that I called wvmodel).

In [34]:
def get_document_embedding(tokens, model):
    # Filter tokens to only those in the model's vocabulary
    valid_tokens = [token for token in tokens if token in model.key_to_index] # or could 
    if not valid_tokens:
        return np.zeros(model.vector_size)
    # Average the word vectors
    return np.mean([model[token] for token in valid_tokens], axis=0) # average!

# Apply to each game transcript
data_transposed['doc_embedding'] = data_transposed['tokens'].apply(
    lambda tokens: get_document_embedding(tokens, wvmodel) 
) # apply the function to each row in my df

I have averaged the word embeddings in each document to get a single embedding for the document.  This gets me the 'average' word embedding for each game.  This is a simple yet powerful baseline model.  It can show us the semantic meaning for each game.

Now, we can create the embedding matrix.  This converts the w2v model, that we are using already, into a matrix that we can use for our model. Then, we build a vocabulary dictionary that we can use to map the words to their corresponding indices.  We cannot forget to add the unknown token to the vocabulary dictionary too.

In [35]:
EMBEDDING_DIM = len(wvmodel['university'])      # we know... it's 300

# initialize embedding matrix and word-to-id map:
embedding_matrix = np.zeros((len(wvmodel) + 1, EMBEDDING_DIM))
vocab_dict = {}

# build the embedding matrix and the word-to-id map:
for i, word in enumerate(wvmodel.index_to_key):
    embedding_vector = wvmodel[word]

    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        vocab_dict[word] = i

# we can use the last index at the end of the vocab for unknown tokens
vocab_dict['[UNK]'] = len(vocab_dict)

In [36]:
# take a peek at the embedding matrix
embedding_matrix.shape


(43982, 300)

In [37]:
# and take a look at the first embedding vector, a game from 1962!
embedding_matrix[0]

array([ 0.0891758 ,  0.121832  , -0.0671959 ,  0.0477279 , -0.013659  ,
       -0.0671959 ,  0.0640559 , -0.0331269 , -0.0364239 ,  0.00565199,
       -0.017113  , -0.10362   ,  0.0552639 , -0.00706499, -0.0643699 ,
        0.00753598, -0.0866638 ,  0.0492979 , -0.0816398 , -0.0910598 ,
        0.00416049, -0.0681379 ,  0.0568339 ,  0.0524379 ,  0.00143262,
       -0.01256   , -0.0775578 ,  0.0960838 ,  0.0555779 , -0.0734758 ,
       -0.013659  , -0.0376799 , -0.0489839 , -0.0470999 , -0.102992  ,
        0.00612299,  0.0452159 , -0.0356389 ,  0.0665679 ,  0.0747318 ,
        0.0759878 , -0.0248059 ,  0.013031  , -0.00490624,  0.00733973,
       -0.0351679 ,  0.00639774, -0.00370912,  0.0835238 ,  0.0477279 ,
       -0.0885478 , -0.0929438 ,  0.0634279 ,  0.0741038 ,  0.00561274,
       -0.0192325 ,  0.0803838 ,  0.00580899,  0.0923158 ,  0.0700219 ,
        0.0266899 ,  0.0788138 , -0.0634279 , -0.0470999 ,  0.0835238 ,
       -0.0483559 ,  0.0574619 ,  0.0411339 ,  0.00455299,  0.07

##### 3.2 Which broadcast was each game on? Search and add the feature.

I need to identiy the TV network broadcast in which each game takes place.  By network contract, the commentators must present the network names during the broadcast.  I will search the for the broacast in the tokenized game transcripts.

In [38]:
# Here are the channels that broadcast NFL games
channels = ['NBC', 'CBS', 'ESPN', 'FOX', 'ABC', 'NFL Network']

##### 3.2.1 Simple Channel Mention Model
I will start with a simple method for extracting the channel.  When broadcasting the game, the commentators must present the channel multiple times promoting which station is in charge of the game's presentation.  This model merely counts the mentions of the possibile broadcasts and labels the game with the highest mentioned channel.

In [39]:
# pandas rename col for now
data_transposed = data_transposed.rename(columns={'broadcaster_simpleModel': 'broadcaster'})

# If you haven't run the preprocessing steps yet, let's do that first:
def get_most_mentioned_network(transcript):
    networks = {
        'CBS': transcript.upper().count('CBS'),
        'FOX': transcript.upper().count('FOX'),
        'NBC': transcript.upper().count('NBC'),
        'ESPN': transcript.upper().count('ESPN'),
        'ABC': transcript.upper().count('ABC')
    }
    
    most_mentioned = max(networks.items(), key=lambda x: x[1])
    
    # Check for ties
    max_count = most_mentioned[1]
    if max_count == 0:
        return 'Unknown'
    
    tied_networks = [network for network, count in networks.items() if count == max_count]
    if len(tied_networks) > 1:
        return 'Tie: ' + '/'.join(tied_networks)
    
    if most_mentioned[1] > 0:
        return most_mentioned[0]
    return 'Unknown'

# Add broadcaster column if it doesn't exist
if 'broadcaster' not in data_transposed.columns:
    data_transposed['broadcaster'] = data_transposed['transcript'].apply(get_most_mentioned_network)

# Verify the data
print("Available columns:", data_transposed.columns)
print("\nBroadcaster distribution:")
print(data_transposed['broadcaster'].value_counts())

Available columns: Index(['game_id', 'teams', 'transcript', 'year', 'tokens', 'doc_embedding',
       'broadcaster'],
      dtype='object')

Broadcaster distribution:
broadcaster
ESPN             397
FOX              291
CBS              263
Unknown          220
ABC              166
NBC               99
Tie: ESPN/ABC     10
Tie: FOX/ESPN      4
Tie: FOX/NBC       2
Tie: NBC/ESPN      1
Tie: FOX/ABC       1
Tie: CBS/NBC       1
Name: count, dtype: int64


Remove the games that the broadcast is unknown or there is a tie.

In [40]:
data_transposed = data_transposed[data_transposed['broadcaster'] != 'Unknown']

In [41]:
data_transposed = data_transposed[~data_transposed['broadcaster'].str.contains('Tie')]

In [42]:
data_transposed.head()

Unnamed: 0,game_id,teams,transcript,year,tokens,doc_embedding,broadcaster
0,1962-houston_oilers-dallas_texans.txt,"[houston_oilers, dallas_texans]",gilson well defend the goal on your left theyl...,1962,"[gilson, well, defend, the, goal, on, your, le...","[0.02728455, 0.016727475, 0.0260244, 0.0380681...",ABC
1,1969-chicago_bears-green_bay_packers.txt,"[chicago_bears, green_bay_packers]",cbs television sports presents the national fo...,1969,"[cbs, television, sports, presents, the, natio...","[0.030220592, 0.014963325, 0.02284711, 0.03831...",CBS
2,1969-cleveland_browns-minnesota_vikings-1.txt,"[cleveland_browns, minnesota_vikings]",the nfl today brought to you by the foundation...,1969,"[the, nfl, today, brought, to, you, by, the, f...","[0.027876755, 0.016259313, 0.022658505, 0.0399...",CBS
3,1969-cleveland_browns-minnesota_vikings.txt,"[cleveland_browns, minnesota_vikings]",the nfl today brought to you by the foundation...,1969,"[the, nfl, today, brought, to, you, by, the, f...","[0.028167814, 0.016339412, 0.022509856, 0.0396...",CBS
4,1969-new_york_jets-baltimore_colts.txt,"[new_york_jets, baltimore_colts]",&gt;&gt; nbc sports presents the third nflafl ...,1969,"[gtgt, nbc, sports, presents, the, third, nfla...","[0.031091398, 0.015320361, 0.02407883, 0.03909...",NBC


In [43]:
data_transposed.to_json("../dataset/preprocessed_data_full.json")

#### Where we started:
A dataset with unique game ids as a header, the transcript, the teams, and the year.

#### Where the data is now:
A unique game id columns, the two teams, a tokenized transcript with NLP preprocessing tactics implemented, document embeddings for each word, and the broadcast of each game.

### Subset of the data to work with for testing

In [44]:
total_games = data.shape[1]  # number of columns
sample_size = int(total_games * 0.05)
print(f"Total number of games: {total_games}")
print(f"5% of games: {sample_size}")

# Take a random sample of 5% of the games
np.random.seed(42)  # Set a random seed for reproducibility
sampled_columns = np.random.choice(data.columns, size=sample_size, replace=False)
sampled_data = data[sampled_columns]

Total number of games: 1455
5% of games: 72


In [45]:
sample_size = int(total_games * 0.1)
print(f"Total number of games: {total_games}")
print(f"Subsetting with 10% of games: {sample_size}")

Total number of games: 1455
Subsetting with 10% of games: 145


In [46]:
total_games = data_transposed.shape[0]
print(f"Total number of games: {total_games}")

Total number of games: 1216


Choose a random selection of 10% of the dataframe's rows.

In [47]:
np.random.seed(10)
sampled_indices = np.random.choice(data_transposed.index, size=sample_size, replace=False)
sample_data = data_transposed.loc[sampled_indices]

In [48]:
sample_data.head()

Unnamed: 0,game_id,teams,transcript,year,tokens,doc_embedding,broadcaster
31,1972-oakland_raiders-new_york_jets.txt,"[oakland_raiders, new_york_jets]",from the goodyear blimp columbia on a beautifu...,1972,"[from, the, goodyear, blimp, columbia, on, a, ...","[0.03007982, 0.017491112, 0.02206675, 0.040843...",ABC
561,2007-michigan-usc-1.txt,"[michigan, usc]",seven a powerful experiment that could change ...,2007,"[seven, a, powerful, experiment, that, could, ...","[0.027472984, 0.017659254, 0.023996452, 0.0405...",ABC
1101,2015-pittsburgh_steelers-new_england_patriots.txt,"[pittsburgh_steelers, new_england_patriots]",i never thought the trophy could feel as speci...,2015,"[i, never, thought, the, trophy, could, feel, ...","[0.02733554, 0.016897388, 0.023208328, 0.04088...",FOX
86,1978-arkansas-kansas.txt,"[arkansas, kansas]",football team the record indicates how good a ...,1978,"[football, team, the, record, indicates, how, ...","[0.03179858, 0.02170769, 0.022047535, 0.028453...",ABC
1142,2016-baltimore_ravens-oakland_raiders-2.txt,"[baltimore_ravens, oakland_raiders]",thousand par with two touchdown passes in the ...,2016,"[thousand, par, with, two, touchdown, passes, ...","[0.028677901, 0.016341358, 0.020990586, 0.0405...",CBS


In [50]:
sample_data.to_json("../dataset/preprocessed_data_subset.json")