In [1]:
import pandas as pd
df = pd.read_csv("Data/Actual_Quarter.csv")

In [2]:
import nltk



In [3]:
def chunk_text(text, max_length=510):  # Using 510 to allow space for special tokens
    # Tokenize the text into words
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        if current_length + len(word.split()) <= max_length:
            current_chunk.append(word)
            current_length += len(word.split()) + 1  # +1 for the space
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word.split()) + 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


In [111]:
import re
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans


def preprocess(plays):
    processed_plays = []
    for text in plays:
        events = text.split(':')
        cleaned_events = [re.sub(r'^\d+\s', '', event).strip() for event in events if event]
        processed_plays.extend(cleaned_events)
    return processed_plays

def encode_plays(processed_plays, tokenizer, model, n_components=None):
    enc_plays = []
    with torch.no_grad():
        for play in processed_plays:
            inputs = tokenizer(play, return_tensors='pt', padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            enc_plays.append(embeddings)
    
    all_embeddings = np.vstack(enc_plays)
    
    # Automatically adjust n_components if not set
    if n_components is None or n_components > min(all_embeddings.shape):
        n_components = min(all_embeddings.shape) - 1
    
    # Apply PCA only if it makes sense to do so
    if n_components > 0:
        pca = PCA(n_components=n_components)
        reduced_embeddings = pca.fit_transform(all_embeddings)
    else:
        reduced_embeddings = all_embeddings  # Skip PCA if n_components <= 0
    
    return reduced_embeddings


def cluster_and_summarize(plays, num_clusters=5):
    # Ensure number of clusters is within the desired range for summary length
    num_clusters = max(4, min(num_clusters, len(plays), 8))  # Limits clusters to 4-8

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertModel.from_pretrained('distilbert-base-uncased')  

    processed_plays = preprocess(plays)
    reduced_embeddings = encode_plays(processed_plays, tokenizer, model, n_components=64)  # Example component reduction
    
    kmeans = MiniBatchKMeans(n_clusters=num_clusters)
    kmeans.fit(reduced_embeddings)
    labels = kmeans.labels_
    centers = kmeans.cluster_centers_

    summary = []
    for i in range(num_clusters):
        cluster_indices = np.where(labels == i)[0]
        if not cluster_indices.size:
            continue
        cluster_embeddings = [reduced_embeddings[j] for j in cluster_indices]
        center = centers[i]
        distances = np.linalg.norm(cluster_embeddings - center, axis=1)
        closest_index = cluster_indices[np.argmin(distances)]
        summary.append(processed_plays[closest_index])

    # Further refine the summary if needed
    summary = ' '.join(summary).split('. ')[:8]  # Ensure only up to 8 sentences
    return '. '.join(summary) + '.' if summary else "Unable to generate a summary."

# Example usage
plays = [
    '695 N. Miroti misses 2-pt layup from 2 ft:694 Defensive rebound by K. Love:683 K. Love makes 2-pt dunk at rim (assist by L. James):671 D. Rose misses 2-pt jump shot from 3 ft (block by T. Mozgov):669 Defensive rebound by T. Mozgov:663 M. Williams makes 2-pt jump shot from 21 ft'
]
summary = cluster_and_summarize(plays, num_clusters=5)
print("Summary:", summary)



Summary: N. Miroti misses 2-pt layup from 2 ft K. Love makes 2-pt dunk at rim (assist by L. James) Defensive rebound by K. Love M. Williams makes 2-pt jump shot from 21 ft.


In [113]:
plays = df["CleanGameText"].iloc[0:4].tolist()

plays

['N. Miroti misses 2pt layup from  ft.  Defensive rebound by K. Love.  K. Love makes 2pt dunk at rim assist by L. James.  D. Rose misses 2pt jump shot from  ft block by T. Mozgov.  Defensive rebound by T. Mozgov.  M. Williams makes 2pt jump shot from  ft.  P. Gasol misses 2pt jump shot from  ft.  Defensive rebound by L. James.  L. James makes 2pt jump shot from  ft.  Personal foul by J. Smith drawn by T. Snell.  N. Miroti misses 2pt layup from  ft.  Offensive rebound by N. Miroti.  N. Miroti makes 2pt layup from  ft.  L. James misses 2pt jump shot from  ft.  Defensive rebound by N. Miroti.  D. Rose makes 2pt layup from  ft.  L. James misses 2pt jump shot from  ft.  Offensive rebound by T. Mozgov.  T. Mozgov makes 2pt layup from  ft.  D. Rose misses 2pt layup from  ft.  Defensive rebound by M. Williams.  L. James makes 2pt dunk at rim assist by M. Williams.  J. Butler misses 2pt jump shot from  ft.  Defensive rebound by K. Love.  K. Love misses 2pt layup from  ft.  Offensive rebound by 

Plays must be in a list format

In [107]:
summary = cluster_and_summarize(plays, num_clusters=3)

In [109]:
summary

'Turnover by D. Rose bad pass.  steal by L. James.  Turnover by L. James bad pass.  steal by P. Gasol.'

THis takes too long

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df