This notebook will comprise of an analysis of errors that the models made.

In [150]:
# Import Libraries
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [151]:
# Import Files
train_df = pd.read_csv("../data/train.csv")
val_df = pd.read_csv("../data/validation.csv")
test_df = pd.read_csv("../data/test.csv")

bnb = pd.read_csv("bnb_results.csv")
lr = pd.read_csv("lr_results.csv")
cnn = pd.read_csv("cnn_results.csv")
bert = pd.read_csv("bert_results.csv")

### Get Models

In [None]:
# Recreate the NB
nb_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), binary=True)
basic_vectorizer = CountVectorizer()
train_x = nb_vectorizer.fit_transform(train_df["Stemmed"])
train_y = train_df["Result_Bin"]

nb_model = BernoulliNB()
nb_model.fit(train_x, train_y)

In [153]:
# Tool for finding probabilities associated with W/L
def find_prob_nb(text, label):
    words = process_text(text).split()
    print(words)
    evidence = dict()
    df = train_df[train_df["Result"] == label]
    for word in words:
        word_count = train_df['Stemmed'].str.contains(word, case=False).sum()
        evidence[word] = word_count/len(train_df)

    likelihood = dict()
    for word in words:
        word_count = df['Stemmed'].str.contains(word, case=False).sum()
        likelihood[word] = word_count/len(df)
    prior = len(df)/len(train_df)
    
    ratio = dict()

    for key, value in likelihood.items():
        if key in evidence:
            ratio[key] = value / evidence[key]
    post = dict()
    post = {key: value * prior for key, value in ratio.items()}

    print(post)
        
    

In [154]:
lr_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))
train_x = lr_vectorizer.fit_transform(train_df["Stemmed"])
train_y = train_df["Result_Bin"]

lr_model = LogisticRegression()
lr_model.fit(train_x, train_y)

In [155]:
# Gets Coefficients for LR
def return_coef_lr(input):

    model = lr_model
    text = process_text(input)
    text_vec = lr_vectorizer.transform([text])

    vocabulary = lr_vectorizer.get_feature_names_out()
    coefficients = lr_model.coef_[0]

    
    word_coefficient_map = {word: coef for word, coef in zip(vocabulary, coefficients)}

   
    for word in text.split():
        if word in word_coefficient_map:
            print(f"Word: {word}, Coefficient: {word_coefficient_map[word]}")
        else:
            print(f"Word: {word}, Coefficient: 0")  

In [156]:
# Processing Text

def process_text(document):
    # Tokenize the document
    tokens = document.split()
    tokens = [re.sub(r'^\W+|\W+$', '', token) for token in tokens]
    tokens = [token.lower() for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stem the tokens
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Return the processed text
    return ' '.join(stemmed_tokens)

In [157]:
bnb.head()

Unnamed: 0.1,Unnamed: 0,Comment,Result,Comment_Adj,No_Stop,Stemmed,Result_Bin,Predicted_Result,Predicted_Probability
0,0,I feel like we became too dependent on our rec...,Loss,i feel like we became too dependent on our rec...,feel like became dependent recent late-game he...,feel like becam depend recent late-gam heroic ...,0,1,0.578302
1,1,I like it even more when I don't think we're g...,Win,i like it even more when i don't think we're g...,like even n't think 're gon na win win anyways,like even n't think 're gon na win win anyway,1,1,0.999777
2,2,This game confirmed everything I already knew ...,Loss,this game confirmed everything i already knew ...,game confirmed everything already knew bobby ’...,game confirm everyth alreadi knew bobbi ’ play...,0,0,0.30015
3,3,I’m in shock as a Blazers fan. I know for a fa...,Loss,i’m in shock as a blazers fan i know for a fac...,’ shock blazers fan know fact lillard play way...,’ shock blazer fan know fact lillard play way ...,0,1,0.999483
4,4,Can we please change the banner to our current...,Win,can we please change the banner to our current...,please change banner current bucks roster ’ wa...,pleas chang banner current buck roster ’ want ...,1,1,0.999648


In [158]:
lr.head()

Unnamed: 0.1,Unnamed: 0,Comment,Result,Comment_Adj,No_Stop,Stemmed,Result_Bin,Predicted_Result,Predicted_Probability
0,0,I feel like we became too dependent on our rec...,Loss,i feel like we became too dependent on our rec...,feel like became dependent recent late-game he...,feel like becam depend recent late-gam heroic ...,0,0,0.225801
1,1,I like it even more when I don't think we're g...,Win,i like it even more when i don't think we're g...,like even n't think 're gon na win win anyways,like even n't think 're gon na win win anyway,1,1,0.911172
2,2,This game confirmed everything I already knew ...,Loss,this game confirmed everything i already knew ...,game confirmed everything already knew bobby ’...,game confirm everyth alreadi knew bobbi ’ play...,0,0,0.305607
3,3,I’m in shock as a Blazers fan. I know for a fa...,Loss,i’m in shock as a blazers fan i know for a fac...,’ shock blazers fan know fact lillard play way...,’ shock blazer fan know fact lillard play way ...,0,1,0.705575
4,4,Can we please change the banner to our current...,Win,can we please change the banner to our current...,please change banner current bucks roster ’ wa...,pleas chang banner current buck roster ’ want ...,1,1,0.95809


In [159]:
cnn.head()

Unnamed: 0.1,Unnamed: 0,Comment,Result,Comment_Adj,No_Stop,Stemmed,Result_Bin,Actual_Label,Predicted_Label
0,0,I feel like we became too dependent on our rec...,Loss,i feel like we became too dependent on our rec...,feel like became dependent recent late-game he...,feel like becam depend recent late-gam heroic ...,0,0,1
1,1,I like it even more when I don't think we're g...,Win,i like it even more when i don't think we're g...,like even n't think 're gon na win win anyways,like even n't think 're gon na win win anyway,1,1,1
2,2,This game confirmed everything I already knew ...,Loss,this game confirmed everything i already knew ...,game confirmed everything already knew bobby ’...,game confirm everyth alreadi knew bobbi ’ play...,0,0,0
3,3,I’m in shock as a Blazers fan. I know for a fa...,Loss,i’m in shock as a blazers fan i know for a fac...,’ shock blazers fan know fact lillard play way...,’ shock blazer fan know fact lillard play way ...,0,0,0
4,4,Can we please change the banner to our current...,Win,can we please change the banner to our current...,please change banner current bucks roster ’ wa...,pleas chang banner current buck roster ’ want ...,1,1,1


In [160]:
bert.head()

Unnamed: 0.1,Unnamed: 0,Comment,Result,Comment_Adj,No_Stop,Stemmed,Result_Bin,Predicted
0,0,I feel like we became too dependent on our rec...,Loss,i feel like we became too dependent on our rec...,feel like became dependent recent late-game he...,feel like becam depend recent late-gam heroic ...,0,1
1,1,I like it even more when I don't think we're g...,Win,i like it even more when i don't think we're g...,like even n't think 're gon na win win anyways,like even n't think 're gon na win win anyway,1,1
2,2,This game confirmed everything I already knew ...,Loss,this game confirmed everything i already knew ...,game confirmed everything already knew bobby ’...,game confirm everyth alreadi knew bobbi ’ play...,0,0
3,3,I’m in shock as a Blazers fan. I know for a fa...,Loss,i’m in shock as a blazers fan i know for a fac...,’ shock blazers fan know fact lillard play way...,’ shock blazer fan know fact lillard play way ...,0,0
4,4,Can we please change the banner to our current...,Win,can we please change the banner to our current...,please change banner current bucks roster ’ wa...,pleas chang banner current buck roster ’ want ...,1,0


## Evaluating False Negatives for LR, BERT, and CNN

In [161]:
bnb_tp = bnb[(bnb["Result"] == "Win") & (bnb["Predicted_Result"] == 1)]
lr_fn = lr[(lr["Result"] == "Win") & (lr["Predicted_Result"] == 0)]
cnn_fn = cnn[(cnn["Result"] == "Win") & (cnn["Predicted_Label"] == 0)]
bert_fn = bert[(bert["Result"] == "Win") & (bert["Predicted"] == 0)]

In [162]:
bnb_comments = set(bnb_tp["Comment"])
lr_comments = set(lr_fn["Comment"])
cnn_comments = set(cnn_fn["Comment"])
bert_comments = set(bert_fn["Comment"])

common_comments = lr_comments.intersection(cnn_comments, bert_comments, bnb_comments)

for i, comment in enumerate(common_comments):
    if i < 4:
        print(comment)
    else:
        break

Yep people still calling for trades and firing
Dame’s game is really unbelievable… He’s the best logo shooter of all time statistically. If you come out too far, he’ll explode right past you and shoot or get a dunk/layup/foul. If you grab at him, he’s the best freethrow shooter in the league this season (and headed to 4th career all-time). If your teammate comes over to help, he is an elite playmaker and will make the right pass more often than not. He’s like prime Harden but much smaller, better at shooting, and way more clutch.
on ESPN they said could be the next game or the game after next
Malik is playing well, sending him to the bench now will throw him off


In [163]:
target_comment = "Malik is playing well, sending him to the bench now will throw him off"
return_coef_lr(target_comment)

Word: malik, Coefficient: -0.5406748829539177
Word: play, Coefficient: -0.056864372549471444
Word: well, Coefficient: -0.24822874624598537
Word: send, Coefficient: 0.4619140404395336
Word: bench, Coefficient: -0.3270444798255762
Word: throw, Coefficient: -0.0845347188654442


In [164]:
find_prob_nb(target_comment, "Win")

['malik', 'play', 'well', 'send', 'bench', 'throw']
{'malik': 0.43478260869565216, 'play': 0.4826001313197636, 'well': 0.4766666666666667, 'send': 0.6428571428571427, 'bench': 0.5251396648044693, 'throw': 0.5132743362831859}


## Evaluating False Positives

In [165]:
bnb_fp = bnb[(bnb["Result"] == "Loss") & (bnb["Predicted_Result"] == 1)].sort_values("Predicted_Probability", ascending = False)
lr_tn = lr[(lr["Result"] == "Loss") & (lr["Predicted_Result"] == 0)]
cnn_tn = cnn[(cnn["Result"] == "Loss") & (cnn["Predicted_Label"] == 0)]
bert_tn = bert[(bert["Result"] == "Loss") & (bert["Predicted"] == 0)]

In [166]:
bnb_comments = set(bnb_fp["Comment"])
lr_comments = set(lr_tn["Comment"])
cnn_comments = set(cnn_tn["Comment"])
bert_comments = set(bert_tn["Comment"])

common_comments = lr_comments.intersection(cnn_comments, bert_comments, bnb_comments)

for i, comment in enumerate(common_comments):
    if i < 10:
        print(comment)
    else:
        break

None of us know what was at the root of the tension between these guys. They might not even understand it themselves.
Dame is checked out and is perpetually lazy, apathetic, and downright stupid on the court. It’s miserable to watch. I miss Jrue, and Khris is better than Dame right now.
Don’t let a regular season game in November fool you if both teams matched up in the playoffs the Bucks beat this team in 5
2 FOR 19 FROM 3 for Lopez/Dame/Khris…unacceptable
I’ll do u one better. The guy hasn’t made it out of the 2nd round since the big three celtics.
The Bucks have played the easiest schedule in the NBA so far by a pretty decent margin too.
Doc has to.be trolling us right?  His post game press conference he talked about getting Pat more minutes so he's ready and confident down the stretch.
Both Pacers and Heat have the shooters to have decent odds of a hot streak from 3 taking some games they shouldnt.
We won’t see the second round lmao
If you blame the bench, you don't know basketball

In [171]:
target_comment = "Dame is checked out and is perpetually lazy, apathetic, and downright stupid on the court. It’s miserable to watch. I miss Jrue, and Khris is better than Dame right now"

return_coef_lr(target_comment)

Word: dame, Coefficient: -0.1700790422176605
Word: check, Coefficient: 0.24744862055832584
Word: perpetu, Coefficient: -0.027956271452183712
Word: lazi, Coefficient: -0.23844876824699507
Word: apathet, Coefficient: 0
Word: downright, Coefficient: 0
Word: stupid, Coefficient: -0.6631070629775302
Word: court, Coefficient: 0.146007882187147
Word: it’, Coefficient: 0
Word: miser, Coefficient: 0.11285656253966668
Word: watch, Coefficient: -0.25341505528863745
Word: miss, Coefficient: -0.4716422624688569
Word: jrue, Coefficient: -0.33698888558356804
Word: khri, Coefficient: 0.34432296915886257
Word: better, Coefficient: 0.15519712399002292
Word: dame, Coefficient: -0.1700790422176605
Word: right, Coefficient: -0.2621512216885956


In [170]:
find_prob_nb(target_comment, "Loss")
result = bnb[bnb['Comment'].str.contains(target_comment)]
result

['dame', 'check', 'perpetu', 'lazi', 'apathet', 'downright', 'stupid', 'court', 'it’', 'miser', 'watch', 'miss', 'jrue', 'khri', 'better', 'dame', 'right']
{'dame': 0.4727272727272728, 'check': 0.31034482758620685, 'perpetu': 0.49999999999999994, 'lazi': 0.49999999999999994, 'apathet': nan, 'downright': nan, 'stupid': 0.6086956521739131, 'court': 0.4726027397260274, 'it’': nan, 'miser': 0.5555555555555556, 'watch': 0.541958041958042, 'miss': 0.5688888888888889, 'jrue': 0.58, 'khri': 0.40449438202247184, 'better': 0.44941176470588234, 'right': 0.5503875968992248}


  ratio[key] = value / evidence[key]


Unnamed: 0.1,Unnamed: 0,Comment,Result,Comment_Adj,No_Stop,Stemmed,Result_Bin,Predicted_Result,Predicted_Probability
1282,1282,"Dame is checked out and is perpetually lazy, a...",Loss,dame is checked out and is perpetually lazy ap...,dame checked perpetually lazy apathetic downri...,dame check perpetu lazi apathet downright stup...,0,1,0.995509
