# Text Final Modeling Mapping Scores to Votes

Import libraries

In [209]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil import parser
import re
import os

Process to learn twitter data:
First, understand there are three groups of voter: 
1. those without twitter or tweet in none english languages 
2. those with english twitter, and used to be a voter
3. those with english twitter, and are new voters this yer

For case 1, nothing we could do, we have to drop these.
For case 3, we will also drop them as them might not provide individual insights for 2020 votes. 
The below process is for case 2. Also keep in mind that there is 1 NBA fan vote that we are not predicting here.



## Data Processing

In [522]:
tweets = pd.read_csv('all_tweets_with_added_columns.csv')

In [237]:
tweets.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,user_id,username,date,time,tweet,text,textblob,nltk_tweet,...,Donovan Mitchell,Kyle Lowry,Damian Lillard,DeMar DeRozan,Isaiah Thomas,John Wall,LaMarcus Aldridge,Paul George,Stephen Curry,Victor Oladipo
0,0,0,35993422,adaimiel,2020-03-08,03:43:38,😳😳,-,0.0,-1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,35993422,adaimiel,2020-03-08,01:19:22,"Con 43, mirando así al defensor: https://twitt...",con mirando as al defensor,0.0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,2,35993422,adaimiel,2020-03-07,06:15:49,Yo lo siento mucho también. Saludos y gracias ...,yo lo siento mucho tambin saludos gracias por ...,0.0,-1,...,0,0,0,0,0,0,0,0,0,0
3,3,3,35993422,adaimiel,2020-03-07,05:55:16,"Para la boca, ¿mano o mascarilla?, mi opinión ...",para la boca mano mascarilla mi opinin de esta...,0.0,-1,...,0,0,0,0,0,0,0,0,0,0
4,4,4,35993422,adaimiel,2020-03-07,03:01:44,"Se cambió posteriormente, por lo que yo sé.",se cambi posteriormente por lo que yo,0.0,-1,...,0,0,0,0,0,0,0,0,0,0


In [216]:
voter_info = pd.read_excel('Voter Twitter Handle List.xlsx')
vote2017 = pd.read_excel('2016-17-Kia-NBA-Most-Valuable-Player-of-the-Year-Award-1.xlsx')
vote2018 = pd.read_excel('2017-18-Kia-NBA-Most-Valuable-Player-Voter-Selections.xlsx')
vote2019 = pd.read_excel('2018-19-NBA-Most-Valuable-Player-Award-Voter-Selections.xlsx')

In [217]:
voter_info.head()

Unnamed: 0,Name,Media,Twitter Handle,Note,AS 2020,"MVP 2019 (Jun 26, 2018 - Jun 23, 2019)","MVP 2018 (Jun 27, 2017 - Jun 24, 2018)","MVP 2017 ( May 11, 2016 - Jun 25, 2017)",IsNew
0,Scott Agness,The Athletic,ScottAgness,,Y,Y,,,N
1,Marv Albert,Turner,,Old. Important,Y,Y,Y,,N
2,Sam Amick,The Athletic,sam_amick,,Y,Y,Y,Y,N
3,Jason Anderson,Sacramento Bee,JandersonSacBee,,Y,Y,,,N
4,Greg Anthony,Turner,GregAnthony50,,Y,Y,Y,,N


In [218]:
#Seperate voter groups in case needed 
#those without twitter
voters_no_twitter = voter_info[voter_info['Twitter Handle'].isna()]
#those who never voted before
voters_new = voter_info[voter_info['IsNew'] == 'Y']
#those who have voted before
voters_train = voter_info[voter_info['IsNew'] == 'N']

Process vote data - standardize the vote names and assign vote years 

In [219]:
#standardize vote names
def standardize_vote_player_name(vote):
    for i in range(2, vote.shape[1]):
        vote.iloc[:,i] = vote.iloc[:,i].str.replace(',', '')
        vote.iloc[:,i] = vote.iloc[:,i].str.split()
        vote.iloc[:,i] = vote.iloc[:,i].apply(lambda x: x[1] + ' ' + x[0])

In [220]:
for df in [vote2017, vote2018, vote2019]:
    standardize_vote_player_name(df)
    df.columns = ['Voter', 'Affiliation', 'top1', 'top2', 'top3', 'top4', 'top5']

In [478]:
#add year
vote2017['year'] = 2017
vote2018['year'] = 2018
vote2019['year'] = 2019

votes = vote2017.append(vote2018, ignore_index = True)
votes = votes.append(vote2019, ignore_index = True)


In [479]:
votes

Unnamed: 0,Voter,Affiliation,top1,top2,top3,top4,top5,year
0,Ailene Voisin,Sacramento Bee,Russell Westbrook,James Harden,Kawhi Leonard,LeBron James,Stephen Curry,2017
1,Andrew Sharp,Sports Illustrated,Russell Westbrook,Kawhi Leonard,James Harden,LeBron James,Stephen Curry,2017
2,Anthony Slater,Bay Area News Group,Russell Westbrook,James Harden,Kawhi Leonard,LeBron James,Stephen Curry,2017
3,Antoni Daimiel,Moviestar+,Russell Westbrook,James Harden,Kawhi Leonard,LeBron James,Isaiah Thomas,2017
4,Ben Golliver,Sports Illustrated,James Harden,Russell Westbrook,Kawhi Leonard,LeBron James,Stephen Curry,2017
...,...,...,...,...,...,...,...,...
298,Ryan Wolstat,Toronto Sun,Giannis Antetokounmpo,James Harden,Paul George,Nikola Jokic,Damian Lillard,2019
299,Eric Woodyard,Deseret News,James Harden,Giannis Antetokounmpo,Paul George,Joel Embiid,Kawhi Leonard,2019
300,Royce Young,ESPN,Giannis Antetokounmpo,James Harden,Damian Lillard,Paul George,Nikola Jokic,2019
301,Weiping Zhang,CCTV,Giannis Antetokounmpo,James Harden,Stephen Curry,Nikola Jokic,Paul George,2019


Add player columns to all votes using the MBA official voting schemes: 1st 10 points, 2nd 7 points, 3rd 5points, 4th 3 points, and 5th 1 point.

In [480]:
#create player names in the vote table
votes_vals = votes.iloc[:, 2:7].values
voted_players =  np.unique(votes_vals)
for name in voted_players: votes[name] = 0

In [481]:
#convert the vote to scores for all votes
def convert_vote_to_score(vote_row):    
    vote_scores = [10, 7, 5, 3, 1]
    for i in range(len(vote_scores)):
        vote_row[vote_row[i+2]] = vote_scores[i]
    return vote_row

votes = votes.apply(lambda row : convert_vote_to_score(row), axis = 1) 

In [482]:
#add 2020 voter twitter handle to voter list
twitter_handles = voters_train[['Name', 'Twitter Handle']]
twitter_handles.columns = ['Voter', 'username']
twitter_handles['username'] = twitter_handles['username'].str.lower()
votes = votes.merge(twitter_handles,how='left',on='Voter')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [483]:
#move year and twitter handle column to front
votes = votes.reindex(columns=['year','username','top1', 'top2', 'top3', 'top4', 'top5','Voter','Affiliation'] + voted_players.tolist())
#drop those without twitter handle - either not voter this year or do not have twitter account
votes = votes[votes['username'].notna()]
votes.reset_index(drop=True, inplace=True)

In [484]:
votes[votes['Voter'] == 'Anthony Chiang']

Unnamed: 0,year,username,top1,top2,top3,top4,top5,Voter,Affiliation,Anthony Davis,...,Kawhi Leonard,Kevin Durant,LaMarcus Aldridge,LeBron James,Nikola Jokic,Paul George,Rudy Gobert,Russell Westbrook,Stephen Curry,Victor Oladipo
111,2019,anthony_chiang,Giannis Antetokounmpo,James Harden,Paul George,Stephen Curry,Nikola Jokic,Anthony Chiang,Miami Herald,0,...,0,0,0,0,1,5,0,0,3,0


## Process the tweets

In [238]:
tweets.shape

(610578, 44)

In [239]:
tweets.drop('Unnamed: 0.1',axis=1, inplace = True)

In [240]:
tweets.columns.values

array(['Unnamed: 0', 'user_id', 'username', 'date', 'time', 'tweet',
       'text', 'textblob', 'nltk_tweet', 'sent140', 'emoticon',
       'James Harden', 'LeBron James', 'Giannis Antetokounmpo',
       'Anthony Davis', 'Luka Doncic', 'Trae Young', 'Nikola Jokic',
       'Russell Westbrook', 'Kawhi Leonard', 'Chris Paul', 'Devin Booker',
       'Khris Middleton', 'Bam Adebayo', 'Jayson Tatum', 'Kemba Walker',
       'Jimmy Butler', 'Ben Simmons', 'Joel Embiid', 'Domantas Sabonis',
       'Brandon Ingram', 'Pascal Siakam', 'Rudy Gobert',
       'Donovan Mitchell', 'Kyle Lowry', 'Damian Lillard',
       'DeMar DeRozan', 'Isaiah Thomas', 'John Wall', 'LaMarcus Aldridge',
       'Paul George', 'Stephen Curry', 'Victor Oladipo'], dtype=object)

In [242]:
#tweets[tweets.sample(n=200).to_csv('tweet_samples_new.csv')
#tweets[tweets.sum(axis=1) > 0]
#tweets[tweets.iloc[:, 12:].sum(axis=1) > 0].sample(n=100).to_csv('tweet_samples_new1.csv')

In [243]:
#process date and time columns
def treat_date(str):
    if '/' in str:
        return (datetime.strptime(str, '%m/%d/%Y').date())
    else:
        return (datetime.strptime(str, '%Y-%m-%d').date())
# tweets.date = tweets.date.apply(parser.parse)
# tweets.time = tweets.time.apply(parser.parse)
tweets['date'] = tweets['date'].apply(lambda x: treat_date(x))
tweets['time'] = tweets['time'].apply(lambda x: datetime.strptime(x, '%H:%M:%S').time())

In [244]:
startday = pd.Series(['2016-05-11', '2017-06-27', '2018-06-26', '2019-06-25'])
endday = pd.Series(['2017-06-25', '2018-06-24', '2019-06-23', '2020-03-09'])
startday = startday.apply(lambda x: treat_date(x))
endday = endday.apply(lambda x: treat_date(x))

In [245]:
#mark the tweets for which voting year it belongs to
tweets['year'] = 0 
for i in range(0, 4):
    tweets.loc[(tweets['date'] >= startday[i]) & (tweets['date'] <= endday[i]),'year'] = 2017 + i
#move year column to front
tweets = tweets.reindex(columns=['year'] + list(tweets.columns[:-1]))
#drop the tweets that do not belong to any year
tweets.drop(tweets[tweets['year']==0].index, axis = 0, inplace = True)
#convert year into str for future processing
tweets['year'] = tweets['year'].apply(lambda x: str(x))

In [246]:
#get tweet count of all voters for each player
dropped_columns = ['Unnamed: 0', 'user_id', 'date', 'time', 'tweet', 'text','textblob', 'nltk_tweet', 'sent140', 'emoticon']
tweet_count = tweets.drop(dropped_columns,axis=1)
tweet_count = tweet_count.groupby(['year', 'username']).sum()
tweet_count = tweet_count.reset_index()

In [247]:
#visual inspection of those who didn't tweet about any player in a year
tweet_count[tweet_count.sum(axis=1) == 0]

Unnamed: 0,year,username,James Harden,LeBron James,Giannis Antetokounmpo,Anthony Davis,Luka Doncic,Trae Young,Nikola Jokic,Russell Westbrook,...,Donovan Mitchell,Kyle Lowry,Damian Lillard,DeMar DeRozan,Isaiah Thomas,John Wall,LaMarcus Aldridge,Paul George,Stephen Curry,Victor Oladipo
14,2017,jcowleyhoops,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,2017,renjunsd,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,2017,rodboone,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
179,2020,corallu1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [248]:
#create different df for different scores
dropped_columns = ['Unnamed: 0', 'user_id', 'date', 'time', 'tweet', 'text','nltk_tweet', 'sent140', 'emoticon']
tweet_textblob = tweets.drop(dropped_columns,axis=1)
dropped_columns = ['Unnamed: 0', 'user_id', 'date', 'time', 'tweet', 'text','textblob', 'sent140', 'emoticon']
tweet_nltk = tweets.drop(dropped_columns,axis=1)
dropped_columns = ['Unnamed: 0', 'user_id', 'date', 'time', 'tweet', 'text','textblob', 'nltk_tweet', 'emoticon']
tweet_sent140 = tweets.drop(dropped_columns,axis=1)
dropped_columns = ['Unnamed: 0', 'user_id', 'date', 'time', 'tweet', 'text','textblob', 'nltk_tweet', 'sent140']
tweet_emoticon = tweets.drop(dropped_columns,axis=1)

In [249]:
#get player names
players = tweets.columns[12:]
#calculate scores for each player in each tweet

def sum_voter_tweet_score(df):
    df.columns.values[2] = 'score'
    for name in players:
        df[name]=  df[name] * df['score']
    df = df.groupby(['year', 'username']).sum()
    df = df.reset_index()
    df.drop('score',axis=1, inplace = True)

    return df


tweet_textblob = sum_voter_tweet_score(tweet_textblob)
tweet_nltk = sum_voter_tweet_score(tweet_nltk)
tweet_sent140 = sum_voter_tweet_score(tweet_sent140)
tweet_emoticon = sum_voter_tweet_score(tweet_emoticon)


## Method 1: Mock up voter votes by finding top 5 highest sentiment players

In [424]:
tweet_nltk.tail()

Unnamed: 0,year,username,top1,top2,top3,top4,top5,James Harden,LeBron James,Giannis Antetokounmpo,...,Donovan Mitchell,Kyle Lowry,Damian Lillard,DeMar DeRozan,Isaiah Thomas,John Wall,LaMarcus Aldridge,Paul George,Stephen Curry,Victor Oladipo
249,2020,willguillory,Brandon Ingram,LeBron James,Kemba Walker,DeMar DeRozan,Paul George,2,14,-2,...,-2,2,-7,7,-1,-3,0,6,2,0
250,2020,windhorstespn,Anthony Davis,Trae Young,Russell Westbrook,Jayson Tatum,LeBron James,1,3,-1,...,2,0,1,0,-1,0,0,2,0,0
251,2020,yokomiyajie,Joel Embiid,James Harden,Kemba Walker,Kawhi Leonard,Trae Young,1,1,1,...,0,0,0,0,0,0,0,0,0,0
252,2020,yourmandevine,Anthony Davis,Trae Young,Giannis Antetokounmpo,Joel Embiid,Chris Paul,-8,2,14,...,1,0,1,-2,-2,1,-1,3,2,1
253,2020,zachlowe_nba,Anthony Davis,Ben Simmons,LeBron James,Luka Doncic,Bam Adebayo,3,8,1,...,3,2,6,1,0,1,0,3,2,0


How to really get the top 5 for a voter: 
1. filter out those players with tweet count >0 
2. look at only these players, choose the top 5
3. if there are less than 5 players being tweeted, leave the rest as NA 


In [251]:
#get the top 5 twitter scores for all voter tweets by year

def get_top_5(tweet_score_df):    
    df = tweet_score_df.iloc[:, 2:]
    nlargest = len(df.columns)
    order = np.argsort(-df.values, axis=1)[:, :nlargest]
    result = pd.DataFrame(df.columns[order], 
                          columns=['top{}'.format(i) for i in range(1, nlargest+1)],
                          index=df.index)

    for i in range(len(result)):
        players = result.iloc[i,:].tolist()
        #check whether player has been tweeted about or not, if not, do not include
        for player in result.iloc[i,:].tolist():
            if tweet_count.loc[i, player] == 0: 
                players.remove(player)
        for j in range(5):
            if j < len(players): 
                result.loc[i,'top{}'.format(j+1)] = players[j]
            else:
                result.loc[i,'top{}'.format(j+1)] = np.nan
    result = result.iloc[:, :5]                 
                
    df = tweet_score_df.join(result)
    df = df.reindex(columns = list(df.columns[:2]) + list(df.columns[-5:]) + list(df.columns[2:-5]))
    return df
    
tweet_textblob = get_top_5(tweet_textblob)
tweet_nltk = get_top_5(tweet_nltk)
tweet_sent140 = get_top_5(tweet_sent140)
tweet_emoticon = get_top_5(tweet_emoticon)


  import sys


In [252]:
tweet_textblob[tweet_textblob['username'] == 'anthony_chiang']

Unnamed: 0,year,username,top1,top2,top3,top4,top5,James Harden,LeBron James,Giannis Antetokounmpo,...,Donovan Mitchell,Kyle Lowry,Damian Lillard,DeMar DeRozan,Isaiah Thomas,John Wall,LaMarcus Aldridge,Paul George,Stephen Curry,Victor Oladipo
1,2017,anthony_chiang,Anthony Davis,LeBron James,Trae Young,Paul George,Chris Paul,1.822222,18.445433,0.366667,...,0.453846,0.5,0.0,0.324386,-0.551786,-0.2,0.0,3.453207,0.454167,0.0
38,2018,anthony_chiang,Anthony Davis,Bam Adebayo,LeBron James,Trae Young,Ben Simmons,-0.288194,13.873058,0.525,...,-0.158333,0.954167,-1.033333,0.425,0.0,-0.025,0.5,0.538371,1.545833,0.0
78,2019,anthony_chiang,Anthony Davis,Bam Adebayo,Jimmy Butler,LeBron James,Trae Young,-0.29,3.128742,0.41,...,0.334394,-0.4,-0.15,-0.4,1.278788,-0.301136,0.0,0.612969,0.55,-0.4
162,2020,anthony_chiang,Anthony Davis,Jimmy Butler,Bam Adebayo,LeBron James,Kawhi Leonard,-0.080556,9.100397,2.24928,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,-2.910227,0.0,-0.059821


In [253]:
votes[votes['username'] == 'anthony_chiang']

Unnamed: 0,year,username,top1,top2,top3,top4,top5,Voter,Affiliation,Anthony Davis,...,Kawhi Leonard,Kevin Durant,LaMarcus Aldridge,LeBron James,Nikola Jokic,Paul George,Rudy Gobert,Russell Westbrook,Stephen Curry,Victor Oladipo
111,2019,anthony_chiang,Giannis Antetokounmpo,James Harden,Paul George,Stephen Curry,Nikola Jokic,Anthony Chiang,Miami Herald,0,...,0,0,0,0,1,5,0,0,3,0


In [262]:
def check_twitter_votes_match ( tweet_score_df ):
    df = votes.iloc[:,:7]
    df.iloc[:,2:7] = 0
    df['top5_match'] = 0
    df['top3_match'] = 0
    for i in range(len(df)):
        yr = str(df.loc[i, 'year'])
        name = df.loc[i, 'username']
        for j in range(1,6):
            top = 'top{}'.format(j)
            vtop = votes.loc[i, top]
            ttop = tweet_score_df.loc[(tweet_score_df['year'] == yr) & (tweet_score_df['username'] == name)][top].values
            if vtop == ttop: df.loc[i, top] = 1
        vtops = votes.iloc[156, 2:7].values.tolist()
        ttops = tweet_score_df.loc[(tweet_score_df['year'] == yr) & (tweet_score_df['username'] == name)].iloc[:,2:7].values.tolist()
        if ttops != []: 
            df.loc[i,'top5_match'] = len(set(vtops) & set(ttops[0]))
            df.loc[i,'top3_match'] = len(set(vtops[:3]) & set(ttops[0][:3]))
    df['top5_ranked_match'] = df['top1'] + df['top2'] + df['top3'] + df['top4'] + df['top5']
    df['top3_ranked_match'] = df['top1'] + df['top2'] + df['top3']
    return df
    
match_textblob = check_twitter_votes_match(tweet_textblob)
match_nltk = check_twitter_votes_match(tweet_nltk)
match_sent140 = check_twitter_votes_match(tweet_sent140)
match_emoticon = check_twitter_votes_match(tweet_emoticon)

  del sys.path[0]


In [267]:
match_nltk

Unnamed: 0,year,username,top1,top2,top3,top4,top5,top5_match,top3_match,top5_ranked_match,top3_ranked_match
0,2017,billsimmons,0,0,0,0,0,0,0,0,0
1,2017,briancmahoney,0,0,0,0,0,1,0,0,0
2,2017,windhorstespn,0,0,0,0,0,0,0,0,0
3,2017,chris_broussard,0,0,0,0,0,1,0,0,0
4,2017,dennis3dscott,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
155,2019,realmikewilbon,0,0,0,0,0,2,0,0,0
156,2019,windhorstespn,0,0,0,0,0,1,0,0,0
157,2019,mattwinertv,0,0,0,0,1,2,0,1,0
158,2019,royceyoung,1,0,0,0,0,3,1,1,1


In [266]:
for match in [match_textblob, match_nltk, match_sent140, match_emoticon]:
    print('Total top5 match:', match['top5_match'].sum(), 'Total top5 ranked match:', match['top5_ranked_match'].sum())
    print('Total top3 match:', match['top3_match'].sum(), 'Total top3 ranked match:', match['top3_ranked_match'].sum())

Total top5 match: 115 Total top5 ranked match: 18
Total top3 match: 30 Total top3 ranked match: 13
Total top5 match: 147 Total top5 ranked match: 16
Total top3 match: 36 Total top3 ranked match: 11
Total top5 match: 96 Total top5 ranked match: 16
Total top3 match: 35 Total top3 ranked match: 9
Total top5 match: 61 Total top5 ranked match: 13
Total top3 match: 34 Total top3 ranked match: 7


### Method 1 match accuracy

In [273]:

n = len(match_textblob)
for match in [match_textblob, match_nltk, match_sent140, match_emoticon]:
    print('Average top5 match:', match['top5_match'].sum()/(n*5), 'Average top5 ranked match:', match['top5_ranked_match'].sum()/(n*5))
    print('Average top3 match:', match['top3_match'].sum()/(n*3), 'Average top3 ranked match:', match['top3_ranked_match'].sum()/(n*3))


Average top5 match: 0.14375 Average top5 ranked match: 0.0225
Average top3 match: 0.0625 Average top3 ranked match: 0.027083333333333334
Average top5 match: 0.18375 Average top5 ranked match: 0.02
Average top3 match: 0.075 Average top3 ranked match: 0.022916666666666665
Average top5 match: 0.12 Average top5 ranked match: 0.02
Average top3 match: 0.07291666666666667 Average top3 ranked match: 0.01875
Average top5 match: 0.07625 Average top5 ranked match: 0.01625
Average top3 match: 0.07083333333333333 Average top3 ranked match: 0.014583333333333334


In [499]:
for match in [match_textblob, match_nltk, match_sent140, match_emoticon]:
    print( match['top5_match'].sum(), match['top5_ranked_match'].sum(), match['top3_match'].sum(),match['top3_ranked_match'].sum())

115 18 30 13
147 16 36 11
96 16 35 9
61 13 34 7


In [500]:
print(n)

160


### Check the same using tweet count instead of sentiment score

In [269]:
#get the top 5 twitter scores for all voter tweets by year

tweet_count = get_top_5(tweet_count)



  import sys


In [270]:
tweet_count.head()

Unnamed: 0,year,username,top1,top2,top3,top4,top5,James Harden,LeBron James,Giannis Antetokounmpo,...,Donovan Mitchell,Kyle Lowry,Damian Lillard,DeMar DeRozan,Isaiah Thomas,John Wall,LaMarcus Aldridge,Paul George,Stephen Curry,Victor Oladipo
0,2017,adamhimmelsbach,Anthony Davis,Isaiah Thomas,LeBron James,Trae Young,Chris Paul,20,120,7,...,1,10,6,15,718,43,2,29,19,0
1,2017,anthony_chiang,Anthony Davis,LeBron James,Paul George,Bam Adebayo,DeMar DeRozan,16,197,2,...,11,1,0,23,22,2,2,39,10,0
2,2017,bdawsonwrites,Anthony Davis,Russell Westbrook,Victor Oladipo,Donovan Mitchell,Domantas Sabonis,51,37,2,...,134,1,18,10,5,11,0,34,36,237
3,2017,briancmahoney,Anthony Davis,LeBron James,Russell Westbrook,Stephen Curry,James Harden,30,83,8,...,0,10,4,16,12,4,3,21,44,1
4,2017,chris_broussard,Anthony Davis,LeBron James,Stephen Curry,Russell Westbrook,Paul George,12,70,2,...,0,1,5,3,7,6,2,25,33,0


In [271]:
match_count = check_twitter_votes_match(tweet_count)

  del sys.path[0]


In [272]:
match_count

Unnamed: 0,year,username,top1,top2,top3,top4,top5,top5_match,top3_match,top5_ranked_match,top3_ranked_match
0,2017,billsimmons,0,0,0,0,0,0,0,0,0
1,2017,briancmahoney,0,0,0,0,0,2,0,0,0
2,2017,windhorstespn,0,0,0,0,0,0,0,0,0
3,2017,chris_broussard,0,0,0,0,0,2,0,0,0
4,2017,dennis3dscott,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
155,2019,realmikewilbon,0,0,0,0,0,1,0,0,0
156,2019,windhorstespn,0,0,0,0,0,1,1,0,0
157,2019,mattwinertv,0,0,1,0,0,2,0,1,1
158,2019,royceyoung,0,0,0,0,0,1,1,0,0


In [277]:
print('Total top5 match:', match_count['top5_match'].sum(), 'Total top5 ranked match:', match_count['top5_ranked_match'].sum())
print('Total top3 match:', match_count['top3_match'].sum(), 'Total top3 ranked match:', match_count['top3_ranked_match'].sum())
print('Average top5 match:', match_count['top5_match'].sum()/(n*5), 'Average top5 ranked match:', match_count['top5_ranked_match'].sum()/(n*5))
print('Average top3 match:', match_count['top3_match'].sum()/(n*3), 'Average top3 ranked match:', match_count['top3_ranked_match'].sum()/(n*3))

Total top5 match: 128 Total top5 ranked match: 30
Total top3 match: 23 Total top3 ranked match: 19
Average top5 match: 0.16 Average top5 ranked match: 0.0375
Average top3 match: 0.04791666666666667 Average top3 ranked match: 0.03958333333333333


## Method 2: Observe top player scores

Try to get the top five of each year, see if the rankings make better sense

In [None]:
find top 5 players each year
observe how their sentiment is among all people 
also add count of tweets 

In [367]:
top_mvp = pd.DataFrame(columns = ['year','top1', 'top2', 'top3', 'top4', 'top5'])

In [368]:
for y in [2017, 2018, 2019]:
    votes_y = votes[votes['year']==y].sum()
    votes_y = votes_y[9:].sort_values(ascending=False)[:5].index.tolist()
    top_mvp = top_mvp.append(pd.DataFrame([[y]+votes_y], columns=top_mvp.columns))
top_mvp = top_mvp.reset_index(drop = True)

Unnamed: 0,year,top1,top2,top3,top4,top5
0,2017,Russell Westbrook,James Harden,Kawhi Leonard,LeBron James,Isaiah Thomas
1,2018,James Harden,LeBron James,Anthony Davis,Damian Lillard,Russell Westbrook
2,2019,Giannis Antetokounmpo,James Harden,Paul George,Nikola Jokic,Stephen Curry


In [398]:
tweet_emoticon_avg = 

'Russell Westbrook'

In [503]:
top_mvp_score = top_mvp.copy()

for eval_df in [tweet_textblob, tweet_nltk, tweet_sent140, tweet_emoticon,tweet_count]:
    for y in [2017, 2018, 2019]:
        score_y = eval_df[eval_df['year']==str(y)].sum()
        for top in ['top1', 'top2', 'top3', 'top4', 'top5']:
            top_mvp_score.loc[y-2017, top] = score_y[top_mvp.loc[y-2017, top]]        
    print(top_mvp_score)

   year     top1     top2     top3     top4     top5
0  2017  146.666  49.9472  42.2162  299.232  87.2317
1  2018   48.986  309.436  945.038  21.9008  131.917
2  2019    150.6    90.65  213.305  5.51629  118.622
   year  top1 top2   top3 top4  top5
0  2017  -419   18    -55  194  -245
1  2018    51  217  -2898  -17  -669
2  2019  -200   78   -920   17   263
   year     top1     top2     top3     top4     top5
0  2017 -394.912 -140.496 -67.8914  -743.23 -36.5382
1  2018 -136.149 -981.155 -5830.72 -60.5542 -329.145
2  2019 -414.911 -296.292 -34.5347 -7.59053 -252.344
   year     top1     top2     top3      top4     top5
0  2017 -608.657  -206.28 -32.1759  -1108.44  139.861
1  2018 -203.278 -1242.13 -3604.35 -0.667528 -573.707
2  2019 -623.591 -423.238 -172.968  -11.8184 -701.002
   year  top1  top2   top3  top4  top5
0  2017  2203   722    525  3526  1323
1  2018   663  3891  23044   361  2475
2  2019  2158  1308   3226    43  1691


In [413]:
top_mvp_count = top_mvp.copy()
for y in [2017, 2018, 2019]:
    count_y = tweet_count[tweet_count['year']==str(y)].sum()
    for top in ['top1', 'top2', 'top3', 'top4', 'top5']:
        top_mvp_count.loc[y-2017, top] = count_y[top_mvp.loc[y-2017, top]]        
top_mvp_count  

Unnamed: 0,year,top1,top2,top3,top4,top5
0,2017,2203,722,525,3526,1323
1,2018,663,3891,23044,361,2475
2,2019,2158,1308,3226,43,1691


## Generate Prediction 

In [492]:
final = tweet_nltk[tweet_nltk['year']=='2020'].reset_index(drop = True)

In [493]:
final = final.iloc[:, :7]

In [494]:
votes_vals = final.iloc[:, 2:8].fillna('')
votes_vals = votes_vals.values
voted_players = np.delete(np.unique(votes_vals), 0)
for name in voted_players: final[name] = 0
final = final.apply(lambda row : convert_vote_to_score(row), axis = 1) 
final = final.reindex(columns=['year','username','top1', 'top2', 'top3', 'top4', 'top5'] + voted_players.tolist())

In [495]:
final

Unnamed: 0,year,username,top1,top2,top3,top4,top5,Anthony Davis,Bam Adebayo,Ben Simmons,...,LaMarcus Aldridge,LeBron James,Luka Doncic,Pascal Siakam,Paul George,Rudy Gobert,Russell Westbrook,Stephen Curry,Trae Young,Victor Oladipo
0,2020,adaimiel,LeBron James,Paul George,Damian Lillard,Rudy Gobert,Stephen Curry,0,0,0,...,0,10,0,0,7,3,0,1,0,0
1,2020,adamhimmelsbach,Kemba Walker,Jayson Tatum,Trae Young,Jimmy Butler,Giannis Antetokounmpo,0,0,0,...,0,0,0,0,0,0,0,0,5,0
2,2020,andyblarsen,Joel Embiid,Giannis Antetokounmpo,Kawhi Leonard,Isaiah Thomas,LeBron James,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,2020,anthony_chiang,Bam Adebayo,Anthony Davis,Jimmy Butler,LeBron James,Giannis Antetokounmpo,7,10,0,...,0,3,0,0,0,0,0,0,0,0
4,2020,aschnba,Anthony Davis,LeBron James,Joel Embiid,Russell Westbrook,Khris Middleton,10,0,0,...,0,7,0,0,0,0,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,2020,willguillory,Brandon Ingram,LeBron James,Kemba Walker,DeMar DeRozan,Paul George,0,0,0,...,0,7,0,0,1,0,0,0,0,0
91,2020,windhorstespn,Anthony Davis,Trae Young,Russell Westbrook,Jayson Tatum,LeBron James,10,0,0,...,0,1,0,0,0,0,5,0,7,0
92,2020,yokomiyajie,Joel Embiid,James Harden,Kemba Walker,Kawhi Leonard,Trae Young,0,0,0,...,0,0,0,0,0,0,0,0,1,0
93,2020,yourmandevine,Anthony Davis,Trae Young,Giannis Antetokounmpo,Joel Embiid,Chris Paul,10,0,0,...,0,0,0,0,0,0,0,0,7,0


In [497]:
votes_pred = final.sum()
votes_pred = votes_pred[2:].sort_values(ascending=False)[:5].index.tolist()
print(votes_pred)

['Anthony Davis', 'LeBron James', 'Trae Young', 'Paul George', 'James Harden']
