In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc
import re

from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale, minmax_scale
from scipy.stats import norm

import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore')
warnings.filterwarnings('ignore')

# change to path
PATH='/content/drive/My Drive/Colab Notebooks/nba/'
os.chdir(PATH)

## Read in

In [3]:
# PATH already ends with '/', so f'{PATH}/dataset/...' produced a doubled
# separator; os.path.join builds the same locations without it.
DATA_DIR = os.path.join(PATH, 'dataset')
train = pd.read_csv(os.path.join(DATA_DIR, 'training_set.csv'), encoding='ISO-8859-1')
test = pd.read_csv(os.path.join(DATA_DIR, 'holdout_set.csv'), encoding='ISO-8859-1')

train.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...


In [4]:
len(train), len(test)

(7766, 1000)

## Preprocessing

In [0]:
def str_to_date(s, split):
    """Return one field of ``s`` after splitting it on single spaces.

    Parameters
    ----------
    s : str
        Text to split, e.g. ``'2019-05-21 23:30:51 EDT'``.
    split : int
        Position of the field to return.

    Raises
    ------
    IndexError
        If ``s`` has no field at position ``split``.
    """
    fields = s.split(' ')
    return fields[split]
  
def add_datepart(df, fldname, drop=True, time=False):
    """Expand the date column ``fldname`` of ``df`` into calendar features, in place.

    For every attribute in the list below a column named ``<prefix><Attr>`` is
    added, where ``<prefix>`` is ``fldname`` with a trailing ``date``/``Date``
    removed. A ``<prefix>Elapsed`` column holding whole seconds since
    1970-01-01 (UTC for timezone-aware data) is added as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    fldname : str
        Name of the column holding the dates. If it is not already a datetime
        column it is parsed with ``pd.to_datetime`` and written back.
    drop : bool, default True
        Remove ``fldname`` from ``df`` once the features have been added.
    time : bool, default False
        Also add ``Hour``, ``Minute`` and ``Second`` columns.
    """
    fld = df[fldname]
    # Public dtype check; covers both naive and timezone-aware datetime columns.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # `infer_datetime_format` is deprecated (format inference is the default
        # in current pandas), so it is no longer passed.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # `Series.dt.week` was removed in pandas 2.0; the ISO calendar week is
            # its replacement. Cast to the dtype of the sibling integer features.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values
    # Seconds since the epoch, computed from a timedelta so the result does not
    # depend on the column's storage resolution (ns, us, s, ...).
    naive_utc = fld
    if fld.dt.tz is not None:
        naive_utc = fld.dt.tz_convert('UTC').dt.tz_localize(None)
    df[targ_pre + 'Elapsed'] = (naive_utc - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    if drop:
        df.drop(fldname, axis=1, inplace=True)
      
def get_time_split(time):
    """Encode a ``'HH:MM:SS'`` string as the integer formed by the digits of HH and MM.

    The first two ``':'``-separated fields are concatenated and parsed as an
    int, so ``'23:30:51'`` becomes ``2330`` and ``'02:05:00'`` becomes ``205``.
    """
    leading_fields = time.split(':')[:2]
    digits = ''.join(leading_fields)
    return int(digits)
  
def get_hour(time):
    """Return the hour of a ``'HH:MM:SS'`` string (the text before the first ``':'``) as an int."""
    hour_text, _, _ = time.partition(':')
    return int(hour_text)
  
def get_minute(time):
    """Return the minute of a ``'HH:MM:SS'`` string rounded to a multiple of ten.

    Rounding uses the built-in ``round(..., -1)`` on an int, so ties go to the
    even multiple (5 -> 0, 15 -> 20) and minutes 55-59 produce 60.
    """
    minute_text = time.split(':')[1]
    minute_value = int(minute_text)
    return round(minute_value, -1)

In [0]:
def preprocess(df):
    """Derive date, time-of-day and post-type features from a raw posts frame.

    ``df`` must have a ``'Created'`` column of the form ``'<date> <time> <tz>'``
    and a ``'Type'`` column. The date/time columns are added to ``df`` itself;
    the returned frame is a new object that additionally carries one
    ``Type_<value>`` indicator column per distinct value of ``'Type'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw posts. Mutated in place (new columns are added).

    Returns
    -------
    pandas.DataFrame
        ``df`` concatenated column-wise with the ``Type`` indicator columns.
    """
    # Peel the three space-separated fields off 'Created'.
    for position, column in enumerate(('date', 'time', 'tz')):
        df[column] = df.apply(
            lambda row, position=position: str_to_date(row['Created'], position),
            axis=1,
        )

    # Parse the calendar date, then expand a working copy into date-part columns
    # (add_datepart removes the working column afterwards by default).
    df['date'] = pd.to_datetime(df['date'])
    df['date_tsfm'] = pd.to_datetime(df['date'])
    add_datepart(df, 'date_tsfm')

    # Indicator columns for the post type.
    type_dummies = pd.get_dummies(df['Type'], prefix='Type')

    # Time-of-day features taken from the 'HH:MM:SS' text.
    clock = df['time']
    df['time_split'] = clock.apply(get_time_split)
    df['hour'] = clock.apply(get_hour)
    df['minute'] = clock.apply(get_minute)

    return pd.concat([df, type_dummies], axis=1)

In [0]:
# preprocess() adds columns to the frame it is given and returns a new frame that
# also carries the Type_* indicator columns, so both names are rebound to the result.
# NOTE(review): re-running this cell on already-processed frames would append the
# Type_* columns a second time; reload the CSVs first.
train = preprocess(train)
test = preprocess(test)

In [8]:
train.shape, test.shape

((7766, 27), (1000, 27))

In [9]:
train.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description,date,time,tz,date_tsfmYear,date_tsfmMonth,date_tsfmWeek,date_tsfmDay,date_tsfmDayofweek,date_tsfmDayofyear,date_tsfmIs_month_end,date_tsfmIs_month_start,date_tsfmIs_quarter_end,date_tsfmIs_quarter_start,date_tsfmIs_year_end,date_tsfmIs_year_start,date_tsfmElapsed,time_split,hour,minute,Type_Album,Type_Photo,Type_Video
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...,2019-05-21,23:30:51,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2330,23,30,0,0,1
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...,2019-05-21,22:53:33,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2253,22,50,0,0,1
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!,2019-05-21,22:19:58,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2219,22,20,0,0,1
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!,2019-05-21,22:02:41,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2202,22,0,0,0,1
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...,2019-05-21,20:47:49,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2047,20,50,0,0,1


## Text features

In [11]:
# Take the caption at row position 6 as a running example for the text helpers below.
ex = train.iloc[6]['Description']
ex

'The @warriors locked in for four games to advance to the #NBAFinals presented by YouTube TV! #GatoradeZero'

In [31]:
def count_hashes(s):
    """Return the number of ``'#'`` characters in ``s``.

    Non-string input (for example a NaN cell) counts as 0. An explicit type
    check replaces the former bare ``except:``, which would also have hidden
    unrelated errors and interrupts.
    """
    if not isinstance(s, str):
        return 0
    return s.count('#')
  
def count_at_signs(s):
    """Return the number of ``'@'`` characters in ``s``.

    Non-string input (for example a NaN cell) counts as 0. An explicit type
    check replaces the former bare ``except:``, which would also have hidden
    unrelated errors and interrupts.
    """
    if not isinstance(s, str):
        return 0
    return s.count('@')
  
def count_punct(s):
    """Return how many *different* punctuation marks from a fixed set occur in ``s``.

    Each mark contributes at most 1 no matter how often it appears, so
    ``'a!!b'`` gives 1. ``'#'``, ``'@'`` and ``'&'`` are not in the set.
    Non-string input gives 0.
    """
    if not isinstance(s, str):
        return 0
    punct = '?!.,"$%\'()*+-/:;<=>[\\]^_`{|}~' + '“”’'
    return sum(1 for mark in punct if mark in s)
  
def count_tokens(s):
    """Return the number of whitespace-separated tokens in ``s``.

    Non-string input (for example a NaN cell) counts as 0. An explicit type
    check replaces the former bare ``except:``, which would also have hidden
    unrelated errors and interrupts.
    """
    if not isinstance(s, str):
        return 0
    return len(s.split())
    
# Smoke-check the helpers on the example caption. The last call previously used
# `count_len`, which is not defined in this notebook; `count_tokens` is the
# helper that produces the token count.
count_hashes(ex), count_at_signs(ex), count_punct(ex), count_tokens(ex)

(2, 1, 1, 17)

In [0]:
def text_features(df):
    """Add simple count features computed from ``df['Description']``, in place.

    Adds the columns ``num_hashes``, ``num_at_signs``, ``num_punct`` and
    ``num_words``. ``df`` is mutated and nothing is returned.
    """
    extractors = (
        ('num_hashes', count_hashes),
        ('num_at_signs', count_at_signs),
        ('num_punct', count_punct),
        ('num_words', count_tokens),
    )
    for column, extractor in extractors:
        df[column] = df['Description'].apply(extractor)
    
# Adds the num_* columns to `train` in place.
# NOTE(review): no matching call for `test` appears in the visible cells — confirm
# the holdout frame gets the same columns before modelling.
text_features(train)    

In [35]:
train.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description,date,time,tz,date_tsfmYear,date_tsfmMonth,date_tsfmWeek,date_tsfmDay,date_tsfmDayofweek,date_tsfmDayofyear,date_tsfmIs_month_end,date_tsfmIs_month_start,date_tsfmIs_quarter_end,date_tsfmIs_quarter_start,date_tsfmIs_year_end,date_tsfmIs_year_start,date_tsfmElapsed,time_split,hour,minute,Type_Album,Type_Photo,Type_Video,num_hashes,num_at_signs,num_punct,num_words
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...,2019-05-21,23:30:51,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2330,23,30,0,0,1,0,4,1,15
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...,2019-05-21,22:53:33,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2253,22,50,0,0,1,0,2,2,12
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!,2019-05-21,22:19:58,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2219,22,20,0,0,1,0,2,2,8
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!,2019-05-21,22:02:41,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2202,22,0,0,0,1,0,0,1,9
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...,2019-05-21,20:47:49,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2047,20,50,0,0,1,0,1,2,10


## Ideas

*   Tokenize text to get NBA teams' Instagram handles. Perform `OneHotEncoding` on the names.
*   Tokenize text to get players' Instagram handles. Perform `OneHotEncoding` on the names. Mask names with `[TEAM]_player`.
*   Time, i.e., before or after a game, playoffs or regular season.
*   Understand how `Engagements` is generated.


In [0]:
# Flag read by get_at_sign / get_hash: when True they return the list of matches,
# when False they return the matches joined with '|'.
get_set = False

In [0]:
def get_at_sign(s):
    """Extract the mention names (word characters following each ``'@'``) from ``s``.

    Parameters
    ----------
    s : object
        Caption text. Anything that is not a ``str`` (e.g. a NaN cell) yields ``''``.

    Returns
    -------
    list of str or str
        The matches as a list when the module-level ``get_set`` flag is truthy,
        otherwise the matches joined with ``'|'`` (``''`` when there are none).

    Notes
    -----
    The previous ``except Exception`` also swallowed a ``NameError`` when
    ``get_set`` had not been defined, silently returning ``''`` for every row.
    Only the non-string case is handled now, so such mistakes surface.
    """
    if not isinstance(s, str):
        return ''
    ls = re.findall(r"@(\w+)", s)
    return ls if get_set else '|'.join(ls)
  
def get_hash(s):
    """Extract the hashtag names (word characters following each ``'#'``) from ``s``.

    Parameters
    ----------
    s : object
        Caption text. Anything that is not a ``str`` (e.g. a NaN cell) yields ``''``.

    Returns
    -------
    list of str or str
        The matches as a list when the module-level ``get_set`` flag is truthy,
        otherwise the matches joined with ``'|'`` (``''`` when there are none).

    Notes
    -----
    The previous ``except Exception`` also swallowed a ``NameError`` when
    ``get_set`` had not been defined, silently returning ``''`` for every row.
    Only the non-string case is handled now, so such mistakes surface.
    """
    if not isinstance(s, str):
        return ''
    ls = re.findall(r"#(\w+)", s)
    return ls if get_set else '|'.join(ls)

In [38]:
%%time
# Pull the hashtag and mention names out of every caption into two new columns.
train['hashes'] = train['Description'].apply(get_hash)
train['at_signs'] = train['Description'].apply(get_at_sign)

CPU times: user 26.9 ms, sys: 2.11 ms, total: 29 ms
Wall time: 31.4 ms


In [39]:
train.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description,date,time,tz,date_tsfmYear,date_tsfmMonth,date_tsfmWeek,date_tsfmDay,date_tsfmDayofweek,date_tsfmDayofyear,date_tsfmIs_month_end,date_tsfmIs_month_start,date_tsfmIs_quarter_end,date_tsfmIs_quarter_start,date_tsfmIs_year_end,date_tsfmIs_year_start,date_tsfmElapsed,time_split,hour,minute,Type_Album,Type_Photo,Type_Video,num_hashes,num_at_signs,num_punct,num_words,hashes,at_signs
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...,2019-05-21,23:30:51,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2330,23,30,0,0,1,0,4,1,15,,raptors|sergeibaka|normanpowell4|fredvanvleet
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...,2019-05-21,22:53:33,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2253,22,50,0,0,1,0,2,2,12,,kyle_lowry7|raptors
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!,2019-05-21,22:19:58,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2219,22,20,0,0,1,0,2,2,8,,k_mid22|bucks
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!,2019-05-21,22:02:41,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2202,22,0,0,0,1,0,0,1,9,,
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...,2019-05-21,20:47:49,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2047,20,50,0,0,1,0,1,2,10,,giannis_an34


In [0]:
# at_signs_ls = train['at_signs'].tolist()
# at_signs_set = set()

# for doc in tqdm(at_signs_ls):
#     for token in doc:
#         at_signs_set.add(token)
        
# hashes_ls = train['hashes'].tolist()
# hashes_set = set()

# for doc in tqdm(hashes_ls):
#     for token in doc:
#         hashes_set.add(token)

# len(at_signs_set), len(hashes_set)

In [0]:
# print('len of unique hashes before removing:', hashes_mat.shape[1])
# print('len of unique hashes after removing:', \
#       len(np.where(np.sum(hashes_mat.values, axis=0) > 3)[0]))
# print('len of unique at_signs before removing:', at_signs_mat.shape[1])
# print('len of unique at_signs after removing:', \
#       len(np.where(np.sum(at_signs_mat.values, axis=0) > 10)[0]))

Filter out rare tokens: drop every hashtag in `hashes` that appears 3 times or fewer and every mention in `at_signs` that appears 10 times or fewer. This reduces the number of distinct `hashes` from 384 to 133 and the number of distinct `at_signs` from 848 to 159.

In [0]:
# hashes_ls = list(hashes_set)
# idx_keep = list(np.where(np.sum(hashes_mat.values, axis=0) > 3)[0])
# hashes_keep = [hashes_ls[i] for i in idx_keep]

# at_signs_ls = list(at_signs_set)
# idx_keep = list(np.where(np.sum(at_signs_mat.values, axis=0) > 10)[0])
# at_signs_keep = [at_signs_ls[i] for i in idx_keep]

In [0]:
# len(at_signs_keep)

# vals = [''] * len(at_signs_keep)

# # vals

# at_signs_dic = dict(zip(at_signs_keep, vals)) 

In [0]:
# codes = {
#     PHI	76ers
#     MIL	Bucks
#     CHI	Bulls
#     CLE	Cavaliers
#     BOS	Celtics
#     LAC	Clippers
#     MEM	Grizzlies
#     ATL	Hawks
#     MIA	Heat
#     CHA	Hornets
#     UTA	Jazz
#     SAC	Kings
#     NYK	Knicks
#     LAL	Lakers
#     ORL	Magic
#     DAL	Mavericks
#     BKN	Nets
#     DEN	Nuggets
#     IND	Pacers
#     NOP	Pelicans
#     DET	Pistons
#     TOR	Raptors
#     HOU	Rockets
#     SAS	Spurs
#     PHX	Suns
#     OKC	Thunder
#     MIN	Timberwolves
#     POR	Trailblazers
#     GSW	Warriors
#     WAS	Wizards
# }

In [0]:
# at_signs_dic = {
#      '22wiggins': 'MIN',
#      '24baze': 'ATL',
#      '50mejri': 'DAL',
#      'BlakeGriffin23': 'DET',
#      'Cavs': 'CLE',
#      'ESPN': 'ESPN',
#      'GRDrive': 'DET',
# #      'Google': '',
# #      'MGMResortsIntl': '',
#      'ManuGinobili': 'SAS',
#      'MinnesotaLynx': 'MIN',
#      'NBA2KLeague': 'NBA2KLeague',
# #      'Ninja': '',
#      'OKCBlue': 'OKC',
#      'Raptors': 'TOR',
#      'Raptors905': 'TOR',
#      'SLCStars': 'UTA',
#      'StephenCurry30': 'GSW',
#      'SteveNash': 'PHX',
#      'WNBA': 'NBA',
#      'Warriors': 'GSW',
#      'YgTrece': '',
#      '_alvo_': '',
#      '_tonyparker09': '',
#      'aarontaos': '',
#      'adr13nsanmiguel': '',
#      'alhorford': '',
#      'allie14quigs': '',
#      'andredrummondd': '',
#      'anthonyanderson': '',
#      'aplayersprogram': '',
#      'attcenter': '',
#      'austinjmills': '',
#      'badbunnypr': '',
#      'beatsbydre': '',
#      'bronzewhale': '',
#      'btyphoto': '',
#      'bucks': '',
#      'caldwellpope': '',
#      'cavs': '',
#      'cbrickley603': '',
#      'celtics': '',
#      'cgray209': '',
#      'chicagosky': '',
#      'chiefhasarrived': '',
#      'collinyoungbull': '',
#      'coryjoseph': '',
#      'cp3basketballacademy': '',
#      'dallasmavs': '',
#      'darrenrovell': '',
#      'davidluiz_4': '',
#      'deanthony_melton': '',
#      'dequanmjones': '',
#      'dillonbrooks24': '',
#      'djkhaled': '',
#      'dk2house': '',
#      'dmillerky': '',
#      'docrivers': '',
#      'doedoe_10': '',
#      'dominiquewilkins21': '',
#      'dreamville': '',
#      'dulenader2': '',
#      'enikonhart': '',
#      'evanfournier10': '',
#      'frank_ntilikina': '',
#      'fredvanvleet': '',
#      'gabunion': '',
#      'giannis_an34': '',
#      'gorguidieng': '',
#      'greenranger14': '',
#      'houstonrockets': '',
#      'iam_jamesjohnson': '',
#      'iamjamiefoxx': '',
#      'impjrose': '',
#      'ipjh55': '',
#      'isaiahthomas': '',
#      'jasonwilliams55': '',
#      'jbell': '',
#      'jcamerato': '',
#      'jcollins20_': '',
#      'jimmybutler': '',
#      'jjredick': '',
#      'kareemabduljabbar_33': '',
#      'katielou33': '',
#      'kennethfaried35': '',
#      'kenny': '',
#      'kennysmith': '',
#      'kknox_23': '',
#      'kkorv26': '',
#      'kobebryant': '',
#      'la_sparks': '',
#      'landryshamet': '',
#      'liluzivert': '',
#      'lonniewalkeriv': '',
#      'lsubasketball': '',
#      'mac11': '',
#      'mache275': '',
#      'magicjohnson': '',
#      'malcolmbrogdon': '',
#      'matrix31': '',
#      'matthewdelly': '',
#      'maximilian': '',
#      'melvinjrr': '',
#      'mentornmp': '',
#      'meyersleonard11': '',
#      'michaelbjordan': '',
#      'michaeldapaah_': '',
#      'mike_2reckless': '',
#      'milosteodosic4': '',
#      'moe_harkless': '',
#      'moosemonroe15': '',
#      'moritz_weasley': '',
#      'mpj': '',
#      'nbaacademy': '',
#      'nbacoaches': '',
#      'neweracap': '',
#      'neymarjr': '',
#      'nmaahc': '',
#      'northcoastbluechips': '',
#      'nuggets': '',
#      'officialzhouqi': '',
#      'okarowhite': '',
#      'ottodayporter22': '',
#      'panicatthedisco': '',
#      'paugasol': '',
#      'paulmillsap4': '',
#      'phoenixmercury': '',
#      'pierretpelican': '',
#      'pskills43': '',
#      'qcook323': '',
#      'quavohuncho': '',
#      'quese': '',
#      'reemix05': '',
#      'rmfycharlotte': '',
#      'ruffles': '',
#      'sap': '',
#      'sergeibaka': '',
#      'shabazznap13r': '',
#      'shanelarkin_0': '',
#      'stuffmagic': '',
#      'sylvia_fowles': '',
#      'taureanprince': '',
#      'taylorbennett': '',
#      'teddysphotos': '',
#      'tharealjsimms': '',
#      'the_4th_holiday': '',
#      'therea1djones': '',
#      'tic_pix': '',
#      'tjmcconnell': '',
#      'tmac213': '',
#      'traeyoung': '',
#      'trezz24': '',
#      'tysonchandler': '',
#      'uabasketball': '',
#      'usabasketball': '',
#      'y0bull': '',
#      'youngheirgordon': '',
#      'youtube': '',
#      'yutawatanabe12': ''
# }

In [0]:
# One indicator column per distinct hashtag / mention. The separator is spelled
# out because it must match the '|' used to join the names in the
# 'hashes' / 'at_signs' columns (their string form, i.e. get_set = False).
hashes_mat = train['hashes'].str.get_dummies(sep='|')
at_signs_mat = train['at_signs'].str.get_dummies(sep='|')

In [0]:
# hashes_mat.shape[1], at_signs_mat.shape[1]

In [0]:
# top_10 = np.argsort(np.sum(hashes_mat.values, axis=0))[::-1][:10]
# np.sum(at_signs_mat, axis=0)

In [0]:
# hashes_mat.head()

# [hashes_mat.columns[c] for c in top_10]

# hashes_mat.iloc[:,213]

In [0]:
# Characters removed outright by clean_text.
_CLEAN_TEXT_STRIP = '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'
# Characters turned into a single space; these take precedence over removal.
_CLEAN_TEXT_TO_SPACE = "/-'"
_CLEAN_TEXT_TABLE = str.maketrans({
    **{ch: None for ch in _CLEAN_TEXT_STRIP},
    **{ch: ' ' for ch in _CLEAN_TEXT_TO_SPACE},
    '&': ' & ',
})


def clean_text(x):
    """Strip punctuation from a caption.

    ``/``, ``-`` and ``'`` become a space, ``&`` is padded with spaces, and the
    remaining punctuation (including ``#``, ``@`` and ``_``) is deleted.
    Whitespace is not collapsed afterwards.

    Parameters
    ----------
    x : object
        Caption text. ``None`` and float NaN are treated as missing and give
        ``''`` (previously they became the literal words ``'None'`` / ``'nan'``);
        any other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    # `x != x` is true only for NaN among floats.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    return str(x).translate(_CLEAN_TEXT_TABLE)

In [0]:
# Punctuation-free copy of every caption.
train['clean_des'] = train['Description'].apply(clean_text)

In [56]:
train['clean_des'].head()

0    The raptors bench trio of sergeibaka normanpow...
1    kylelowry7 pulls from deep for the raptors in ...
2           kmid22 with some english on the bucks dime
3           Kawhi punches it home with the left on TNT
4    giannisan34 goes baseline early to rock the ri...
Name: clean_des, dtype: object

In [0]:
# # def ifnone(a,b):
# #     "`a` if `a` is not None, otherwise `b`."
# #     return b if a is None else a
  
# def emb_sz_rule(n_cat:int)->int: return min(600, round(1.6 * n_cat**0.56))

# def def_emb_sz(classes, n, sz_dict=None):
#     "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
#     sz_dict = ifnone(sz_dict, {})
#     n_cat = len(classes[n])
#     sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
#     return n_cat,sz
  
# def get_emb_szs(sz_dict=None):
#     "Return the default embedding sizes suitable for this data or takes the ones in `sz_dict`."
#     return [def_emb_sz(classes, n, sz_dict) for n in cat_names]  

# emb_sz_rule(384), emb_sz_rule(848)

In [0]:
# def_emb_sz(384, 7766)

# classes = ['car', 'plane', 'jet']

# labels = [0, 1, 1, 0, 0]

# X = ['car', 'plans', 'jet', 'plane', 'jet']

# def_emb_sz(X, 3)

# get_emb_szs()

In [0]:
# X = np.random.rand(hashes_mat.shape[1], emb_sz_rule(hashes_mat.shape[1]))

In [0]:
# X.shape

In [0]:
# train_data_X[0]