In [1]:
# Colab-only setup: import the Google Drive helper.
from google.colab import drive

# Mount Drive at /content/drive. This will prompt for authorization
# the first time it runs in a session.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import operator 
import os
import gc
import re

from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale, minmax_scale
from scipy.stats import norm

import lightgbm as lgb

import warnings
# Silence every warning for the rest of the session; both calls install a
# catch-all 'ignore' filter, so any deprecation or chained-assignment
# warning raised further down is hidden.
warnings.simplefilter(action='ignore')
warnings.filterwarnings('ignore')

# Project root on the mounted Drive (note the trailing slash). Change this to
# your own location; the working directory is switched to it as well.
PATH='/content/drive/My Drive/Colab Notebooks/nba/'
os.chdir(PATH)

## Read in

In [3]:
# Load the training posts and the holdout posts; both CSVs are read as ISO-8859-1.
# NOTE(review): PATH already ends with '/', so these f-strings contain a double
# slash ('.../nba//dataset/...'). Presumably harmless on this filesystem - confirm.
train = pd.read_csv(f'{PATH}/dataset/training_set.csv', encoding = 'ISO-8859-1')
test = pd.read_csv(f'{PATH}/dataset/holdout_set.csv', encoding = 'ISO-8859-1')

train.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description
0,502093,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...
1,603380,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...
2,603380,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!
3,725100,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!
4,661446,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...


In [4]:
# Row counts of the training and holdout frames.
len(train), len(test)

(7766, 1000)

In [5]:
# Remember how many rows belong to the training set so that the combined
# frame can be split back into train / test after feature engineering.
train_len = len(train)

# Stack train on top of test. ignore_index=True gives the combined frame a
# unique 0..n-1 index; without it both halves keep their own 0-based labels,
# and duplicated labels are fragile for every index-aligned operation used
# later (column-wise concat, Series-valued fillna).
df = pd.concat([train, test], ignore_index=True)

df.shape

(8766, 5)

## Preprocessing

In [6]:
print('Missing descriptions:', df['Description'].isna().sum())

# Replace missing captions with an empty string so the text helpers can call
# str methods on every row. The result is assigned back to the column:
# `df[col].fillna(..., inplace=True)` operates on a temporary Series and does
# not update `df` when pandas Copy-on-Write is active.
df['Description'] = df['Description'].fillna('')

print('Missing descriptions:', df['Description'].isna().sum())

Missing descriptions: 14
Missing descriptions: 0


## Time feature

*   Hour, minute, second
*   Day of the week
*   Month
*   Quarter
*   Year
*   Number of posts on the day
*   Number of posts 2 days before (since playoffs have a one-day gap)



In [0]:
def str_to_date(s, split):
    """Split ``s`` on single spaces and return the piece at index ``split``.

    Used on the 'Created' strings ('<date> <time> <tz>') to pick one field.
    """
    pieces = s.split(' ')
    return pieces[split]
  
def add_datepart(df, fldname, drop=True, time=False):
    """
    Taken from fast.ai
    Helper function that adds columns relevant to a date.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : DataFrame holding the date column.
    fldname : name of the date column; it is converted to datetime in place
        if it is not already a datetime column.
    drop : when True, the source column is dropped after the parts are added.
    time : when True, 'Hour', 'Minute' and 'Second' columns are added too.

    New columns are named ``<prefix><Part>`` where the prefix is ``fldname``
    with a trailing 'date' / 'Date' removed, plus ``<prefix>Elapsed`` holding
    whole seconds since the Unix epoch.
    """
    fld = df[fldname]
    # is_datetime64_any_dtype covers naive and tz-aware columns and, unlike
    # np.issubdtype, does not choke on pandas extension dtypes.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # `infer_datetime_format` is deprecated (format inference is the
        # default behaviour in current pandas), so it is no longer passed.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is
            # its replacement. Cast to the dtype of the other integer parts.
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values
    # Normalise to nanosecond resolution (where pandas supports other units)
    # so that the integer view divided by 1e9 is always whole seconds.
    as_ns = fld.dt.as_unit('ns') if hasattr(fld.dt, 'as_unit') else fld
    df[targ_pre + 'Elapsed'] = as_ns.astype(np.int64) // 10 ** 9
    if drop:
        df.drop(fldname, axis=1, inplace=True)
      
def get_time_split(time):
    """Turn an 'HH:MM[:SS]' string into the integer HHMM (e.g. '22:53:33' -> 2253)."""
    first, *others = time.split(':')
    # Glue the first field to the second one (if any) and read it as a number.
    return int(first + ''.join(others[:1]))
  
def get_hour(time):
    """Return the hour field (text before the first ':') as an int."""
    hour_text, _, _ = time.partition(':')
    return int(hour_text)
  
def get_minute(time):
    """Return the minute field of an 'HH:MM[:SS]' string rounded to the nearest ten.

    Uses Python's built-in ``round`` on an int, so exact halves go to the
    even ten (25 -> 20, 35 -> 40) and 55..59 become 60.
    """
    minute = int(time.split(':')[1])
    return round(minute, -1)

In [0]:
def preprocess(df):
    """Add date, time and post-type features derived from 'Created' and 'Type'.

    'Created' is expected to look like '<date> <time> <tz>' separated by
    single spaces. The input frame is modified in place (new columns are added
    to it) and a new frame that additionally carries the one-hot 'Type_*'
    columns is returned.

    Columns added: 'date' (datetime), 'time', 'tz', the ``date_tsfm*`` parts
    produced by ``add_datepart``, 'time_split' (HHMM as int), 'hour',
    'minute' (rounded to tens) and one 'Type_<value>' indicator per post type.
    """
    # Split 'Created' once with vectorised string ops instead of re-splitting
    # it in three separate row-wise apply calls.
    created_parts = df['Created'].str.split(' ')
    df['date'] = pd.to_datetime(created_parts.str[0])
    df['time'] = created_parts.str[1]
    df['tz'] = created_parts.str[2]

    # add_datepart drops its source column, so hand it a throw-away copy of
    # the already-parsed 'date' column and keep 'date' itself for later use.
    df['date_tsfm'] = df['date']
    add_datepart(df, 'date_tsfm')

    # One-hot encode the post type. The dtype is pinned so the indicators are
    # 0/1 integers regardless of the pandas version's default dummy dtype.
    type_dummies = pd.get_dummies(df['Type'],
                                  prefix='Type',
                                  dtype=np.uint8)

    # Time-of-day features taken from the 'HH:MM:SS' string.
    df['time_split'] = df['time'].apply(get_time_split)
    df['hour'] = df['time'].apply(get_hour)
    df['minute'] = df['time'].apply(get_minute)

    return pd.concat([df, type_dummies],
                     axis=1)

In [0]:
# Add the date / time / post-type features. This rebinds `df`, so the cell
# is meant to be run once per kernel session.
df = preprocess(df)

To get the number of posts on the day, check to see how many posts were posted on each day.

In [0]:
# Number of posts per calendar day, as a {date: count} dict.
# The count is taken over the combined train + holdout frame.
date_counts = df.groupby('date')['time'].count().to_dict()

# New feature: how many posts were published on the same day as this post.
df['num_posts'] = df['date'].map(date_counts)

In [0]:
# Calendar date two days before each post.
df['date_lag_2'] = df['date'] - np.timedelta64(2, 'D')

# New feature: number of posts published two days earlier
# (NaN when nothing was posted on that day).
df['num_posts_lag_2'] = df['date_lag_2'].map(date_counts)

# Where there were no posts two days earlier, fall back to the
# post count of the post's own day.
df['num_posts_lag_2'] = df['num_posts_lag_2'].fillna(df['num_posts'])

In [12]:
# Check whether any missing values remain in the lag feature (expect 0).
len(df[df['num_posts_lag_2'].isna()])

0

The code below is mostly taken from https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing.

In [0]:
def load_embed(file=None, max_lines=500000):
    """Load word vectors from a text embedding file into a dict.

    Each data line is '<word> <v1> <v2> ...'. Reading stops when the line
    counter reaches ``max_lines`` (that line itself is not read), matching the
    previous hard-coded limit of 500000; pass ``None`` to read the whole file.

    Parameters
    ----------
    file : path of the embedding file; defaults to 'crawl-300d-2M.vec' in the
        current working directory.
    max_lines : line number at which reading stops, or None for no limit.

    Returns
    -------
    dict mapping each word to a float32 numpy array.
    """
    if file is None:
        file = 'crawl-300d-2M.vec'

    embeddings_index = {}

    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(file, encoding="utf8") as f:
        for count, line in enumerate(tqdm(f), start=1):
            if max_lines is not None and count == max_lines:
                break
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            if count == 1 and len(values) == 2 and values[0].isdigit() and values[1].isdigit():
                # '<n_words> <dim>' header of the .vec format. Storing it would
                # create a bogus one-dimensional vector keyed by the word count.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    return embeddings_index

In [14]:
# Load the default embedding file (resolved relative to the working directory set above).
embeddings_index = load_embed()

499872it [00:33, 15174.42it/s]

In [0]:
def build_vocab(texts):
    """Count whitespace-separated tokens over a pandas Series of strings.

    Returns a plain dict {token: occurrences}, in first-seen order.
    """
    vocab = {}
    for text in texts.values:
        for token in text.split():
            vocab[token] = vocab.get(token, 0) + 1
    return vocab

def check_coverage(vocab, embeddings_index):
    """Report how much of ``vocab`` is covered by ``embeddings_index``.

    Prints the share of distinct words found and the share of all token
    occurrences found (0.00% for an empty vocabulary instead of raising
    ZeroDivisionError).

    Parameters
    ----------
    vocab : dict {word: occurrence count}.
    embeddings_index : mapping whose keys are the known words.

    Returns
    -------
    list of (word, count) pairs for the words without an embedding, ordered
    by descending count.
    """
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word, freq in vocab.items():
        # Explicit membership test instead of a bare `except:` around the
        # lookup, so unrelated errors are no longer swallowed.
        if word in embeddings_index:
            known_words[word] = embeddings_index[word]
            nb_known_words += freq
        else:
            unknown_words[word] = freq
            nb_unknown_words += freq

    total_occurrences = nb_known_words + nb_unknown_words
    vocab_share = len(known_words) / len(vocab) if vocab else 0.0
    text_share = nb_known_words / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for {:.2%} of all text'.format(text_share))
    # Ascending stable sort followed by a reversal (kept as-is so that ties
    # come out in the same order as before).
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [16]:
# Baseline: embedding coverage of the raw, untreated captions.
vocab = build_vocab(df['Description'])
oov_fasttext = check_coverage(vocab, embeddings_index)

Found embeddings for 37.85% of vocab
Found embeddings for 62.93% of all text


In [0]:
def add_lower(embedding, vocab):
    """Give lowercase variants an embedding when only the cased word has one.

    For every word in ``vocab`` that is in ``embedding`` while its lowercase
    form is not, the lowercase form is added to ``embedding`` (in place) with
    the same vector. Prints how many entries were added; returns None.
    """
    count = 0
    for token in vocab:
        lowered = token.lower()
        if token not in embedding or lowered in embedding:
            continue
        embedding[lowered] = embedding[token]
        count += 1
    print(f"Added {count} words to embedding")
    
# Characters that clean_special_chars pads with spaces, one replace pass per
# character in this string. The backslash before '×' is now written as an
# explicit '\\' (same two characters as before) because '\×' is an invalid
# escape sequence and triggers a warning on recent Python versions.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

# Replacements applied before the padding step (typographic quotes and dashes
# to ASCII, symbols to words). The duplicated '“' entry was removed; the
# resulting dict is unchanged.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }

def unknown_punct(embed, punct):
    """List the characters of ``punct`` that are not keys of ``embed``.

    Returns one string in which every missing character is followed by a space.
    """
    return ''.join(mark + ' ' for mark in punct if mark not in embed)

def clean_special_chars(text, punct, mapping):
    """Normalise special characters in ``text`` and return the new string.

    Steps, in order:
      1. replace every key of ``mapping`` with its value;
      2. surround every character of ``punct`` with single spaces;
      3. handle a few leftovers (zero-width space, ellipsis, BOM, two Hindi words).
    """
    for source, target in mapping.items():
        text = text.replace(source, target)

    for mark in punct:
        text = text.replace(mark, ' ' + mark + ' ')

    # Other special characters that have to be dealt with last
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for source, target in specials.items():
        text = text.replace(source, target)

    return text
  
# Token -> replacement phrase used by clean_contractions. Mostly glued-together
# hashtags and account handles expanded into separate words.
# Keys are matched against whole space-separated tokens.
# NOTE(review): where this mapping is applied, clean_special_chars runs first
# and pads "'" with spaces, so the "tonight's" key may never match - verify.
contraction_mapping = {"tonight's": "tonight is",
                       "nbaplayoffs": "NBA playoffs",
                       "thisiswhyweplay": "this is why we play",
                       "kingjames": "lebron james",
                       "nbaallstar": "NBA all-star",
                       "nbaonabc": "NBA on ABC",
                       "nbaontnt": "NBA on TNT",
                       "nbapreseason": "NBA preseason",
                       "nbatv": "NBA TV",
                       "nbafinals": "NBA finals",
                       "nbabreakdown": "NBA breakdown",
                       "nbaonespn": "NBA on ESPN",
                       "russwest44": "russell westbrook",
                       "kyrieirving": "kyrie irving",
                       "nbakicks": "NBA kicks",
                       "nbaday": "NBA birthday",
                       "nbasummer": "NBA summer",
                       "tripledoublealert": "triple double alert",
                       "phantomcam": "phantom camera"}

def clean_contractions(text, mapping):
    """Expand tokens of ``text`` that are keys of ``mapping``.

    Apostrophe look-alikes are first normalised to "'". The text is then split
    on single spaces, each piece that is a key of ``mapping`` is replaced by
    its value, and the pieces are joined back with single spaces.
    """
    for quote_like in ("’", "‘", "´", "`"):
        text = text.replace(quote_like, "'")
    pieces = text.split(" ")
    expanded = []
    for piece in pieces:
        if piece in mapping:
            expanded.append(mapping[piece])
        else:
            expanded.append(piece)
    return ' '.join(expanded)

In [0]:
# Text treatment pipeline: lowercase -> normalise / pad special characters ->
# replace the stray "\x92" character with an apostrophe -> expand the
# hashtags / handles listed in contraction_mapping.
df['lowered_des'] = df['Description'].apply(lambda x: x.lower())
df['treated_des'] = df['lowered_des'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))
df['treated_des'] = df['treated_des'].str.replace("\x92", "'")
df['treated_des'] = df['treated_des'].apply(lambda x: clean_contractions(x, contraction_mapping))

In [19]:
# Re-check embedding coverage on the treated captions.
vocab_low = build_vocab(df['treated_des'])
oov_fasttext = check_coverage(vocab_low, embeddings_index)

Found embeddings for 68.96% of vocab
Found embeddings for 91.75% of all text


In [20]:
# The ten most frequent tokens that still have no embedding.
oov_fasttext[:10]

[('houstonrockets', 350),
 ('stephencurry30', 244),
 ('okcthunder', 242),
 ('jharden13', 238),
 ('giannis', 216),
 ('kiatipoff18', 204),
 ('an34', 197),
 ('bensimmons', 171),
 ('dwyanewade', 142),
 ('utahjazz', 119)]

In [0]:
def generate_doc_vectors(s, embeddings_index, n=300):
    """Build a unit-length document vector by summing word embeddings.

    Parameters
    ----------
    s : the document; converted with ``str`` and split on whitespace.
    embeddings_index : mapping word -> numpy vector; unknown words are skipped.
    n : length of the all-zero vector returned when nothing can be embedded.

    Returns
    -------
    The sum of the known word vectors divided by its Euclidean norm, or
    ``np.zeros(n)`` when no word is known or the vectors cancel out exactly.
    """
    vectors = [embeddings_index[w] for w in str(s).split() if w in embeddings_index]
    if not vectors:
        return np.zeros(n)
    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Dividing by a zero norm would fill the vector with NaN, which then
        # propagates into the downstream SVD features.
        return np.zeros(n)
    return v / norm

In [0]:
# One normalised document vector per treated caption, in row order of df.
fasttext_embeddings = [generate_doc_vectors(x, embeddings_index) for x in df['treated_des']]  

In [0]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from sklearn.decomposition import TruncatedSVD

# Reduce the document vectors to 100 components (fixed seed for reproducibility).
svd = TruncatedSVD(n_components=100, n_iter=10, random_state=42)
smaller_d_matrix = svd.fit_transform(fasttext_embeddings)

In [24]:
# Column names ft_0 .. ft_99, one per SVD component.
fts = ['ft_' + str(i) for i in range(100)]

# Wrap the reduced matrix in a DataFrame (default 0..n-1 index).
fasttext_ = pd.DataFrame(smaller_d_matrix,
                         columns=fts)

fasttext_.head()

Unnamed: 0,ft_0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,ft_11,ft_12,ft_13,ft_14,ft_15,ft_16,ft_17,ft_18,ft_19,ft_20,ft_21,ft_22,ft_23,ft_24,ft_25,ft_26,ft_27,ft_28,ft_29,ft_30,ft_31,ft_32,ft_33,ft_34,ft_35,ft_36,ft_37,ft_38,ft_39,...,ft_60,ft_61,ft_62,ft_63,ft_64,ft_65,ft_66,ft_67,ft_68,ft_69,ft_70,ft_71,ft_72,ft_73,ft_74,ft_75,ft_76,ft_77,ft_78,ft_79,ft_80,ft_81,ft_82,ft_83,ft_84,ft_85,ft_86,ft_87,ft_88,ft_89,ft_90,ft_91,ft_92,ft_93,ft_94,ft_95,ft_96,ft_97,ft_98,ft_99
0,0.900021,-0.123963,0.053059,0.164597,0.117203,-0.023572,-0.020944,-0.059384,-0.09682,-0.074651,-0.038005,-0.03916,0.026814,0.018533,0.002958,-0.032089,-0.027647,-0.090392,-0.033635,-0.059815,0.014775,-0.080856,0.015871,-0.053613,-0.054047,0.022779,0.022503,-0.021724,-0.035947,0.034773,-0.019899,0.024682,0.029204,-0.006517,0.006166,-0.021596,0.104502,0.02689,-0.043009,-0.00137,...,-0.000201,0.024806,-0.024466,-0.039729,0.006184,0.035928,0.003495,0.037478,-0.00444,0.033272,0.002643,-0.039286,-0.007366,-0.000835,0.008904,-0.009111,-0.012658,-0.012259,-0.028452,-0.018604,0.009019,0.033939,-0.001735,-0.010186,0.016099,-0.009234,-0.008947,-0.03048,0.00732,0.0161,-0.004636,0.01132,0.020995,-0.027499,-0.035389,0.004619,0.019185,-0.012531,0.020779,-0.030287
1,0.901574,-0.13057,-0.02862,0.069442,-0.0587,-0.098998,0.046605,-0.055721,0.021555,-0.06494,-0.012715,0.077795,-0.019317,-0.053505,0.079194,-0.001566,0.04767,0.011479,-0.013259,-0.045828,0.001469,-0.038324,-0.003603,0.021179,-0.047913,0.077984,-0.057694,-0.076931,0.013463,0.070883,-0.029806,-0.008155,0.040269,-0.055962,0.047325,-0.013427,-0.02112,0.03844,0.00608,0.091709,...,-0.001825,0.012631,0.058502,-0.036006,-0.024932,0.033911,0.002438,0.053357,-0.003141,0.039433,0.002303,0.050365,0.014067,0.020647,0.008485,-0.009602,-0.031008,-0.011572,-0.016673,-0.000787,-0.03489,-0.001635,-0.006621,-0.045308,0.00424,-0.027591,-0.041808,-0.016643,0.031465,0.021865,-0.031416,0.012918,0.006608,0.000178,4.4e-05,0.005649,0.026726,-0.002392,-0.024489,-0.035417
2,0.865843,-0.026643,-0.054495,0.18506,0.025547,-0.052126,0.066489,-0.003877,-0.022829,0.041067,-0.043236,0.08962,-0.011866,0.034888,-0.054367,-0.010193,0.087626,-0.034817,0.035166,-0.033228,0.057829,-0.063466,-0.010572,0.112815,-0.031203,0.001177,0.024159,-0.061581,0.004251,0.045807,-0.025716,0.111259,0.001508,0.03093,0.033374,-0.054933,-0.052522,-0.031132,-0.001429,0.035384,...,-0.000263,-0.034996,-0.064422,-0.005488,0.011835,-0.033743,-0.015327,-0.062884,-0.029306,-0.054025,0.032376,0.010844,-0.005957,0.030948,-0.020716,0.015364,0.053825,0.007366,0.013073,0.039079,0.015196,-0.010877,-0.014947,0.006728,0.011063,-0.045819,0.008576,-0.020559,0.010762,-0.010085,0.004775,0.019782,0.009446,-0.003095,0.049118,-0.017012,-0.006196,0.005205,0.004224,-0.009447
3,0.763294,-0.108123,-0.269621,-0.095819,-0.117773,-0.143957,-0.011994,0.081971,0.11804,0.057663,-0.008476,0.094144,-0.048065,-0.023622,0.024584,0.097156,0.154727,-0.06977,0.006442,0.070993,0.047506,-0.041391,-0.028561,-0.009832,0.009632,0.004026,-0.056553,-0.077328,0.05945,0.006378,-0.059607,-0.05123,-0.024429,-0.054509,0.006634,0.049873,-0.027097,0.040139,0.000373,0.012441,...,0.005941,-0.039928,0.063592,0.011597,-0.016868,-0.03234,-0.007043,0.026309,-0.000248,-0.023055,0.045919,0.006656,0.053607,0.003526,0.01035,0.005066,-0.001753,-0.049094,-0.013172,-0.029083,-0.030561,-0.002254,0.026955,-0.050523,0.025646,-0.026778,-0.034774,-0.018799,-0.021068,0.027082,-0.039338,0.030443,0.028456,-0.071875,0.000524,0.026327,0.020942,0.003146,-0.009725,0.013866
4,0.835943,-0.136959,-0.090244,0.009891,-0.070989,-0.150583,-0.025506,0.043901,0.107338,0.001554,0.035555,0.087576,-0.058806,-0.077682,0.025631,0.07688,0.129569,-0.035183,-0.043833,-0.013017,-0.057592,0.043694,0.032782,-0.04867,-0.019265,0.008816,0.083929,0.006374,0.029176,-0.006595,-0.074841,0.029261,0.024768,-0.044035,0.032219,0.001465,-0.090518,0.054782,0.051868,0.110184,...,-0.04222,-0.001468,0.064095,0.102091,-0.096326,-0.045921,-0.02425,0.067333,0.001851,0.028102,-0.00249,0.016624,-0.003959,0.006823,-0.020192,0.027579,0.040009,0.011168,0.023648,0.044,-0.01027,0.040175,0.003085,0.018496,-0.000647,-0.015452,-0.097684,-0.061246,0.00192,-0.010689,-0.01328,0.007484,-0.009709,-0.025656,0.019988,0.032603,0.01528,0.006205,0.013783,-0.006673


In [25]:
# Both frames must have the same number of rows before they are joined column-wise.
fasttext_.shape, df.shape

((8766, 100), (8766, 32))

In [0]:
# Give df a fresh 0..n-1 index so that the column-wise concat with fasttext_
# (which has a default index) lines up row by row.
df = df.reset_index(drop=True)

In [0]:
# Append the ft_* embedding features as new columns.
df = pd.concat([df, fasttext_], 
                axis=1)

## Text features
Extract the following features from each document's `Description`.
*   Number of #
*   Number of @
*   Number of punctuation marks
*   Number of uppercase letters
*   Number of tokens



In [0]:
def count_hashes(s):
    """Return how many '#' characters occur in the string ``s``."""
    return sum(1 for ch in s if ch == '#')
  
def count_at_signs(s):
    """Return how many '@' characters occur in the string ``s``."""
    return sum(1 for ch in s if ch == '@')
  
# spacy  
def count_punct(s):
    """Return the number of punctuation characters in the string ``s``.

    Every occurrence is counted, in line with ``count_hashes`` and
    ``count_at_signs``. The previous implementation added 1 per *distinct*
    punctuation character present, so 'Wow!!!' scored 1 instead of 3.
    '#' and '@' are deliberately not in the punctuation set because they have
    their own counters.
    """
    punct = '?!.,"$%\'()*+-/:;<=>[\\]^_`{|}~' + '“”’'
    return sum(s.count(p) for p in punct)
  
def count_tokens(s):
    """Return the number of whitespace-separated tokens in ``s``."""
    tokens = s.split()
    return len(tokens)
  
def count_upper(s):
    """Return the number of ASCII uppercase letters (A-Z) in ``s``."""
    return sum(1 for _ in re.finditer(r'[A-Z]', s))

# find scores in descriptions  
def find_scores(s):
    """Return the score-like 'N-M' substrings (1-3 digits each side) found in ``s``.

    Matches whose first character is '0' or '9' are dropped: that is how
    season spans show up ('2018-19' matches as '018-19', '1998-99' as
    '998-99'). The filter builds a new list; the previous version removed
    items from the list while iterating over it, which skipped the element
    after every removal and let consecutive season spans through.

    Returns an empty list when nothing matches.
    """
    candidates = re.findall(r'\d{1,3}-\d{1,3}', s)
    return [score for score in candidates if score[0] not in ('0', '9')]

In [0]:
def best_of_7_score(s):
    """Return the gap of the first score in ``s`` whose two sides differ by at most 3.

    ``s`` is a list of 'N-M' strings. Returns -999 when the list is empty or
    no score has a gap of 3 or less.
    """
    for entry in s:
        left, right = entry.split('-')
        gap = abs(int(left) - int(right))
        if gap <= 3:
            return gap
    return -999
  
def game_score(s):
    """Return the margin of the first score in ``s`` where both sides exceed 50.

    ``s`` is a list of 'N-M' strings. Returns -999 when the list is empty or
    no score has both sides above 50.
    """
    for entry in s:
        left, right = (int(part) for part in entry.split('-'))
        if left > 50 and right > 50:
            return abs(left - right)
    return -999

In [0]:
# df['Description'].isna().sum()

# df.head()

In [0]:
def text_features(df):
    """Add caption-derived feature columns to ``df`` in place (returns None).

    From 'Description': num_hashes, num_at_signs, num_punct, num_tokens,
    num_upper and scores (list of score strings). From 'scores': bo7 and
    game_score.
    """
    caption_features = (
        ('num_hashes', count_hashes),
        ('num_at_signs', count_at_signs),
        ('num_punct', count_punct),
        ('num_tokens', count_tokens),
        ('num_upper', count_upper),
        ('scores', find_scores),
    )
    for column, extractor in caption_features:
        df[column] = df['Description'].apply(extractor)

    # These two are derived from the list of scores found above.
    for column, extractor in (('bo7', best_of_7_score), ('game_score', game_score)):
        df[column] = df['scores'].apply(extractor)
    
# Adds the text feature columns to df in place.
text_features(df)

In [0]:
def clean_text(x):
    """Return ``str(x)`` with punctuation stripped.

    '/', '-' and "'" become spaces, '&' is padded with spaces, and the
    remaining punctuation characters are deleted.
    """
    x = str(x)
    for separator in "/-'":
        x = x.replace(separator, ' ')
    x = x.replace('&', ' & ')
    # Delete every remaining punctuation character in one pass.
    removal_table = str.maketrans('', '', '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’')
    return x.translate(removal_table)
  
# train['clean_des'] = train['Description'].apply(lambda x: clean_text(x))

In [0]:
# from spacy.lang.en import English
# import spacy

# !spacy download en_core_web_md

# # spacy.load('en_core_web_md') # not working

# nlp = en_core_web_md.load()

# from spacy.tokenizer import Tokenizer
# tokenizer = Tokenizer(nlp.vocab)

# from spacy.attrs import ORTH, LEMMA

# case = [{ORTH: "@"}]

# tokenizer.add_special_case("@", case)

# # Construction from class
# from spacy.pipeline import Sentencizer
# sentencizer = Sentencizer()

# nlp.add_pipe(sentencizer)
# nlp.add_pipe(tokenizer)

In [0]:
# ex = train.iloc[0]['clean_des']

# ex

In [0]:
# def extract_doc_features(text):
#     doc = nlp(text)
    
#     # original doc len
#     doc_len = len(doc)
    
#     # clean doc
#     clean_doc = clean_text(text)
    
#     # clean doc len
#     clean_doc_len = len(clean_doc.split())
    
#     print(clean_doc.split())

#     for i, token in enumerate(tokenizer(clean_doc)):
#         print(i, token)
    
#     for token in doc:
#         print(token.text, token.tag_, token.pos_, token.head, token.is_stop, token.ent_type_)

# extract_doc_features(ex)

In [38]:
# Force a garbage-collection pass; the value displayed is gc.collect()'s return value.
gc.collect()

258

## Ideas

*   Tokenize text to get NBA team's Instagram. Perform `OneHotEncoding` on names.
*   Tokenize text to get players' Instagram handles. Perform `OneHotEncoding` on names. Mask names with `[TEAM]_player`.
*   Time, i.e., before or after game, playoffs or regular.
*   Understand how `Engagements` is generated.
*   Closeness of score.
*   Train FastText
https://fasttext.cc/docs/en/unsupervised-tutorial.html on `@nba` captions.

In [0]:
def get_at_sign(s):
    """Return the handles mentioned in ``s`` ('@' followed by word characters), joined with '|'.

    The '@' itself is not included; an empty string is returned when there are none.
    """
    return '|'.join(match.group(1) for match in re.finditer(r"@(\w+)", s))

def get_hash(s):
    """Return the hashtags in ``s`` ('#' followed by word characters), joined with '|'.

    The '#' itself is not included; an empty string is returned when there are none.
    """
    return '|'.join(match.group(1) for match in re.finditer(r"#(\w+)", s))

In [40]:
%%time
# '|'-joined hashtags and account handles of each caption (empty string when none).
df['hashes'] = df['Description'].apply(get_hash)
df['at_signs'] = df['Description'].apply(get_at_sign)

CPU times: user 29.9 ms, sys: 0 ns, total: 29.9 ms
Wall time: 32.9 ms


For all the `hashes` and `at_signs`, keep only those that appear in more than 15 posts for `hashes` and in more than 55 posts for `at_signs`; everything else is treated as rare.

In [0]:
# One indicator column per distinct hashtag / handle; str.get_dummies splits
# each cell on '|' (its default separator).
hashes_mat = df['hashes'].str.get_dummies()
at_signs_mat = df['at_signs'].str.get_dummies()

In [42]:
# Peek at the hashtag indicator matrix.
hashes_mat.head()

Unnamed: 0,1,100,12,15,18HoopClass,19HoopClass,2,2019,21,22,23,24,2WayPlayer,3,34,4,4MrThunder,5,50,8,82,ADayWithGrayson,ADayWithMo,AIRMAX720,ATTSlamDunk,ATTvip,AirplaneMode,AllEyesNorth,AllForOne,AmericasTeamCamp,AmexNBA,AndreIngram,AssistsOfTheWeek,AssistsoftheWeek,BANG,BBQChickenAlert,BESTofNBA,BESTofNBA2017,BHM,BWBAfrica,...,WINorGoHome,WNBA,WNBA3Point,WNBAAllStar,WNBADraft,WNBAFinals,WNBAPlayoffs,WNBAPride,WadeCounty,WallWay,WarriorsAllAccess,WarriorsParade,WatchMeWork,WeBelieve,WeTheNorth,WeekofGreatness,WhateverItTakes,WhiteHot,WhyNot,WhyNotTour,WiredDifferent,WorldSmileDay,WorldTrickShotDay,Year16,fannypack,feathery,globalgame,kiatipoff18,ko8e24,nba2kleague,nbabreakdown,nbacanadaseries,nbapreseason,nbarooks,shammgod,thatSLOWgrind,themambamentality,thisiswhyweplay,wnbaallstar,wnbaplayoffs
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
# Peek at the handle indicator matrix.
at_signs_mat.head()

Unnamed: 0,1ingram4,1jordanbell,1ngram4,1tylerennis,1tyus,22wiggins,23,24baze,35_fitz,3dtv,3jmccollum,3tross1,44bojan,50mejri,7tlc,ATLHawks,ATT,AdidasHoops,AmericanExpress,BenSimmons,BlakeGriffin23,BreannaStewart30,BrooklynNets,Bucks,BudweiserUSA,CP3,Cavs,Celtics,DallasMavs,DallasWings,DamianLillard,DeAndre,DejounteMurray,DetroitPistons,DwightHoward,DwyaneWade,ESPN,FIBA,FS1,Fergie,...,unclejeffgreen,unclejg8,underarmour,unitedmasters,usabasketball,utahjazz,utahjazzgaming,utahjazzsl,vicoladipo,waiters3,waltdisneyworld,warriors,washmystics,washwizards,wayne_elli,wayneseldenjr,wendellcarterjr,wenyengabriel,wessywes23,wholeteamdot,whynotfoundation,willthethrillb5,willyhernangomez,winnieharlow,wisconsinherd,wnba,y0bull,yankees,ygtrece,youngamechanger,youngheirgordon,youtube,yutawatanabe12,zach_ballin30,zachcollins_33,zachlavine8,zazapachulia,zhaire_smith,zmane2,zo
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
# Keep hashtags whose indicator column sums to more than 15.
idx_keep = list(np.where(np.sum(hashes_mat.values, axis=0) > 15)[0])
hashes_keep = [hashes_mat.columns[i] for i in idx_keep]

# Keep handles whose indicator column sums to more than 55.
# (idx_keep is reused here; it no longer refers to the hashtag positions.)
idx_keep = list(np.where(np.sum(at_signs_mat.values, axis=0) > 55)[0])
at_signs_keep = [at_signs_mat.columns[i] for i in idx_keep]

len(hashes_keep), len(at_signs_keep)

(54, 57)

This approach reduces the number of `hashes` from 409 to 54 and the number of `at_signs` from 889 to 57. Next, I collapse the less frequent ones into a single rare-count column each (`hashes_rare` and `at_signs_rare`).

In [45]:
# Everything that did not pass the frequency threshold is treated as rare.
hashes_remove = [c for c in hashes_mat.columns if c not in hashes_keep]
at_signs_remove = [c for c in at_signs_mat.columns if c not in at_signs_keep]

len(hashes_remove), len(at_signs_remove)

(355, 832)

In [0]:
# Per post, the number of rare hashtags / rare handles it contains
# (row-wise sum over the rare indicator columns).
hashes_rare = hashes_mat[hashes_remove].sum(axis=1).values
at_signs_rare = at_signs_mat[at_signs_remove].sum(axis=1).values

In [0]:
# Take explicit copies of the kept indicator columns before adding the
# rare-count column. Assigning a new column into a bare column selection of
# another frame is chained assignment (SettingWithCopyWarning, which the
# global warning filter in this notebook would otherwise hide).
hashes = hashes_mat[hashes_keep].copy()
hashes['hashes_rare'] = hashes_rare

at_signs = at_signs_mat[at_signs_keep].copy()
at_signs['at_signs_rare'] = at_signs_rare

In [48]:
# Row counts must agree before the column-wise concat below.
hashes.shape, at_signs.shape, df.shape

((8766, 55), (8766, 58), (8766, 142))

In [0]:
# Append the hashtag and handle indicator columns to the feature frame.
# NOTE(review): columns are named after the raw tag text without a prefix, so a
# hashtag and a handle with the same text would yield duplicate column names - verify.
df = pd.concat([df, hashes, at_signs],
                axis=1)

In [50]:
# Final size of the feature frame.
df.shape

(8766, 255)

## Save the file

In [51]:
# Final look at the engineered features before saving.
df.head()

Unnamed: 0,Engagements,Followers at Posting,Created,Type,Description,date,time,tz,date_tsfmYear,date_tsfmMonth,date_tsfmWeek,date_tsfmDay,date_tsfmDayofweek,date_tsfmDayofyear,date_tsfmIs_month_end,date_tsfmIs_month_start,date_tsfmIs_quarter_end,date_tsfmIs_quarter_start,date_tsfmIs_year_end,date_tsfmIs_year_start,date_tsfmElapsed,time_split,hour,minute,Type_Album,Type_Photo,Type_Video,num_posts,date_lag_2,num_posts_lag_2,lowered_des,treated_des,ft_0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,...,jharden13,joelembiid,kevindurant,kingjames,kyle_lowry7,kyrieirving,laclippers,lakers,lukadoncic,miamiheat,nbaallstar,nbahistory,nbaonespn,nbaontnt,nbasummerleague,nbatv,nuggets,nyknicks,okcthunder,orlandomagic,pacers,pelicansnba,raptors,russwest44,sacramentokings,sixers,spidadmitchell,spurs,stephencurry30,suns,timberwolves,traeyoung,trailblazers,utahjazz,vicoladipo,warriors,washwizards,ygtrece,zo,at_signs_rare
0,502093.0,36984682,2019-05-21 23:30:51 EDT,Video,The @raptors bench trio of @sergeibaka @norman...,2019-05-21,23:30:51,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2330,23,30,0,0,1,19,2019-05-19,18.0,the @raptors bench trio of @sergeibaka @norman...,the @ raptors bench trio of @ sergeibaka @ ...,0.900021,-0.123963,0.053059,0.164597,0.117203,-0.023572,-0.020944,-0.059384,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
1,603380.0,36984682,2019-05-21 22:53:33 EDT,Video,@kyle_lowry7 pulls from deep for the @raptors ...,2019-05-21,22:53:33,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2253,22,50,0,0,1,19,2019-05-19,18.0,@kyle_lowry7 pulls from deep for the @raptors ...,@ kyle - lowry7 pulls from deep for the @ ...,0.901574,-0.13057,-0.02862,0.069442,-0.0587,-0.098998,0.046605,-0.055721,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,603380.0,36984682,2019-05-21 22:19:58 EDT,Video,@k_mid22 with some english on the @bucks dime!,2019-05-21,22:19:58,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2219,22,20,0,0,1,19,2019-05-19,18.0,@k_mid22 with some english on the @bucks dime!,@ k - mid22 with some english on the @ buc...,0.865843,-0.026643,-0.054495,0.18506,0.025547,-0.052126,0.066489,-0.003877,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,725100.0,36984682,2019-05-21 22:02:41 EDT,Video,Kawhi punches it home with the left on TNT!,2019-05-21,22:02:41,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2202,22,0,0,0,1,19,2019-05-19,18.0,kawhi punches it home with the left on tnt!,kawhi punches it home with the left on tnt !,0.763294,-0.108123,-0.269621,-0.095819,-0.117773,-0.143957,-0.011994,0.081971,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,661446.0,36984682,2019-05-21 20:47:49 EDT,Video,@giannis_an34 goes baseline early to rock the ...,2019-05-21,20:47:49,EDT,2019,5,21,21,1,141,False,False,False,False,False,False,1558396800,2047,20,50,0,0,1,19,2019-05-19,18.0,@giannis_an34 goes baseline early to rock the ...,@ giannis - an34 goes baseline early to roc...,0.835943,-0.136959,-0.090244,0.009891,-0.070989,-0.150583,-0.025506,0.043901,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [52]:
# Split the combined frame back by position: the first train_len rows are the
# training set, the rest the holdout set. This relies on the row order of df
# being unchanged since train and test were concatenated.
train = df[:train_len]
test = df[train_len:]

len(train), len(test)

(7766, 1000)

In [0]:
# Persist the engineered feature sets next to the raw data (index not written).
train.to_csv(f'{PATH}/dataset/train_155.csv', index=False)
test.to_csv(f'{PATH}/dataset/test_155.csv', index=False)

In [0]:
# codes = {
#     PHI	76ers
#     MIL	Bucks
#     CHI	Bulls
#     CLE	Cavaliers
#     BOS	Celtics
#     LAC	Clippers
#     MEM	Grizzlies
#     ATL	Hawks
#     MIA	Heat
#     CHA	Hornets
#     UTA	Jazz
#     SAC	Kings
#     NYK	Knicks
#     LAL	Lakers
#     ORL	Magic
#     DAL	Mavericks
#     BKN	Nets
#     DEN	Nuggets
#     IND	Pacers
#     NOP	Pelicans
#     DET	Pistons
#     TOR	Raptors
#     HOU	Rockets
#     SAS	Spurs
#     PHX	Suns
#     OKC	Thunder
#     MIN	Timberwolves
#     POR	Trailblazers
#     GSW	Warriors
#     WAS	Wizards
# }