In [None]:
cd ..

In [2]:
import numpy as np
import pandas as pd
import time
import pickle as pkl
import itertools
from collections import Counter

from src.utils.helpers import *
from src.utils.eda_helpers import *
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV,StratifiedKFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, log_loss, f1_score, precision_recall_curve, make_scorer

import lightgbm as lgb

import warnings 
warnings.filterwarnings("ignore")  # To ignore any warnings 

seed = 123
pd.options.display.max_rows = None 

In [None]:
# load saved objects from 0.1-eda
vect_cols = pkl.load(open("./models/vect_cols.pkl",'rb'))
tfidf_cols = pkl.load(open("./models/tfidf_cols.pkl",'rb'))
labelencoder = pkl.load(open("./models/labelencoder.pkl",'rb'))

labels_df = pd.read_csv('./data/output/labels_df.csv')

In [3]:
features_df_raw = pd.read_csv('./data/features.csv')
features_df = features_df_raw.copy()
labels_df = labels_df_raw.copy()

In [4]:
features_df.head(2)

Unnamed: 0,trackID,title,tags,loudness,tempo,time_signature,key,mode,duration,vect_1,...,vect_139,vect_140,vect_141,vect_142,vect_143,vect_144,vect_145,vect_146,vect_147,vect_148
0,6654,Beside the Yellow Line,"i, the, to, and, a, me, it, not, in, my, is, o...",-8.539,104.341,3.0,7.0,1.0,298.73587,44.462048,...,0.000308,0.000302,0.000302,0.000315,0.000297,0.000305,0.000266,0.000225,0.130826,1.071914
1,5883,Ooh Na Na,"i, you, to, and, a, me, it, not, in, my, is, y...",-4.326,141.969,3.0,6.0,0.0,236.09424,46.069761,...,0.001751,0.001855,0.00192,0.00195,0.001937,0.001912,0.001836,0.00174,0.148765,0.882304


In [5]:
features_df.isnull().sum()/len(features_df)*100
# some missing values. will impute later

trackID           0.000000
title             0.086122
tags              0.147638
loudness          0.123031
tempo             0.135335
time_signature    0.110728
key               0.184547
mode              0.110728
duration          0.110728
vect_1            0.123031
vect_2            0.098425
vect_3            0.135335
vect_4            0.147638
vect_5            0.110728
vect_6            0.147638
vect_7            0.123031
vect_8            0.098425
vect_9            0.123031
vect_10           0.123031
vect_11           0.098425
vect_12           0.159941
vect_13           0.000000
vect_14           0.000000
vect_15           0.000000
vect_16           0.000000
vect_17           0.000000
vect_18           0.000000
vect_19           0.000000
vect_20           0.000000
vect_21           0.000000
vect_22           0.000000
vect_23           0.000000
vect_24           0.000000
vect_25           0.000000
vect_26           0.000000
vect_27           0.000000
vect_28           0.000000
v

In [6]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Columns: 157 entries, trackID to vect_148
dtypes: float64(154), int64(1), object(2)
memory usage: 9.7+ MB


In [7]:
features_df.select_dtypes(include = ['float64']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Columns: 154 entries, loudness to vect_148
dtypes: float64(154)
memory usage: 9.5 MB


In [8]:
features_df.select_dtypes(include = ['int64']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   trackID  8128 non-null   int64
dtypes: int64(1)
memory usage: 63.6 KB


In [9]:
numeric_feat = features_df.select_dtypes(include = ['float64']).columns
int_feat = features_df.select_dtypes(include = ['int64']).columns

In [10]:
# convert data type of numeric columns in features_df to float16. quantization 
# convert id col to int16 instead of int8 to preserve data integrity
features_df[numeric_feat] = features_df[numeric_feat].astype(np.float16)
features_df[int_feat] = features_df[int_feat].astype(np.int16)

In [11]:
features_df['trackID'].values == features_df_raw['trackID'].values

array([ True,  True,  True, ...,  True,  True,  True])

In [12]:
all(True for element in features_df['trackID'].values == features_df_raw['trackID'].values)

True

In [13]:
features_df.loc[features_df['title'].isnull(), 'tags']

1960    i, the, you, to, and, a, me, it, not, in, my, ...
6000    i, the, you, to, and, a, me, it, not, in, is, ...
6624    i, the, you, to, and, a, me, it, not, in, is, ...
6895    i, the, you, to, and, a, it, not, in, is, your...
6957    i, the, you, to, and, a, me, it, not, in, my, ...
7774    i, the, you, to, and, a, it, not, in, my, of, ...
7820    i, the, you, to, and, a, it, in, my, is, of, y...
Name: tags, dtype: object

In [14]:
features_df.loc[features_df['tags'].isnull(), 'title']

752                             How Many Times
1348                                Thou Swell
1770                                      Fame
2594          Feeding The Mouth That Bites You
2792                 She Wakes When She Dreams
3872    Hollywood Liar (2004 Digital Remaster)
4955            Haven't You Heard (LP Version)
5389                            Morbid reality
6704                              Celtic Tiger
6761                             I Told You So
6915                       Black Is The Colour
8107                                Strip Song
Name: title, dtype: object

to investigate whether i, the, you, to, and, a, me, are common words in `tags`. 

In [17]:
# fill empty tags with `unk`
features_df.loc[features_df['tags'].isnull(), 'tags'] = 'unk'

In [18]:
features_df['tags'].isnull().sum()

0

In [19]:
Counter(" ".join(features_df["tags"]).split()).most_common(10)

[('the,', 6901),
 ('to,', 6546),
 ('a,', 6521),
 ('and,', 6424),
 ('i,', 6196),
 ('you,', 5949),
 ('in,', 5734),
 ('is,', 5488),
 ('it,', 5328),
 ('me,', 5267)]

In [66]:
n = 10
n_freq_words_in_tags = [word for word in Counter(" ".join(features_df["tags"]).split()).most_common(n)]
n_freq_words_in_tags

[('the,', 6901),
 ('to,', 6546),
 ('a,', 6521),
 ('and,', 6424),
 ('i,', 6196),
 ('you,', 5949),
 ('in,', 5734),
 ('is,', 5488),
 ('it,', 5328),
 ('me,', 5267)]

In [25]:
# find % that each word in freq words appear in data
for word in n_freq_words:
    print(word, len(features_df[features_df['tags'].str.contains(word)])/len(features_df) * 100)

the, 84.90403543307087
to, 85.17470472440945
a, 89.64074803149606
and, 82.08661417322834
i, 95.3371062992126
you, 73.19143700787401
in, 83.53838582677166
is, 78.56791338582677
it, 73.74507874015748
me, 85.8267716535433


In [None]:
# treat words that appear > 80% of the time as stopwords
# i create my own list of stopwords so as to avoid removing words that are part of nltk's stopwords 
# but are considered 'impt' in this use case

In [71]:
stopwords_list = []
for word in n_freq_words_in_tags:
    word = word[0].strip(',')
    if len(features_df[features_df['tags'].str.contains(word)])/len(features_df) * 100 > 80:
        stopwords_list.append(word)
stopwords_list

['the', 'to', 'a', 'and', 'i', 'in', 'is', 'it', 'me']

In [35]:
features_df['tokenized_title'] = features_df['title'].apply(lambda x:utils_preprocess_text(x))


In [36]:
# fill empty title with `unk`
features_df.loc[features_df['tokenized_title'].isnull(), 'tokenized_title'] = 'unk'

In [37]:
n_freq_words_in_title = [word[0] for word in Counter(" ".join(features_df["tokenized_title"]).split()).most_common(n)]
n_freq_words_in_title

['the,', 'version', 'of,', 'lp,', 'album,', 'a,', 'i,', 'to,', 'you,', 'in,']

In [39]:
# find % that each word in freq words appear in data
for word in n_freq_words_in_title:
    print(word, len(features_df[features_df['tokenized_title'].str.contains(word)])/len(features_df) * 100)
# since these words don't appear frequently, won't remove them from title

the, 13.324311023622048
version 10.593011811023622
of, 5.413385826771654
lp, 5.241141732283464
album, 4.9827755905511815
a, 9.104330708661418
i, 5.130413385826771
to, 4.416830708661417
you, 3.5063976377952755
in, 5.3641732283464565


since there are missing values for `title` and `tags`, but there are values for `titles` when `tags` are missing and vice versa, i'll concatenate `title` and `tags` as 1 attribute `title_and_tags`

In [72]:
features_df['tags_clean'] = features_df['tags'].apply(lambda x:utils_preprocess_text(x, lst_stopwords=stopwords_list))



In [73]:
features_df[['tags', 'tags_clean']].sample(10, random_state = seed)

Unnamed: 0,tags,tags_clean
186,"i, the, you, to, a, me, it, not, my, is, of, y...","you, not, my, of, your, that, do, on, are, we,..."
1435,"i, the, to, and, a, me, not, in, my, is, of, t...","not, my, of, that, on, am, will, for, be, so, ..."
4691,"i, you, and, a, me, it, in, my, your, on, are,...","you, my, your, on, are, will, for, be, love, s..."
4366,"i, the, you, to, and, a, in, my, of, your, tha...","you, my, of, your, that, do, on, are, we, am, ..."
2762,"i, the, you, to, and, a, me, it, not, is, of, ...","you, not, of, your, that, do, on, are, we, am,..."
1969,"i, the, you, to, and, a, it, not, in, is, of, ...","you, not, of, that, do, are, we, all, be, have..."
4426,"i, the, you, and, a, me, not, that, do, are, a...","you, not, that, do, are, am, will, be, what, t..."
2134,"i, the, you, to, and, a, me, it, not, in, my, ...","you, not, my, of, do, on, are, we, all, for, n..."
2367,"i, the, you, to, and, a, me, not, your, do, on...","you, not, your, do, on, am, will, all, for, no..."
5625,"i, the, you, to, and, of, do, on, are, will, b...","you, of, do, on, are, will, but, what, just, o..."


In [74]:
features_df['title_and_tags'] = features_df['tokenized_title'] + ', ' + features_df['tags_clean']
# check for nulls
features_df['title_and_tags'].isnull().sum()

0

In [75]:
corpus = features_df['title_and_tags'].astype(str)
TFidf = TfidfVectorizer(strip_accents = 'unicode', 
#                      stop_words = 'english',
                     lowercase = True,
                     analyzer = 'word',
                     token_pattern = r'\b[a-zA-Z]{3,}\b',
                     max_df = 0.7, 
                     min_df = 10, 
                     ngram_range=(1,2)) 
dtm_TFidf = TFidf.fit_transform(corpus)

In [76]:
dtm_TFidf.shape

(8128, 10250)

In [77]:
# convert document-term matrix into a df so that we can combine with original df
# add `d_` prefix as an identifier for easier filtering later 
title_and_tags_label = [e[:30]+"..." for e in corpus]
dtm_TFidf_df = pd.DataFrame(dtm_TFidf.toarray(), index=title_and_tags_label, columns=["d_" + str(i) for i in TFidf.get_feature_names()])


In [78]:
# quantize values
dtm_TFidf_df = dtm_TFidf_df.astype(np.float16)
dtm_TFidf_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8128 entries, beside, the, yellow, line, not... to whenever, youre, ready, you, n...
Columns: 10250 entries, d_aah to d_zwei
dtypes: float16(10250)
memory usage: 159.0+ MB


In [79]:
numeric_feat_and_id = list(int_feat) + list(numeric_feat)
len(numeric_feat_and_id) == len(list(int_feat)) + len(list(numeric_feat))

True

In [81]:
features_df[numeric_feat_and_id].shape

(8128, 155)

In [83]:
# combine dtm_TFidf_df with original df
dtm_TFidf_df['trackID'] = features_df['trackID'].values
features_df1 = features_df[numeric_feat_and_id].merge(dtm_TFidf_df, on = 'trackID')
features_df1.head(2)

Unnamed: 0,trackID,loudness,tempo,time_signature,key,mode,duration,vect_1,vect_2,vect_3,...,d_youth,d_yuh,d_zeit,d_zero,d_zone,d_zoo,d_zum,d_zur,d_zuruck,d_zwei
0,6654,-8.539062,104.3125,3.0,7.0,1.0,298.75,44.46875,-13.5,26.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5883,-4.324219,142.0,3.0,6.0,0.0,236.125,46.0625,16.984375,-1.875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
features_df1.shape

(8128, 10405)

In [99]:
vect_cols = [col for col in features_df1.columns if col.startswith('vect')]
len(vect_cols)

148

In [100]:
tfidf_cols = [col for col in features_df1.columns if col.startswith('d_')]
len(tfidf_cols)

10250