In [1]:
cd ..

/Users/valerie/Desktop/DataScience/job-application/coding_assignments/ml/dbs_ds/stage1_sect_b


In [2]:
import pickle as pkl
import pandas as pd
import numpy as np
from src.utils.helpers import *
import sqlite3

In [3]:
# Reload the fitted preprocessing objects and the trained model from ./models.
# NOTE(review): unpickling executes code embedded in the file, so only load
# artifacts that were produced by this project.
def _load_pickle(path):
    """Unpickle a single artifact and close the file handle afterwards."""
    with open(path, 'rb') as fh:
        return pkl.load(fh)


vect_pca = _load_pickle("./models/vect_pca.pkl")
tfidf_pca = _load_pickle("./models/tfidf_pca.pkl")
scaler = _load_pickle("./models/scaler.pkl")
TFidf = _load_pickle("./models/TFidf.pkl")

# load saved objects from 0.1-eda
trained_vect_cols = _load_pickle("./models/vect_cols.pkl")
trained_tfidf_cols = _load_pickle("./models/tfidf_cols.pkl")
lgbm_cv_model_1 = _load_pickle("./models/lgbm_cv_model_1.pkl")
labelencoder = _load_pickle("./models/labelencoder.pkl")

In [4]:
# Read the held-out test set and display its (rows, columns).
df = pd.read_csv(filepath_or_buffer='./data/test.csv')
df.shape

(428, 157)

In [6]:
# Drop every row that contains NaN, +inf or -inf in any column (this filters,
# it does not merely check). `axis=1` is passed by keyword because the
# positional form was removed in pandas 2.0; `.copy()` makes df1 an independent
# frame so the column assignments in later cells do not write to a slice of df.
df1 = df[~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)].copy()
df1.shape

(428, 157)

In [7]:
# Preview the first two rows of the filtered frame.
df1.head(2)

Unnamed: 0,trackID,title,tags,loudness,tempo,time_signature,key,mode,duration,vect_1,...,vect_139,vect_140,vect_141,vect_142,vect_143,vect_144,vect_145,vect_146,vect_147,vect_148
0,6732,You Get What You Give,"i, the, you, to, and, a, me, it, not, in, is, ...",-5.672,113.941,4,2,1,300.82567,49.707499,...,0.000628,0.00066,0.00066,0.000667,0.000615,0.000582,0.000534,0.000476,0.21001,1.392651
1,5415,Greedee,"i, the, you, to, and, a, me, it, not, in, my, ...",-7.931,102.062,4,11,1,259.60444,46.829729,...,0.000414,0.000423,0.000428,0.000432,0.000416,0.000389,0.000359,0.000331,0.12408,0.604772


In [8]:
# Build the text feature fed to the TF-IDF vectorizer: the preprocessed title
# joined with the tags column.
title_and_tags_cols = ['tokenized_title', 'tags']
# `assign` returns a new frame, so nothing is written into a filtered slice of
# `df` (avoids SettingWithCopyWarning); missing values become empty strings.
df1 = df1.assign(
    tokenized_title=df1['title'].apply(utils_preprocess_text).fillna(''),
    tags=df1['tags'].fillna(''),
)
df1['title_and_tags'] = df1['tokenized_title'] + ', ' + df1['tags']

In [9]:
# The combined text column must be fully populated before vectorizing.
assert not df1['title_and_tags'].isnull().any()

In [10]:
# Vectorize the combined text with the already-fitted TF-IDF object
# (transform only - the vectorizer is not refitted here).
corpus = df1['title_and_tags'].astype(str)
dtm_TFidf = TFidf.transform(corpus)

In [11]:
# (documents, vocabulary terms) of the document-term matrix.
dtm_TFidf.shape

(428, 10268)

In [12]:
# convert document-term matrix into a df so that we can combine with original df
# add `d_` prefix as an identifier for easier filtering later 
# Row labels: first 30 characters of each document (for readability only; not
# guaranteed to be unique).
title_and_tags_label = [e[:30] + "..." for e in corpus]
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back for old installs.
if hasattr(TFidf, "get_feature_names_out"):
    tfidf_terms = TFidf.get_feature_names_out()
else:
    tfidf_terms = TFidf.get_feature_names()
dtm_TFidf_df = pd.DataFrame(
    dtm_TFidf.toarray(),
    index=title_and_tags_label,
    columns=["d_" + str(term) for term in tfidf_terms],
)


In [13]:
# quantize values
dtm_TFidf_df = dtm_TFidf_df.astype(np.float16)
dtm_TFidf_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 428 entries, you, get, what, you, give, i, ... to guitarra, mia, a, me, no, que,...
Columns: 10268 entries, d_aah to d_zwei
dtypes: float16(10268)
memory usage: 8.4+ MB


In [13]:
# NOTE(review): disabled cell. The output stored beneath it, (428, 876), does not match the
# current (428, 10268) matrix and appears to be left over from an earlier run.
# dtm_TFidf_df.shape

(428, 876)

In [14]:
# Shape of the TF-IDF frame after the dtype cast.
dtm_TFidf_df.shape

(428, 10268)

In [15]:
# Row count here should equal the TF-IDF frame's row count above.
df1.shape

(428, 159)

In [16]:
# Attach the track id positionally (as a raw array, ignoring the index labels)
# so the TF-IDF frame can be joined back on 'trackID'.
dtm_TFidf_df['trackID'] = df1['trackID'].to_numpy()

In [17]:
# Column names split by storage dtype: float64 features and int64 columns.
numeric_feat = df1.select_dtypes(include='float64').columns
int_feat = df1.select_dtypes(include='int64').columns

In [18]:
# Integer columns first, then the float columns.
numeric_feat_and_id = [*int_feat, *numeric_feat]
# Length check of the concatenation.
len(numeric_feat_and_id) == len(int_feat) + len(numeric_feat)

True

In [19]:
# Preview the numeric-only view that will be merged with the TF-IDF frame.
df1[numeric_feat_and_id].head(1)

Unnamed: 0,trackID,time_signature,key,mode,loudness,tempo,duration,vect_1,vect_2,vect_3,...,vect_139,vect_140,vect_141,vect_142,vect_143,vect_144,vect_145,vect_146,vect_147,vect_148
0,6732,4,2,1,-5.672,113.941,300.82567,49.707499,38.764963,33.624619,...,0.000628,0.00066,0.00066,0.000667,0.000615,0.000582,0.000534,0.000476,0.21001,1.392651


In [20]:
# Join the numeric columns with the TF-IDF columns on the shared track id.
df2 = pd.merge(df1[numeric_feat_and_id], dtm_TFidf_df, on='trackID')
# The join must neither drop nor duplicate rows.
assert len(df1) == len(df2)

## Process data for modeling

In [21]:
# Group the merged frame's columns by their name prefix.
test_vect_cols = list(filter(lambda name: name.startswith('vect'), df2.columns))
test_tfidf_cols = list(filter(lambda name: name.startswith('d_'), df2.columns))

In [22]:
# Preview the merged frame (numeric columns followed by the d_* TF-IDF columns).
df2.head(1)

Unnamed: 0,trackID,time_signature,key,mode,loudness,tempo,duration,vect_1,vect_2,vect_3,...,d_youth,d_yuh,d_zeit,d_zero,d_zone,d_zoo,d_zum,d_zur,d_zuruck,d_zwei
0,6732,4,2,1,-5.672,113.941,300.82567,49.707499,38.764963,33.624619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Columns the training pipeline produced that are missing from the test frame.
# Membership is tested against sets: with ~10k TF-IDF names, list lookups
# inside the comprehension would be quadratic.
_test_tfidf_col_set = set(test_tfidf_cols)
_test_vect_col_set = set(test_vect_cols)
absent_tfidf_cols = [col for col in trained_tfidf_cols if col not in _test_tfidf_col_set]
absent_vect_cols = [col for col in trained_vect_cols if col not in _test_vect_col_set]

In [25]:
# Add any TF-IDF column seen in training but missing here, filled with zeros.
# Looping over an empty list is a no-op, so no length guard is needed.
for col in absent_tfidf_cols:
    df2[col] = 0

In [26]:
# Same zero-fill for any vect_* column seen in training but missing here.
for col in absent_vect_cols:
    df2[col] = 0

In [27]:
# Shape after adding any missing training columns.
df2.shape

(428, 10423)

In [28]:
# Column order handed to the scaler: TF-IDF columns first, then vect columns.
# NOTE(review): assumes both pickled objects are plain lists (so `+` concatenates) and that
# this order matches the order the scaler was fitted with - confirm against the training notebook.
features_to_scale = trained_tfidf_cols + trained_vect_cols

In [29]:
# apply feature scaling
X_scaled_cols_df = pd.DataFrame(scaler.transform(df2[features_to_scale].to_numpy().astype(np.float64)), columns=[str(i) + "_scaled" for i in features_to_scale])
vect_scaled_cols = [str(i) + "_scaled" for i in trained_vect_cols]
tfidf_scaled_cols = [str(i) + "_scaled" for i in trained_tfidf_cols]

# apply pca to vect and tfidf cols
X_test_vect_pca = vect_pca.transform(X_scaled_cols_df[vect_scaled_cols])
X_test_tfidf_pca = tfidf_pca.transform(X_scaled_cols_df[tfidf_scaled_cols])

vect_pca_n_components = 10
tfidf_pca_n_components = 500

# convert np array to df so as to combine with master_df with non scaled cols
X_test_vect_pca_df = pd.DataFrame(X_test_vect_pca, columns=["vect_pca_" + str(i) for i in range(vect_pca_n_components)])
X_test_tfidf_pca_df = pd.DataFrame(X_test_tfidf_pca, columns=["tfidf_pca_" + str(i) for i in range(tfidf_pca_n_components)])


In [30]:
# Every column except the identifier. The original compared against
# ('trackID'), which is a plain string rather than a 1-tuple, so `not in` was a
# substring test that would also drop columns named e.g. 'ID' or 'track'.
model_features = [col for col in df2.columns if col != 'trackID']
# Columns that bypass scaling/PCA: anything that is not a vect_* or d_* column
# in either the training or the test column lists. A set keeps each lookup O(1).
_scaled_or_text_cols = (
    set(trained_vect_cols)
    | set(trained_tfidf_cols)
    | set(test_vect_cols)
    | set(test_tfidf_cols)
)
non_scaled_cols = [col for col in model_features if col not in _scaled_or_text_cols]
non_scaled_cols

['time_signature', 'key', 'mode', 'loudness', 'tempo', 'duration']

In [31]:
# # combine our pca df with original df that i didn't scale
X_test = pd.concat([df2[non_scaled_cols], X_test_vect_pca_df, X_test_tfidf_pca_df], axis=1)
# check if same number of rows after join
len(X_test) == len(df2)

True

In [32]:
# Fail if the column-wise concat changed the row count, then print the final
# (rows, features) shape passed to the model.
assert len(X_test) == len(df2)
print(X_test.shape)

(428, 516)


In [33]:
# Predict encoded classes, then map them back to genre names with the label encoder.
# NOTE(review): `.round(1)` keeps one decimal place and does not cast to int. Presumably
# predict() already returns integer class ids, making the round a no-op - confirm, since
# inverse_transform needs values that match the encoder's classes.
y_gbm_test_preds = lgbm_cv_model_1.predict(X_test).round(1)
y_gbm_test_labels = labelencoder.inverse_transform(y_gbm_test_preds)

In [34]:
# One predicted label per input row.
assert len(X_test) == len(y_gbm_test_labels)

In [35]:
# Pair each track id and title with its predicted genre.
results = pd.DataFrame(
    {
        "trackID": df1['trackID'],
        "title": df1['title'],
        "predictions": y_gbm_test_labels,
    }
)
results.head(2)

Unnamed: 0,trackID,title,predictions
0,6732,You Get What You Give,metal
1,5415,Greedee,metal


In [36]:
# Persist the predictions as a CSV file ...
results.to_csv('./data/output/predictions.csv', index=False)

# ... and as table 'titles_and_genre' in the SQLite database.
con = sqlite3.connect("./music.db")
try:
    # if_exists='replace' keeps the cell re-runnable: the default ('fail')
    # raises ValueError as soon as the table exists from a previous run.
    results.to_sql('titles_and_genre', con, index=False, if_exists='replace')
    con.commit()
finally:
    # Always release the database handle, even if the write fails.
    con.close()