In [34]:
import re
import torch
import transformers
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from transformers import BertModel, BertForMaskedLM
from transformers import BertTokenizer
from tqdm import notebook

from numpy import asarray
from numpy import savetxt

In [2]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [3]:
meta = pd.read_json('D:/programs/projects/yandex_music/meta.json', lines= True, convert_dates=['dttm'])
meta.info()
display(meta.sample(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71769 entries, 0 to 71768
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   track_id  71768 non-null  object        
 1   dttm      71768 non-null  datetime64[ns]
 2   title     71768 non-null  object        
 3   language  21969 non-null  object        
 4   isrc      71455 non-null  object        
 5   genres    71768 non-null  object        
 6   duration  71768 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 3.8+ MB


Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration
39871,e0e268e5ef7845eca17676d2b4366e3c,2021-11-14 23:40:14,Oi Sumida,,QZNJY2171986,"[FOLK, LATINFOLK]",188360.0
38674,226ce12e2514be4e2617e12f3f0eae91,2021-11-01 19:54:13,Lifted Veil,,QZPJ32116924,"[ALTERNATIVE, CLASSICAL, CLASSICALMUSIC]",138190.0
27011,38237508ef2da89e6c1fad7fa33d5a3a,2021-07-06 16:38:30,Zaplakace tvoji stari,,ATAJ12100475,[FOLK],214410.0
32788,4494e62aef568f952a9cb06295be0c26,2021-09-04 00:05:18,Maluquito Pero Sabroso,,QZNJV2194969,"[FOLK, LATINFOLK]",279250.0
19206,705df018633eaeececf4bab1d3134e9d,2020-04-22 10:53:55,Синие реки неба,,RUA1H2079691,"[NEWAGE, ELECTRONICS]",354440.0


In [4]:
meta['isrc'] = meta['isrc'].str.replace("-","")
meta.drop_duplicates(subset = 'isrc', ignore_index = True)
meta = meta.dropna(subset=['isrc']).reset_index(drop=True)
meta['isrc_year'] = meta['isrc'].str[5:7]
meta = meta.astype({'isrc_year':'int'})
for i in range(len(meta.index)):
    if meta.loc[i,'isrc_year']>23:
        meta.loc[i,'isrc_year'] = meta.loc[i,'isrc_year']+1900
    else:
        meta.loc[i,'isrc_year'] = meta.loc[i,'isrc_year']+2000
print(meta['isrc_year'].unique())

[2021 2018 2020 2022 2019 1999 2001 2000 2017 2016 2014 2012 2008 2005
 2015 2009 1982 1930 1933 1977 1936 1935 1950 1962 2004 1970 1966 2006
 2010 1996 1995 2023 2007 2011 2003 2013 2002 1924 1981 1973 1960 1971
 1965 1938 1985 1998 1963 1990 1994 1987 1980 1978 1984 1988 1976 1974
 1979 1992 1989 1991 1986 1997 1983 1951 1993 1925 1942 1926 1969 1968
 1967 1975 1958 1954 1957 1944 1964]


In [5]:
#load covers
covers = pd.read_json('D:/programs/projects/yandex_music/covers.json', lines= True)
covers.info()
display(covers.sample(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71597 entries, 0 to 71596
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   original_track_id  4821 non-null   object
 1   track_id           71597 non-null  object
 2   track_remake_type  71597 non-null  object
dtypes: object(3)
memory usage: 1.6+ MB


Unnamed: 0,original_track_id,track_id,track_remake_type
33459,,5dcc0f689345bbd21ac362048efc8c2e,COVER
28132,,4f2df62727dac46964ee1ac7b5d73202,COVER
45365,,20e1c7e496f5847ad272fed13bd96fee,COVER
32639,,a4f822c05a7aeb63eb4fa71d89992162,COVER
20533,,ff1aab2df99403622618b79cb53de4e8,COVER


In [6]:
#load lyrics
lyrics = pd.read_json('D:/programs/projects/yandex_music/lyrics.json', lines= True)
lyrics.info()
display(lyrics.sample(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11414 entries, 0 to 11413
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   lyricId   11414 non-null  object
 1   text      11414 non-null  object
 2   track_id  11414 non-null  object
dtypes: object(3)
memory usage: 267.6+ KB


Unnamed: 0,lyricId,text,track_id
8533,94ac2378097294fa5848a5765b00af92,I usually love sleeping all alone\nThis time a...,555491769c20ade34ea5ad1ebb64ad13
967,a413b6bb7149590bf14fe3989b59d93a,The world goes on fire and no one can save me ...,48c92168c92e23bb669b63af07109621
7013,db43fa3c9d6c9b1bdfe70d01f5f711a7,Let's go\nSteve walks warily down the street\n...,a0594d86cbae168e895929cd7ad81fcc
4197,69976f1d48e8cecd9fed70db811ac5fd,Boys seem to like the girls\nWho laugh at anyt...,645dcb5a07d803954b6f76305400dfdb
408,a7ce73123cb9e54d066ed28e0266f1df,И снова в одинокого я сижу на кухне\nТут так т...,3fe278e5d42ae39a1704e0a54f32b55c


In [7]:
lyrics.duplicated('track_id').sum()

1137

In [8]:
lyrics.drop_duplicates(subset=['track_id'], inplace=True)

In [9]:
lyrics.duplicated('track_id').sum()

0

In [10]:
df = pd.merge(meta, covers, how = 'outer', on="track_id").dropna(subset=['isrc'])
df =  pd.merge(df, lyrics, how = 'outer', on="track_id").dropna(subset=['track_remake_type'])
df.reset_index(drop=True, inplace=True)
df.info()
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71287 entries, 0 to 71286
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   track_id           71287 non-null  object        
 1   dttm               71287 non-null  datetime64[ns]
 2   title              71287 non-null  object        
 3   language           21821 non-null  object        
 4   isrc               71287 non-null  object        
 5   genres             71287 non-null  object        
 6   duration           71287 non-null  float64       
 7   isrc_year          71287 non-null  float64       
 8   original_track_id  4729 non-null   object        
 9   track_remake_type  71287 non-null  object        
 10  lyricId            10063 non-null  object        
 11  text               10063 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 6.5+ MB


Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration,isrc_year,original_track_id,track_remake_type,lyricId,text
14740,10db760680100d5c7aa06ed4f8e25a50,2022-08-13 01:24:58,Buồn Của Anh,VI,VNA0M1905679,[POP],274760.0,2019.0,,COVER,,
68330,e6ba694bfb9bd10ff7270e9e993c6caa,2019-02-14 21:00:00,Can't Buy Me Love,EN,AURH01900022,[POP],203390.0,2019.0,,COVER,,
64814,1acb7b94b9dd42d0bd7cc215f467f151,2021-04-02 16:24:04,Amor e Ilusao,,QZGLS2164190,"[FOLK, LATINFOLK]",162060.0,2021.0,,COVER,,
30631,a495d886634f8e44ce045fa77046924c,2021-08-17 18:26:07,OJITOS HECHIZEROS saxocumbia,,QZDFP1932823,"[FOLK, LATINFOLK]",235440.0,2019.0,,COVER,,
67647,1906fc79a6062e2224ef72f7d8f61f67,2019-01-16 11:41:43,All of Me,,PLS921748777,[],226640.0,2017.0,,COVER,,


In [11]:
df.track_remake_type.value_counts()

track_remake_type
COVER       67083
ORIGINAL     4204
Name: count, dtype: int64

In [12]:
original1 = df[df['track_remake_type'] == 'ORIGINAL']['title'].sample()
original1

40688    Царица
Name: title, dtype: object

In [13]:
print(df['title'].astype("string"))

0              Happy New Year
1                  Bad Habits
2        Por Esa Loca Vanidad
3                Mil Lagrimas
4         Sexo Humo y Alcohol
                 ...         
71282               Chanceuse
71283      Quiero Que Vuelvas
71284                Vacation
71285         Dance Hall Days
71286                      17
Name: title, Length: 71287, dtype: string


In [14]:
display(df[df['title'].str.contains('All Night', regex=False)])

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration,isrc_year,original_track_id,track_remake_type,lyricId,text
2617,767100de5486f2af5499c1b10cf5170a,2022-01-20 21:00:00,Dance All Night,EN,ARF412200027,[ELECTRONICS],166090.0,2022.0,,COVER,,
2756,b3f3b7f554fd0966d883ba6dbe973419,2022-01-05 01:05:32,I Drove All Night,,US7VG2209437,"[FOLK, LATINFOLK]",247290.0,2022.0,,COVER,,
4019,855fdae8b7d3cd9187306d9d4781b58c,2022-01-22 00:14:26,You Shook Me All Night Long,,US7VG2299560,"[ROCK, ALLROCK]",211530.0,2022.0,,COVER,,
6344,460ebfb67e17a45919fc4f7c6a6b96de,2022-02-23 21:42:50,You Shook Me All Night Long,,QZGLM2254909,"[ROCK, ALLROCK]",211500.0,2022.0,,COVER,,
8025,30b7c445b856fcf1d2345ba2109524ac,2023-05-09 21:00:00,All Night,,UKEX32207619,[ELECTRONICS],138860.0,2022.0,30b7c445b856fcf1d2345ba2109524ac,ORIGINAL,,
10302,9e2c4398b68338facb97d504e3c0a734,2022-05-26 21:00:00,All Night Long,,RUA1D2237335,[DANCE],148030.0,2022.0,9e2c4398b68338facb97d504e3c0a734,ORIGINAL,,
18881,fee7a632b21c6e8f404ccf7bd80ffdf7,2021-12-20 21:00:00,Up All Night,EN,ZAC012000036,[POP],154000.0,2020.0,,COVER,,
21465,d5e73a91b22b6473bef6464611330b22,2021-05-20 21:00:00,Do It All Night,EN,DEHK92100181,[DANCE],137800.0,2021.0,,COVER,df1d1491c90b776a6a022050c36a00e2,"You make me burning, yes, I am burning away\nI..."
21688,400cf4254b009ea34a45ed185f5f0e11,2021-05-03 19:50:38,All Night Long (All Night),EN,BR03D2101021,[FOLK],261620.0,2021.0,,COVER,,
22055,6fb20373c0c83183cbca2140ecb1b1ca,2021-05-06 22:56:41,You Shook Me All Night Long,,AUXN22151495,[POP],200000.0,2021.0,,COVER,,


In [15]:
# Сначала создаем копию DataFrame, чтобы избежать изменения исходных данных
data = df.copy()
data.reset_index(drop=True, inplace=True)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71287 entries, 0 to 71286
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   track_id           71287 non-null  object        
 1   dttm               71287 non-null  datetime64[ns]
 2   title              71287 non-null  object        
 3   language           21821 non-null  object        
 4   isrc               71287 non-null  object        
 5   genres             71287 non-null  object        
 6   duration           71287 non-null  float64       
 7   isrc_year          71287 non-null  float64       
 8   original_track_id  4729 non-null   object        
 9   track_remake_type  71287 non-null  object        
 10  lyricId            10063 non-null  object        
 11  text               10063 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 6.5+ MB
None


In [16]:
# Приводим все названия треков к нижнему регистру
data['title'] = data['title'].str.lower()

# Удаляем все знаки препинания и символы
data['title'] = data['title'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [17]:
# Отбираем группы треков, где 'ORIGINAL' встречается больше 1 раза
groups_orig = data[data['track_remake_type'] == 'ORIGINAL'].groupby('title').filter(lambda x: len(x) > 1)

In [18]:
# Для каждой группы отмечаем самый ранний трек как 'ORIGINAL', а остальные помечаем как 'COVER'
for name, group in groups_orig.groupby('title'):
    idx_min = group['isrc_year'].min()
    data.loc[(data['title'] == name) & (data['isrc_year'] != idx_min), 'track_remake_type'] = 'COVER'
    data.loc[(data['title'] == name) & (data['isrc_year'] == idx_min), 'track_remake_type'] = 'ORIGINAL'

In [19]:
# Снова проверим группы треков, где 'ORIGINAL' встречается больше 1 раза
groups_orig = data[data['track_remake_type'] == 'ORIGINAL'].groupby('title').filter(lambda x: len(x) > 1)

groups_orig.title.value_counts()

title
bagatelle no 25 in a minor woo 59 für elise    15
home                                           12
lonely                                         11
bright lights                                   6
blue lines                                      5
                                               ..
dont need your love                             2
deep                                            2
ocean eyes                                      2
ok                                              2
привет                                          2
Name: count, Length: 111, dtype: int64

In [20]:
data.track_remake_type.value_counts()

track_remake_type
COVER       67340
ORIGINAL     3947
Name: count, dtype: int64

In [21]:
data.dropna(subset=['track_remake_type'], inplace=True)

In [22]:
data.loc[data['track_remake_type'] == 'ORIGINAL','original'] = 1
data.loc[data['track_remake_type'] == 'COVER','original'] = 0

In [23]:
#data['track_remake_type_binary'] = data['track_remake_type_binary'].astype('int')

In [24]:
data.original.value_counts()

original
0.0    67340
1.0     3947
Name: count, dtype: int64

In [25]:
rand = 12345
train, ver = train_test_split(
    data, 
    test_size=0.20, 
    stratify = data['original'], 
    random_state = rand
)

In [26]:
x_train = train['title']
y_train = train['original']
x_ver = ver['title']
y_ver = ver['original']

print(x_train.shape)
print(y_train.shape)
print(x_ver.shape)
print(y_ver.shape)

(57029,)
(57029,)
(14258,)
(14258,)


In [27]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained("bert-base-multilingual-cased")
text = x_train[1]
print(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

bad habits


In [28]:
print(encoded_input)
print(output)

{'input_ids': tensor([[  101, 15838, 16266, 15508,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0193, -0.1513,  0.4090,  ...,  0.3713,  0.0009,  0.2770],
         [-0.2842, -0.4679,  0.4050,  ...,  0.6699, -0.1677,  0.1406],
         [-0.3579, -0.6110,  0.4879,  ...,  0.1633, -0.3189,  0.1305],
         [-0.4342, -0.4542,  0.6964,  ...,  0.4498, -0.2160,  0.1762],
         [-0.1987, -0.3481,  0.7416,  ...,  0.2343, -0.1392,  0.5558]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.1366, -0.1078,  0.2450, -0.0442,  0.0606,  0.2532,  0.1569,  0.1669,
         -0.4168,  0.1990, -0.0454, -0.1328, -0.1071, -0.0020,  0.1884,  0.0455,
          0.3752,  0.0175,  0.2426, -0.4124, -0.9998, -0.4051, -0.1132, -0.2569,
         -0.1622,  0.2590, -0.2537,  0.2243,  0.2616, -0.2914,  0.1092, -0.9998,
          0.6314,  0.3483,  0.1250, -0.2849, -0

In [29]:
tokenized = x_train.apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [31]:
batch_size = 1
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
    batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
    attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)
       
    embeddings.append(batch_embeddings[0][:,0,:].numpy())

  0%|          | 0/57029 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [32]:
import tensorflow as tf
tf.test.is_built_with_cuda()

False

In [None]:
features_train = np.concatenate(embeddings)

In [None]:
display(tokenized)
display(features_train)

In [None]:
# save numpy array as csv file
from numpy import asarray
from numpy import savetxt
# define data
# save to csv file
savetxt('D:/programs/projects/yandex_music/features_train.csv', features_train, delimiter=',')
np.save('D:/programs/projects/yandex_music/features_train.npy', features_train)

In [None]:
features_train

In [37]:
logreg = LogisticRegression(solver='lbfgs',max_iter=10000,class_weight = 'balanced')
logreg.fit(features, y_train)

NameError: name 'features' is not defined

In [None]:
tokenized = x_ver.apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True))

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [None]:
tokenized.info()

In [None]:
batch_size = 1
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
        embeddings.append(batch_embeddings[0][:,0,:].numpy())

In [None]:
features_ver = np.concatenate(embeddings)

In [None]:
len(features_ver)

In [None]:
predictions = logreg.predict(features_ver)
print(metrics.precision_score(y_ver, predictions))

In [None]:
log_predict_proba = cross_val_predict(logreg, features_ver, y_ver, cv=2, method='predict_proba')
print(log_predict_proba)

In [None]:
from matplotlib import pyplot as plt


log_predict_proba1 = log_predict_proba[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_ver, log_predict_proba1)
auc = metrics.roc_auc_score(y_ver, log_predict_proba1)
#create ROC curve
plt.plot(fpr,tpr,label="AUC="+str(auc))
plt.title('ROC-AUC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
precision_score = 0.0
threshold = 0.1
for thresh in range(1, 10):
    predict_cv_proba = (log_predict_proba1 >= thresh/10).astype(bool) 
    print(metrics.precision_score(y_ver, predict_cv_proba))
    if precision_score<metrics.precision_score(y_ver, predict_cv_proba):
        threshold = thresh/10
        precision_score = metrics.precision_score(y_ver, predict_cv_proba)
print(threshold)

In [None]:
predict_log = logreg.predict(features_ver)
print(predict_log)
print(predict_log.sum())
print(y_ver.sum())
print(metrics.precision_score(y_ver, predict_log))

In [None]:
predict_final = (logreg.predict_proba(features_ver)[:,1] >= threshold).astype(bool) 
print(predict_final)
print(predict_final.sum())
print(y_ver.sum())
print(metrics.precision_score(y_ver, predict_final))