# Предобработка данных

In [1]:
import re
import json
import pandas as pd
import numpy as np
import seaborn as sns
import spacy
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from catboost import CatBoostClassifier
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
#from polyglot.text import Text
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from scipy.stats import pointbiserialr, chi2_contingency
import fasttext

import warnings
warnings.filterwarnings("ignore")



In [2]:
covers = pd.read_json('covers.json', lines=True)

In [3]:
meta = pd.read_json('meta.json', lines=True, convert_dates=['dttm'])

In [4]:
lyrics = pd.read_json('lyrics.json', lines=True)

Изучим наши датафреймы.

In [5]:
covers.head(10)

Unnamed: 0,original_track_id,track_id,track_remake_type
0,eeb69a3cb92300456b6a5f4162093851,eeb69a3cb92300456b6a5f4162093851,ORIGINAL
1,fe7ee8fc1959cc7214fa21c4840dff0a,fe7ee8fc1959cc7214fa21c4840dff0a,ORIGINAL
2,cd89fef7ffdd490db800357f47722b20,cd89fef7ffdd490db800357f47722b20,ORIGINAL
3,995665640dc319973d3173a74a03860c,995665640dc319973d3173a74a03860c,ORIGINAL
4,,d6288499d0083cc34e60a077b7c4b3e1,COVER
5,,4da9d7b6d119db4d2d564a2197798380,COVER
6,,2bf283c05b601f21364d052ca0ec798d,COVER
7,,5eb13cb69b6e20dd7a42030f5936a9dc,COVER
8,,109f91266ef89cc3690079b28abfe9a3,COVER
9,76c538125fc5c9ec6ad1d05650a57de5,76c538125fc5c9ec6ad1d05650a57de5,ORIGINAL


In [6]:
covers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71597 entries, 0 to 71596
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   original_track_id  4821 non-null   object
 1   track_id           71597 non-null  object
 2   track_remake_type  71597 non-null  object
dtypes: object(3)
memory usage: 1.6+ MB


In [7]:
covers.describe()

Unnamed: 0,original_track_id,track_id,track_remake_type
count,4821,71597,71597
unique,4468,71597,2
top,f3e5e9db509f9b6f6d93b9c38e4e1324,eeb69a3cb92300456b6a5f4162093851,COVER
freq,38,1,67304


In [8]:
covers.duplicated().sum()

0

Посмотрим на дисбаланс классов.

In [9]:
covers.track_remake_type.value_counts()

COVER       67304
ORIGINAL     4293
Name: track_remake_type, dtype: int64

Посмотрим на кол-во записей, где original_track_id и track_id совпадают.

In [10]:
len(covers[covers['original_track_id'] == covers['track_id']])

4293

Кол-во записей совпадают, перейдем к другим датасетам.

In [11]:
meta.head(10)

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration
0,c3b9d6a354ca008aa4518329aaa21380,2021-12-16 21:00:00,Happy New Year,EN,RUB422103970,[DANCE],161120.0
1,c57e3d13bbbf5322584a7e92e6f1f7ff,2021-11-24 14:00:33,Bad Habits,EN,QZN882178276,[ELECTRONICS],362260.0
2,955f2aafe8717908c140bf122ba4172d,2021-11-24 15:37:03,Por Esa Loca Vanidad,,QZNJZ2122549,"[FOLK, LATINFOLK]",260000.0
3,fae5a077c9956045955dde02143bd8ff,2021-11-24 15:37:03,Mil Lagrimas,,QZNJZ2166033,"[FOLK, LATINFOLK]",190000.0
4,6bede082154d34fc18d9a6744bc95bf5,2021-11-24 15:37:03,Sexo Humo y Alcohol,,QZNJZ2122551,"[FOLK, LATINFOLK]",203000.0
5,551331ee1444de6658e71933eb7f57a2,2021-11-24 15:37:03,La Ultima Vez,,QZNJZ2166034,"[FOLK, LATINFOLK]",173000.0
6,2be8bb01bb1aeef0908524dc11935a88,2021-11-24 15:37:03,Homenaje a Pastor de los Santos,,QZNJZ2122553,"[FOLK, LATINFOLK]",192100.0
7,3883d1add3ff8b60e77548a64df7c892,2021-11-24 15:37:03,Los Marginales,,QZNJZ2122554,"[FOLK, LATINFOLK]",311500.0
8,59aa886979699aa7e4e2a8aed43bfe03,2021-11-24 15:37:03,Cuenta Conmigo,,QZNJZ2122555,"[FOLK, LATINFOLK]",250000.0
9,2a38992b7769963937246288ad89b44c,2021-11-24 15:37:03,Ahora Juego Yo,,QZNJZ2122556,"[FOLK, LATINFOLK]",156000.0


In [12]:
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71769 entries, 0 to 71768
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   track_id  71768 non-null  object        
 1   dttm      71768 non-null  datetime64[ns]
 2   title     71768 non-null  object        
 3   language  21969 non-null  object        
 4   isrc      71455 non-null  object        
 5   genres    71768 non-null  object        
 6   duration  71768 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 3.8+ MB


In [13]:
meta.describe(include='all')

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration
count,71768,71768,71768,21969,71455,71768,71768.0
unique,71768,27726,45462,85,71283,232,
top,c3b9d6a354ca008aa4518329aaa21380,2013-03-11 20:00:00,Morena,EN,USTCF1018560,"[FOLK, LATINFOLK]",
freq,1,839,81,15322,3,18391,
first,,2009-08-10 17:32:06,,,,,
last,,2023-10-18 21:00:00,,,,,
mean,,,,,,,204918.7
std,,,,,,,85598.54
min,,,,,,,0.0
25%,,,,,,,161370.0


In [14]:
meta_exploded = meta.explode("genres")
meta_exploded.duplicated().sum()

0

In [15]:
lyrics.head(10)

Unnamed: 0,lyricId,text,track_id
0,a951f9504e89759e9d23039b7b17ec14,"Живу сейчас обломами, обломками не той любви\n...",1c4b1230f937e4c548ff732523214dcd
1,0c749bc3f01eb8e6cf986fa14ccfc585,Tell me your fable\nA fable\nTell me your fabl...,0faea89b0d7d6235b5b74def72511bd8
2,e2c8830fbc86e5964478243099eec23a,You're ashamed about all your fears and doubts...,9c6dc41d5ccd9968d07f055da5d8f741
3,e2c8830fbc86e5964478243099eec23a,You're ashamed about all your fears and doubts...,bfd04a73e9cffdf0e282c92219a86ea1
4,7624653ca8522ba93470843c74961b7d,"You showed him all the best of you,\nBut I'm a...",8d70930d09cd239c948408d1317d8659
5,7624653ca8522ba93470843c74961b7d,"You showed him all the best of you,\nBut I'm a...",b12ecdeac6ac42e1ecac96f9f15b1159
6,aaf5c8771a94127de56c08623393434a,You lost her\nAnd now you're wishing you could...,8ef2d56d4dea12988133f6d933401733
7,aaf5c8771a94127de56c08623393434a,You lost her\nAnd now you're wishing you could...,4f41809cdbbb3d3dc99491d1273e7519
8,28d69bf999a2d26c5a6365b71d07a706,If you go now\nPlease leave the light on\nJust...,c8658fc634c181f05c882e1919e157df
9,b6b9656527783d76fcfa0fad73625f36,Justo ayer me di cuenta\nQue solo es cuestión ...,682f310c802690104b3a633b6557bac2


In [16]:
lyrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11414 entries, 0 to 11413
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   lyricId   11414 non-null  object
 1   text      11414 non-null  object
 2   track_id  11414 non-null  object
dtypes: object(3)
memory usage: 267.6+ KB


In [17]:
lyrics.describe()

Unnamed: 0,lyricId,text,track_id
count,11414,11414,11414
unique,10915,10644,10277
top,28fbafaf626c497d0d4d933b4f0d3f5b,Twenty-five years and my life is still\nI'm tr...,b891000882f4a612190142861b5c038b
freq,9,15,24


In [18]:
lyrics.duplicated().sum()

0

In [19]:
duplicated_track_id = lyrics[lyrics.duplicated(['track_id', 'text'])]
duplicated_track_id.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264 entries, 848 to 11412
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   lyricId   264 non-null    object
 1   text      264 non-null    object
 2   track_id  264 non-null    object
dtypes: object(3)
memory usage: 8.2+ KB


Мы видим, что у нас много дубликатов по столбцам 'track_id' и 'text', избавимся от них.

In [20]:
lyrics.drop_duplicates(subset=['track_id', 'text'], inplace=True)

In [21]:
lyrics.describe()

Unnamed: 0,lyricId,text,track_id
count,11150,11150,11150
unique,10688,10644,10277
top,6fe05ce48761f833737487cd7a499cd3,"A and X, El Orfanato\nDanza Kuduro\nLucenzo El...",adc126c946551da4e1784e716b77992e
freq,9,11,10


Теперь посмотрим на кол-во дубликатов по столбцу *track_id*, затем удалим и их тоже.

In [22]:
duplicated_track_id = lyrics[lyrics.duplicated(['track_id'])]
duplicated_track_id.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 873 entries, 263 to 11411
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   lyricId   873 non-null    object
 1   text      873 non-null    object
 2   track_id  873 non-null    object
dtypes: object(3)
memory usage: 27.3+ KB


In [23]:
lyrics.drop_duplicates(subset=['track_id'], inplace=True)

In [24]:
lyrics.describe()

Unnamed: 0,lyricId,text,track_id
count,10277,10277,10277
unique,9891,9852,10277
top,6fe05ce48761f833737487cd7a499cd3,"A and X, El Orfanato\nDanza Kuduro\nLucenzo El...",1c4b1230f937e4c548ff732523214dcd
freq,9,9,1


Объединим наши таблицы.

In [25]:
df = meta.merge(covers, on='track_id', how='left')

In [26]:
df = df.merge(lyrics, on='track_id', how='left')

In [27]:
df.head(20)

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration,original_track_id,track_remake_type,lyricId,text
0,c3b9d6a354ca008aa4518329aaa21380,2021-12-16 21:00:00,Happy New Year,EN,RUB422103970,[DANCE],161120.0,,COVER,36777cce0740d1e3d37a5a221c71d859,No more champagne\nAnd the fireworks are throu...
1,c57e3d13bbbf5322584a7e92e6f1f7ff,2021-11-24 14:00:33,Bad Habits,EN,QZN882178276,[ELECTRONICS],362260.0,,COVER,8284927611c6a6fc49ff1ce7b3a88314,"Ooh-ooh, ooh, ooh\n\nEvery time you come aroun..."
2,955f2aafe8717908c140bf122ba4172d,2021-11-24 15:37:03,Por Esa Loca Vanidad,,QZNJZ2122549,"[FOLK, LATINFOLK]",260000.0,,COVER,,
3,fae5a077c9956045955dde02143bd8ff,2021-11-24 15:37:03,Mil Lagrimas,,QZNJZ2166033,"[FOLK, LATINFOLK]",190000.0,,COVER,,
4,6bede082154d34fc18d9a6744bc95bf5,2021-11-24 15:37:03,Sexo Humo y Alcohol,,QZNJZ2122551,"[FOLK, LATINFOLK]",203000.0,,COVER,,
5,551331ee1444de6658e71933eb7f57a2,2021-11-24 15:37:03,La Ultima Vez,,QZNJZ2166034,"[FOLK, LATINFOLK]",173000.0,,COVER,,
6,2be8bb01bb1aeef0908524dc11935a88,2021-11-24 15:37:03,Homenaje a Pastor de los Santos,,QZNJZ2122553,"[FOLK, LATINFOLK]",192100.0,,COVER,,
7,3883d1add3ff8b60e77548a64df7c892,2021-11-24 15:37:03,Los Marginales,,QZNJZ2122554,"[FOLK, LATINFOLK]",311500.0,,COVER,,
8,59aa886979699aa7e4e2a8aed43bfe03,2021-11-24 15:37:03,Cuenta Conmigo,,QZNJZ2122555,"[FOLK, LATINFOLK]",250000.0,,COVER,,
9,2a38992b7769963937246288ad89b44c,2021-11-24 15:37:03,Ahora Juego Yo,,QZNJZ2122556,"[FOLK, LATINFOLK]",156000.0,,COVER,,


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71769 entries, 0 to 71768
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   track_id           71768 non-null  object        
 1   dttm               71768 non-null  datetime64[ns]
 2   title              71768 non-null  object        
 3   language           21969 non-null  object        
 4   isrc               71455 non-null  object        
 5   genres             71768 non-null  object        
 6   duration           71768 non-null  float64       
 7   original_track_id  4821 non-null   object        
 8   track_remake_type  71597 non-null  object        
 9   lyricId            10277 non-null  object        
 10  text               10277 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(9)
memory usage: 6.6+ MB


In [29]:
df.describe(include='all')

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration,original_track_id,track_remake_type,lyricId,text
count,71768,71768,71768,21969,71455,71768,71768.0,4821,71597,10277,10277
unique,71768,27726,45462,85,71283,232,,4468,2,9891,9852
top,c3b9d6a354ca008aa4518329aaa21380,2013-03-11 20:00:00,Morena,EN,USTCF1018560,"[FOLK, LATINFOLK]",,f3e5e9db509f9b6f6d93b9c38e4e1324,COVER,6fe05ce48761f833737487cd7a499cd3,"A and X, El Orfanato\nDanza Kuduro\nLucenzo El..."
freq,1,839,81,15322,3,18391,,38,67304,9,9
first,,2009-08-10 17:32:06,,,,,,,,,
last,,2023-10-18 21:00:00,,,,,,,,,
mean,,,,,,,204918.7,,,,
std,,,,,,,85598.54,,,,
min,,,,,,,0.0,,,,
25%,,,,,,,161370.0,,,,


Удалим пропуски в столбце isrc.

In [30]:
df.dropna(subset=['isrc'], inplace=True)

Посмотрим на баланс классов еще раз.

In [31]:
df.track_remake_type.value_counts()

COVER       67083
ORIGINAL     4204
Name: track_remake_type, dtype: int64

In [32]:
len(df.loc[df['track_id'] == df['original_track_id']])

4204

Создадим новый признак - год появения трека в базе isrc. Напишем функцию, которая будет возвращать нам год из 6-го и 7-го символов isrc-кода.

In [33]:
def convert(x):
    try:
        return '19' + x[5:7] if int(x[5:7]) > 23 else '20' + x[5:7]
    except ValueError:
        return None

In [34]:
df['year'] = df['isrc'].apply(convert)

In [35]:
df.dropna(subset=['year'], inplace=True)

Мы знаем, что у треков с одинаковым названием может быть несколько оригиналов в track_remake_type. Например, трек Smooth Criminal.

In [36]:
df.loc[(df['title'] == 'Smooth Criminal') & (df['track_remake_type'] == 'ORIGINAL')]

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration,original_track_id,track_remake_type,lyricId,text,year
2727,ecb6d083492582beb335a82473416c16,2022-01-04 17:25:32,Smooth Criminal,EN,QZJG52207758,[DANCE],149070.0,ecb6d083492582beb335a82473416c16,ORIGINAL,,,2022
2945,bc14fcffe5e8f9f4e6d7aedf8778e001,2022-01-07 05:30:21,Smooth Criminal,EN,QZFYY2242806,[RAP],137670.0,bc14fcffe5e8f9f4e6d7aedf8778e001,ORIGINAL,c21e223482499b573d5c24d78c82ff1a,"No, no, no, whoah\nMmmmm, no, no, no\nWhoah\n\...",2022
7129,347a1dbfa9d4146b098a537926c8c9bb,2022-03-06 23:38:29,Smooth Criminal,EN,QZDA81962660,[RAP],152860.0,347a1dbfa9d4146b098a537926c8c9bb,ORIGINAL,,,2019
8931,0af163b20807c7559c4ce168d5bf194d,2022-05-10 21:00:00,Smooth Criminal,EN,TCAGD2292399,[RAP],140350.0,0af163b20807c7559c4ce168d5bf194d,ORIGINAL,,,2022
9059,dfdeea64f0e60e66cba2ce7f7043ab45,2022-04-14 08:50:48,Smooth Criminal,EN,TCAGD2286692,[RAP],199540.0,dfdeea64f0e60e66cba2ce7f7043ab45,ORIGINAL,,,2022
9780,438dc9c22a6a34ea7af369fd95983dbb,2022-05-02 12:31:53,Smooth Criminal,,AEA2M2256036,[RAP],107420.0,438dc9c22a6a34ea7af369fd95983dbb,ORIGINAL,,,2022
13456,a86e987df0929f1d898f7b960fce8df9,2022-07-24 10:31:35,Smooth Criminal,,AEA2L2270107,[RAP],174000.0,a86e987df0929f1d898f7b960fce8df9,ORIGINAL,,,2022
16758,7ece1d8f67b3d164f188c970d2144be6,2019-12-05 21:00:00,Smooth Criminal,,TCAEM1946722,[RAP],131200.0,7ece1d8f67b3d164f188c970d2144be6,ORIGINAL,0a06fb0741691c03d111032e84f1498c,As he came into the window\nIt was the sound o...,2019
17674,225404f4a0f6b181a915bc030d1d477a,2020-01-30 21:00:00,Smooth Criminal,EN,DELJ82075354,"[FOREIGNRAP, RAP]",139690.0,225404f4a0f6b181a915bc030d1d477a,ORIGINAL,,,2020
17894,eb4af6e78bb0b0729af9d66c110c439c,2020-02-13 21:00:00,Smooth Criminal,NL,BEB472000016,[RAP],154400.0,eb4af6e78bb0b0729af9d66c110c439c,ORIGINAL,,,2020


Я считаю это выбросами и предлагаю их переметить, выбрав самый ранний трек как оригинал, остальные отметим как 'COVER'.

In [96]:
# Сначала создаем копию DataFrame, чтобы избежать изменения исходных данных
df_copy = df.copy()

In [97]:
df_copy.reset_index(drop=True, inplace=True)

In [98]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71454 entries, 0 to 71453
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   track_id           71454 non-null  object        
 1   dttm               71454 non-null  datetime64[ns]
 2   title              71454 non-null  object        
 3   language           21964 non-null  object        
 4   isrc               71454 non-null  object        
 5   genres             71454 non-null  object        
 6   duration           71454 non-null  float64       
 7   original_track_id  4728 non-null   object        
 8   track_remake_type  71286 non-null  object        
 9   lyricId            10214 non-null  object        
 10  text               10214 non-null  object        
 11  year               71454 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(10)
memory usage: 6.5+ MB


In [99]:
# Приводим все названия треков к нижнему регистру
df_copy['title'] = df_copy['title'].str.lower()

# Удаляем все знаки препинания и символы
df_copy['title'] = df_copy['title'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [100]:
df_copy.title.value_counts()

smooth criminal         84
morena                  81
rolê                    72
wicked game             68
esquema preferido       66
                        ..
i wanna love you         1
moving mountains         1
chill with you           1
you exist in my song     1
17                       1
Name: title, Length: 43281, dtype: int64

In [101]:
# Отбираем группы треков, где 'ORIGINAL' встречается больше 1 раза
groups_orig = df_copy[df_copy['track_remake_type'] == 'ORIGINAL'].groupby('title').filter(lambda x: len(x) > 1)

In [102]:
groups_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 749 entries, 152 to 71437
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   track_id           749 non-null    object        
 1   dttm               749 non-null    datetime64[ns]
 2   title              749 non-null    object        
 3   language           427 non-null    object        
 4   isrc               749 non-null    object        
 5   genres             749 non-null    object        
 6   duration           749 non-null    float64       
 7   original_track_id  749 non-null    object        
 8   track_remake_type  749 non-null    object        
 9   lyricId            428 non-null    object        
 10  text               428 non-null    object        
 11  year               749 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(10)
memory usage: 76.1+ KB


In [103]:
# Для каждой группы отмечаем самый ранний трек как 'ORIGINAL', а остальные помечаем как 'COVER'
for name, group in groups_orig.groupby('title'):
    idx_min = group['year'].min()
    df_copy.loc[(df_copy['title'] == name) & (df_copy['year'] != idx_min), 'track_remake_type'] = 'COVER'
    df_copy.loc[(df_copy['title'] == name) & (df_copy['year'] == idx_min), 'track_remake_type'] = 'ORIGINAL'

In [106]:
# Снова проверим группы треков, где 'ORIGINAL' встречается больше 1 раза
groups_orig = df_copy[df_copy['track_remake_type'] == 'ORIGINAL'].groupby('title').filter(lambda x: len(x) > 1)

groups_orig.title.value_counts()

bagatelle no 25 in a minor woo 59 für elise    15
home                                           12
lonely                                         11
bright lights                                   6
blue lines                                      5
                                               ..
dont need your love                             2
deep                                            2
ocean eyes                                      2
ok                                              2
привет                                          2
Name: title, Length: 111, dtype: int64

У нас остались такие треки, видимо, они были выпущены в одном году и имеют одно название, просто выпускались на разных релизах, оставим как есть. Снова проверим баланс классов.

Теперь повторим для групп треков, размеченных как COVER.

In [123]:
# Отбираем группы треков, где все треки размечены как 'COVER' и встречаются больше 1 раза
groups_cover = df_copy[df_copy['track_remake_type'] == 'COVER'].groupby('title').filter(lambda x: len(x) > 1)

# Для каждой группы отмечаем самый ранний трек как 'ORIGINAL', а остальные помечаем как 'COVER'
for name, group in groups_cover.groupby('title'):
    idx_min = group['year'].min()
    df_copy.loc[(df['title'] == name) & (df_copy['year'] != idx_min), 'track_remake_type'] = 'COVER'
    df_copy.loc[(df['title'] == name) & (df_copy['year'] == idx_min), 'track_remake_type'] = 'ORIGINAL'

In [124]:
df_copy.track_remake_type.value_counts()

COVER       67275
ORIGINAL     4014
Name: track_remake_type, dtype: int64

In [126]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71454 entries, 0 to 71453
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   track_id                  71454 non-null  object        
 1   dttm                      71454 non-null  datetime64[ns]
 2   title                     71454 non-null  object        
 3   language                  21964 non-null  object        
 4   isrc                      71454 non-null  object        
 5   genres                    71454 non-null  object        
 6   duration                  71454 non-null  float64       
 7   original_track_id         4728 non-null   object        
 8   track_remake_type         71289 non-null  object        
 9   lyricId                   10214 non-null  object        
 10  text                      10214 non-null  object        
 11  year                      71454 non-null  object        
 12  track_remake_type_

У нас образовались пропуски в track_remake_type, удалим их.

In [130]:
df_copy.dropna(subset=['track_remake_type'], inplace=True)

Поскольку мы планируем решать задачу бинарной классификации (модель должна определить, кавер перед ней или нет), то переведем значения столбца track_remake_type в бинарную систему, где ORIGINAL будет 1, а COVER будет 0.

In [131]:
# Создаем словарь для преобразования
remake_type_dict = {'ORIGINAL': 1, 'COVER': 0}

# Применяем словарь к столбцу 'track_remake_type'
df_copy['track_remake_type_binary'] = df_copy['track_remake_type'].map(remake_type_dict)

In [132]:
df_copy['track_remake_type_binary'] = df_copy['track_remake_type_binary'].astype('int')

In [133]:
df_copy.head()

Unnamed: 0,track_id,dttm,title,language,isrc,genres,duration,original_track_id,track_remake_type,lyricId,text,year,track_remake_type_binary
0,c3b9d6a354ca008aa4518329aaa21380,2021-12-16 21:00:00,happy new year,EN,RUB422103970,[DANCE],161120.0,,COVER,36777cce0740d1e3d37a5a221c71d859,No more champagne\nAnd the fireworks are throu...,2021,0
1,c57e3d13bbbf5322584a7e92e6f1f7ff,2021-11-24 14:00:33,bad habits,EN,QZN882178276,[ELECTRONICS],362260.0,,COVER,8284927611c6a6fc49ff1ce7b3a88314,"Ooh-ooh, ooh, ooh\n\nEvery time you come aroun...",2021,0
2,955f2aafe8717908c140bf122ba4172d,2021-11-24 15:37:03,por esa loca vanidad,,QZNJZ2122549,"[FOLK, LATINFOLK]",260000.0,,COVER,,,2021,0
3,fae5a077c9956045955dde02143bd8ff,2021-11-24 15:37:03,mil lagrimas,,QZNJZ2166033,"[FOLK, LATINFOLK]",190000.0,,COVER,,,2021,0
4,6bede082154d34fc18d9a6744bc95bf5,2021-11-24 15:37:03,sexo humo y alcohol,,QZNJZ2122551,"[FOLK, LATINFOLK]",203000.0,,COVER,,,2021,0


In [134]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71289 entries, 0 to 71453
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   track_id                  71289 non-null  object        
 1   dttm                      71289 non-null  datetime64[ns]
 2   title                     71289 non-null  object        
 3   language                  21824 non-null  object        
 4   isrc                      71289 non-null  object        
 5   genres                    71289 non-null  object        
 6   duration                  71289 non-null  float64       
 7   original_track_id         4728 non-null   object        
 8   track_remake_type         71289 non-null  object        
 9   lyricId                   10065 non-null  object        
 10  text                      10065 non-null  object        
 11  year                      71289 non-null  object        
 12  track_remake_type_

In [136]:
df_copy.track_remake_type_binary.value_counts()

0    67275
1     4014
Name: track_remake_type_binary, dtype: int64

# Обучение модели

In [None]:
#Замена пропущенных значений на пустые строки
#df_copy['text'].fillna('', inplace=True)

model = SentenceTransformer('LaBSE')

# Применяем модель к столбцам 'title' и 'text'
df_copy['title_vector'] = df_copy['title'].apply(model.encode)
#df_copy['text_vector'] = df_copy['text'].apply(model.encode)