### Загрузка датасета

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Name of the column the models below try to predict.
target = 'popularity'

# Read the CSV, drop rows with missing values and label the first column as 'id'.
df = (
    pd.read_csv('dataset.csv')
    .dropna()
    .pipe(lambda frame: frame.rename(columns={frame.columns[0]: 'id'}))
)

print(f"Размер датасета (очищенный): {df.shape}")
df.head(3)

Размер датасета (очищенный): (113999, 21)


Unnamed: 0,id,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic


### Реализация пайплайна проверки новых признаков (автоматизация)

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score

class FeatureTesterPipeline:
    """Measure how much one engineered feature improves a linear baseline.

    The tester keeps its own copies of the train/test feature frames, fits a
    baseline model on the ``num`` + ``cat`` columns and then lets the caller
    try candidate features one at a time.  Each trial is appended to
    ``self.results`` together with its gain over the baseline.
    """

    # Keys of every record stored in ``self.results`` (and columns of the
    # frame returned by ``show_results``).
    _RESULT_COLUMNS = ['type', 'feature', 'r2_test', 'r2_improvement',
                       'rmse_test', 'rmse_improvement', 'correlation']

    def __init__(self, X_train, X_test, y_train, y_test, num, cat, base_r2=None, base_rmse=None):
        """Store the data split and establish the baseline metrics.

        Parameters
        ----------
        X_train, X_test : pandas.DataFrame
            Feature frames; copied so the caller's frames are never modified.
        y_train, y_test : pandas.Series
            Targets matching ``X_train`` / ``X_test``.
        num, cat : list of str
            Names of the numeric and categorical base columns.
        base_r2, base_rmse : float, optional
            Pre-computed baseline metrics.  Any value left as ``None`` is
            obtained by fitting the baseline model.
        """
        self.X_train = X_train.copy()
        self.X_test = X_test.copy()
        self.y_train = y_train
        self.y_test = y_test
        self.num = list(num)
        self.cat = list(cat)
        self.results = []

        # Fit the baseline if at least one metric is missing, so neither
        # attribute can silently remain None and break the improvement maths.
        if base_r2 is None or base_rmse is None:
            fitted_r2, fitted_rmse = self.test_simple_model()
            if base_r2 is None:
                base_r2 = fitted_r2
            if base_rmse is None:
                base_rmse = fitted_rmse
        self.base_r2 = base_r2
        self.base_rmse = base_rmse

    def reset_results(self):
        """Forget all previously recorded feature trials."""
        self.results = []

    def test_simple_model(self, model=LinearRegression):
        """Fit ``model()`` on the base columns and score it on the test split.

        Parameters
        ----------
        model : callable
            Zero-argument factory returning an unfitted estimator.

        Returns
        -------
        tuple of (float, float)
            ``(r2, rmse)`` computed on ``self.X_test`` / ``self.y_test``.
        """
        # NOTE: unlike _create_pipeline, the encoder here keeps the default
        # sparse_output setting.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), self.num),
                ('cat', OneHotEncoder(handle_unknown='ignore'), self.cat)
            ])

        pipeline = Pipeline(steps=[
            ('pre', preprocessor),
            ('model', model())
        ])

        pipeline.fit(self.X_train, self.y_train)
        pred = pipeline.predict(self.X_test)

        # Score against this instance's targets, not a notebook-level global.
        r2 = r2_score(self.y_test, pred)
        rmse = root_mean_squared_error(self.y_test, pred)

        return r2, rmse

    def test_feature(self, feature_func, feature_name, feature_type):
        """Evaluate the base columns plus one candidate feature.

        Parameters
        ----------
        feature_func : callable
            Receives a feature frame and returns the new column's values.
        feature_name : str
            Column name for the candidate; must not clash with an existing column.
        feature_type : {"num", "cat"}
            Whether the candidate is scaled as numeric or one-hot encoded.

        Returns
        -------
        tuple of (float, float)
            ``(r2, rmse)`` of the augmented model on the test split.

        Raises
        ------
        ValueError
            If ``feature_type`` is unknown or ``feature_name`` already exists.
        """
        # Validate before doing any work so a bad call leaves no trace.
        if feature_type == "num":
            num_cols, cat_cols = self.num + [feature_name], self.cat
        elif feature_type == "cat":
            num_cols, cat_cols = self.num, self.cat + [feature_name]
        else:
            raise ValueError(
                f"feature_type must be 'num' or 'cat', got {feature_type!r}"
            )
        if feature_name in self.X_train.columns:
            raise ValueError(
                f"feature name {feature_name!r} clashes with an existing column"
            )

        # Work on augmented copies: the stored frames stay untouched, so no
        # clean-up is needed even if fitting fails half-way.
        X_train_aug = self.X_train.copy()
        X_train_aug[feature_name] = feature_func(self.X_train)
        X_test_aug = self.X_test.copy()
        X_test_aug[feature_name] = feature_func(self.X_test)

        pipeline = self._create_pipeline(num_cols, cat_cols)
        all_features = self.num + [feature_name] + self.cat
        r2, rmse = self._evaluate_pipeline(pipeline, all_features, X_train_aug, X_test_aug)

        # Pearson correlation is only defined for numeric candidates.
        candidate = X_train_aug[feature_name]
        if pd.api.types.is_numeric_dtype(candidate):
            corr = candidate.corr(self.y_train)
        else:
            corr = float('nan')

        self.results.append({
            'type': feature_type,
            'feature': feature_name,
            'r2_test': r2,
            'r2_improvement': r2 - self.base_r2,
            'rmse_test': rmse,
            'rmse_improvement': self.base_rmse - rmse,
            'correlation': corr
        })

        return r2, rmse

    def _create_pipeline(self, num, cat):
        """Build a scaler + one-hot + LinearRegression pipeline for the given columns."""
        transformers = [('num', StandardScaler(), num),
                        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat)]
        preprocessor = ColumnTransformer(transformers=transformers)
        pipeline = Pipeline(steps=[
            ('pre', preprocessor),
            ('model', LinearRegression())
        ])

        return pipeline

    def _evaluate_pipeline(self, pipeline, features, X_train=None, X_test=None):
        """Fit ``pipeline`` on ``features`` and return ``(r2, rmse)`` on the test split.

        ``X_train`` / ``X_test`` default to the frames stored on the instance.
        """
        if X_train is None:
            X_train = self.X_train
        if X_test is None:
            X_test = self.X_test

        pipeline.fit(X_train[features], self.y_train)
        predictions = pipeline.predict(X_test[features])

        r2 = r2_score(self.y_test, predictions)
        rmse = root_mean_squared_error(self.y_test, predictions)

        return r2, rmse

    def show_results(self):
        """Return the recorded trials as a DataFrame, best test R^2 first."""
        if not self.results:
            # Nothing recorded yet: return an empty frame with the usual columns
            # instead of failing on the sort key.
            return pd.DataFrame(columns=self._RESULT_COLUMNS)
        return pd.DataFrame(self.results).sort_values('r2_test', ascending=False)

### Добавление первых признаков

In [3]:
# Coarse music-type buckets for `track_genre` values.  Every genre is listed
# in exactly one bucket; genres not listed anywhere fall into 'other'.
genre_groups = {
    'mainstream': [
        'pop', 'k-pop', 'dance', 'hip-hop', 'r-n-b',
        'edm', 'pop-film', 'indie-pop'
    ],
    'rock_metal': [
        'rock', 'metal', 'hard-rock', 'punk',
        'alternative', 'alt-rock', 'grunge',
        'heavy-metal', 'psych-rock', 'emo'
    ],
    'electronic': [
        'electronic', 'techno', 'house', 'trance',
        'dubstep', 'drum-and-bass', 'deep-house',
        'electro', 'hardstyle'
    ],
    'chill': [
        'chill', 'ambient', 'acoustic', 'jazz',
        'blues', 'folk', 'singer-songwriter',
        'classical', 'piano', 'study', 'sleep'
    ],
    'world': [
        'latin', 'reggae', 'salsa', 'samba',
        'world-music', 'afrobeat', 'funk', 'disco',
        'country', 'bluegrass', 'tango'
    ],
    'extreme': [
        'metalcore', 'death-metal', 'black-metal',
        'hardcore', 'grindcore', 'industrial'
    ],
    # 'k-pop' is intentionally absent here: it already belongs to 'mainstream',
    # which is checked first, so a second entry could never match.
    'soundtrack': [
        'anime', 'disney', 'soundtrack', 'game',
        'comedy', 'children', 'kids', 'show-tunes',
        'j-pop', 'j-rock', 'cantopop'
    ]
}

# Reverse index genre -> group, built once.  Groups are walked in reverse so
# that, should a genre ever be listed twice, the earliest group wins.
_GENRE_TO_GROUP = {
    genre: group
    for group, genres in reversed(list(genre_groups.items()))
    for genre in genres
}

def map_genre_group(genre):
    """Return the music-type bucket for ``genre``, or 'other' if it is unlisted."""
    return _GENRE_TO_GROUP.get(genre, 'other')

# Derive three helper columns from the raw ones.
df = df.assign(
    music_type=df['track_genre'].map(map_genre_group),    # genre collapsed into a coarse music type
    duration_min=df['duration_ms'] / 1000 / 60,           # track length in minutes
    artist_count=df['artists'].str.split(';').str.len(),  # number of ';'-separated artists
)

### Проверка базовой модели

In [4]:
from sklearn.model_selection import train_test_split

# Base feature set: categorical columns are one-hot encoded, numeric ones scaled.
cat = ['explicit', 'key', 'mode', 'time_signature', 'music_type']
num = ['danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_min', 'artist_count']

X = df[num + cat]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The constructor already fits the baseline LinearRegression once; reuse its
# metrics instead of training the identical model a second time.
tester = FeatureTesterPipeline(X_train, X_test, y_train, y_test, num, cat)
base_r2, base_rmse = tester.base_r2, tester.base_rmse
print(f"Базовый R^2: {base_r2:.4f}")
print(f"Базовый RMSE: {base_rmse:.4f}")

Базовый R^2: 0.0471
Базовый RMSE: 21.7416


### Проверка важности признаков на дереве решений (используя Label Encoding)

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

# Work on copies so the original split stays untouched.
X_train_encoded, X_test_encoded = X_train.copy(), X_test.copy()

# One encoder per object-typed column, fitted on the training values only.
label_encoders = {
    col: LabelEncoder().fit(X_train_encoded[col].astype(str))
    for col in X_train_encoded.select_dtypes(include=['object']).columns
}

# Replace the string labels with their integer codes in both splits.
for col, encoder in label_encoders.items():
    X_train_encoded[col] = encoder.transform(X_train_encoded[col].astype(str))
    X_test_encoded[col] = encoder.transform(X_test_encoded[col].astype(str))

# A shallow tree is enough to rank the features.
tree = DecisionTreeRegressor(max_depth=3, random_state=42)
tree.fit(X_train_encoded, y_train)

importance = pd.DataFrame(
    {'feature': X_train.columns, 'importance': tree.feature_importances_}
).sort_values('importance', ascending=False)

importance.head(10)

Unnamed: 0,feature,importance
15,music_type,0.389988
5,instrumentalness,0.287862
4,acousticness,0.218249
9,duration_min,0.048913
7,valence,0.029691
2,loudness,0.025296
3,speechiness,0.0
1,energy,0.0
6,liveness,0.0
0,danceability,0.0


### Тестирование новых признаков по отдельности (первый этап)

In [6]:
# Candidate features for the first round.
# Each entry is (builder taking a feature frame, column name, feature type).
features = [
    # threshold flags
    (lambda frame: (frame['energy'] > 0.8).astype(int), 'is_high_energy', 'num'),
    (lambda frame: (frame['danceability'] > 0.7).astype(int), 'is_very_danceable', 'num'),
    (lambda frame: (frame['valence'] > 0.7).astype(int), 'is_happy', 'num'),
    (lambda frame: (frame['acousticness'] > 0.7).astype(int), 'is_acoustic', 'num'),  # top-5
    (lambda frame: (frame['instrumentalness'] > 0.5).astype(int), 'is_instrumental', 'num'),
    (lambda frame: (frame['speechiness'] > 0.3).astype(int), 'is_speech_heavy', 'num'),
    (lambda frame: (frame['artist_count'] > 1).astype(int), 'is_collaboration', 'num'),
    (lambda frame: (frame['artist_count'] > 2).astype(int), 'is_super_collab', 'num'),
    (lambda frame: (frame['artist_count'] == 1).astype(int), 'is_solo', 'num'),

    # short / long tracks
    (lambda frame: (frame['duration_min'] < 3).astype(int), 'is_short', 'num'),
    (lambda frame: (frame['duration_min'] > 5).astype(int), 'is_long', 'num'),

    # powers
    (lambda frame: frame['energy'] ** 3, 'energy_cubed', 'num'),  # top-3
    (lambda frame: frame['danceability'] ** 2, 'danceability_squared', 'num'),
    (lambda frame: frame['loudness'] ** 2, 'loudness_squared', 'num'),
    (lambda frame: frame['valence'] ** 2, 'valence_squared', 'num'),
    (lambda frame: frame['acousticness'] ** 2, 'acousticness_squared', 'num'),  # top-1
    (lambda frame: frame['energy'] ** 2 * frame['danceability'] ** 1.5, 'energy_dance', 'num'),  # top-4
    (lambda frame: frame['artist_count'] ** 2, 'artists_squared', 'num'),

    # interaction scores
    (lambda frame: frame['energy'] * frame['danceability'] * frame['tempo'] / 200, 'energy_dance_tempo', 'num'),
    (lambda frame: frame['valence'] * (1 - frame['acousticness']) * frame['energy'], 'happy_electronic', 'num'),
    (lambda frame: (1 - frame['valence']) * frame['acousticness'] * frame['instrumentalness'], 'sad_instrumental', 'num'),
    (lambda frame: frame['acousticness'] * (1 - frame['energy']) * (1 - frame['speechiness']), 'chill_quiet', 'num'),  # top-2
    (lambda frame: frame['valence'] * frame['energy'] * (1 + frame['speechiness']), 'emotional', 'num'),

    # collaborations
    (lambda frame: frame['artist_count'] * frame['energy'], 'collab_energy', 'num'),
    (lambda frame: frame['artist_count'] * (1 - frame['acousticness']), 'collab_electronic', 'num'),
    (lambda frame: frame['artist_count'] * frame['danceability'], 'collab_dance', 'num'),

    # extras
    (lambda frame: (
        (frame['loudness'] > -5) &
        (frame['energy'] > 0.7) &
        (frame['duration_min'] < 4) &
        (frame['instrumentalness'] < 0.2)
    ).astype(int), 'is_radio_hit', 'num'),
    (lambda frame: frame['liveness'] * frame['energy'] * (1 - frame['instrumentalness']), 'live_performance', 'num')
]

# Evaluate every candidate on its own against the baseline.
for feature_spec in features:
    tester.test_feature(*feature_spec)

result_df = tester.show_results()
result_df

Unnamed: 0,type,feature,r2_test,r2_improvement,rmse_test,rmse_improvement,correlation
15,num,acousticness_squared,0.054472,0.007322,21.657948,0.083694,-0.052957
21,num,chill_quiet,0.052864,0.005714,21.676356,0.065286,-0.0402
11,num,energy_cubed,0.051596,0.004446,21.690855,0.050787,-0.033014
16,num,energy_dance,0.051041,0.003892,21.697196,0.044446,-0.0229
3,num,is_acoustic,0.051014,0.003864,21.697513,0.044129,-0.063763
12,num,danceability_squared,0.050944,0.003794,21.698309,0.043334,0.018314
9,num,is_short,0.050466,0.003317,21.703769,0.037874,-0.050622
0,num,is_high_energy,0.049716,0.002566,21.712347,0.029295,-0.04615
20,num,sad_instrumental,0.049097,0.001948,21.719412,0.022231,-0.031955
5,num,is_speech_heavy,0.048741,0.001591,21.723479,0.018163,-0.018621


# Выводы после первого feature engineering
Самыми лучшими признаками оказались:
1. acousticness_squared
2. chill_quiet
3. energy_cubed
4. energy_dance
5. is_acoustic

Малополезные признаки:
- Большинство бинарных (в топ-5 попал только is_acoustic)
- Связанные с количеством артистов
- Сложные признаки взаимодействия (кроме chill_quiet и energy_dance)

Главное:
- Лучшее улучшение R^2: 0.0073
- Акустичность - ключевой параметр (три признака из топ-5 связаны с acousticness: acousticness_squared, chill_quiet, is_acoustic)
- Нелинейные преобразования работают (квадраты, кубы)
- Бинарные признаки слабы (кроме is_acoustic)
- Простые комбинации лучше сложных формул
- Корреляции новых признаков с popularity всё ещё близки к нулю, как и у исходных признаков датасета

### Второй этап feature engineering

In [7]:
# Start the second round with an empty results table.
tester.reset_results()

# Each entry is (builder taking a feature frame, column name, feature type).
features = [
    # carried over from the first round
    (lambda frame: frame['acousticness'] ** 2, 'acousticness_squared', 'num'),
    (lambda frame: frame['energy'] ** 3, 'energy_cubed', 'num'),
    (lambda frame: frame['acousticness'] * (1 - frame['energy']) * (1 - frame['speechiness']), 'chill_quiet', 'num'),

    # powers
    (lambda frame: frame['acousticness'] ** 1.8, 'acousticness_pow_1.8', 'num'),
    (lambda frame: frame['acousticness'] ** 2.2, 'acousticness_pow_2.2', 'num'),
    (lambda frame: frame['acousticness'] ** 2.5, 'acousticness_pow_2.5', 'num'),  # top-1
    (lambda frame: frame['energy'] ** 2, 'energy_squared', 'num'),
    (lambda frame: frame['energy'] ** 2.5, 'energy_pow_2.5', 'num'),
    (lambda frame: frame['energy'] ** 3.5, 'energy_pow_3.5', 'num'),  # top-3

    # variations of chill_quiet
    (lambda frame: frame['acousticness'] * (1 - frame['energy']) ** 2, 'chill_quiet_v2', 'num'),
    (lambda frame: frame['acousticness'] * (1 - frame['energy']) ** 2 * (1 - frame['speechiness']), 'chill_quiet_v3', 'num'),
    (lambda frame: frame['acousticness'] ** 2 * (1 - frame['energy']) * (1 - frame['speechiness']), 'chill_quiet_v4', 'num'),  # top-2
]

# Evaluate every candidate on its own against the baseline.
for feature_spec in features:
    tester.test_feature(*feature_spec)

result_df = tester.show_results()
result_df

Unnamed: 0,type,feature,r2_test,r2_improvement,rmse_test,rmse_improvement,correlation
5,num,acousticness_pow_2.5,0.054517,0.007367,21.657432,0.084211,-0.061049
4,num,acousticness_pow_2.2,0.054509,0.007359,21.657522,0.08412,-0.056494
0,num,acousticness_squared,0.054472,0.007322,21.657948,0.083694,-0.052957
3,num,acousticness_pow_1.8,0.054403,0.007253,21.658737,0.082905,-0.048947
11,num,chill_quiet_v4,0.05321,0.00606,21.672394,0.069249,-0.056448
2,num,chill_quiet,0.052864,0.005714,21.676356,0.065286,-0.0402
8,num,energy_pow_3.5,0.051777,0.004627,21.688791,0.052851,-0.037629
10,num,chill_quiet_v3,0.051679,0.004529,21.689911,0.051731,-0.050262
1,num,energy_cubed,0.051596,0.004446,21.690855,0.050787,-0.033014
9,num,chill_quiet_v2,0.051452,0.004302,21.692505,0.049137,-0.0517


### Обучение базовой модели с новыми признаками

In [8]:
# Winning features from the second round.
# Each entry is (builder taking a feature frame, column name, feature type).
features = [
    (lambda frame: frame['acousticness'] ** 2.5, 'acousticness_pow_2.5', 'num'),  # top-1
    # Named after the formula it actually uses (the stage-2 'chill_quiet_v4'
    # variant), so it cannot be confused with the stage-1 'chill_quiet'.
    (lambda frame: frame['acousticness'] ** 2 * (1 - frame['energy']) * (1 - frame['speechiness']), 'chill_quiet_v4', 'num'),  # top-2
    (lambda frame: frame['energy'] ** 3.5, 'energy_pow_3.5', 'num'),  # top-3
]

# Extend the baseline column lists instead of retyping them, so the final
# model is guaranteed to differ from the baseline only by the new features.
new_cat = cat + [name for _, name, kind in features if kind == 'cat']
new_num = num + [name for _, name, kind in features if kind == 'num']

# Every builder reads only base columns, so all of them can be computed at once.
df = df.assign(**{name: builder(df) for builder, name, _ in features})

new_X = df[new_num + new_cat]
new_y = df[target]
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=42)

# The constructor fits LinearRegression on new_num + new_cat once; reuse those
# metrics instead of training the identical model a second time.
new_tester = FeatureTesterPipeline(new_X_train, new_X_test, new_y_train, new_y_test, new_num, new_cat)
r2, rmse = new_tester.base_r2, new_tester.base_rmse
print(f"Финальный R^2: {r2:.4f}")
print(f"Финальный RMSE: {rmse:.4f}")
print(f"R^2 улучшился на {r2 - base_r2:.4f}")
print(f"RMSE улучшился на {base_rmse - rmse:.4f}")

Финальный R^2: 0.0560
Финальный RMSE: 21.6403
R^2 улучшился на 0.0089
RMSE улучшился на 0.1013


# Конечные выводы
Работает:
- Степенные преобразования
- Акустичность
- Нелинейные зависимости

Не работает:
- Сложные формулы с многими взаимодействиями
- Бинарные признаки
- Признаки с положительной корреляцией (у всех лучших признаков корреляция с popularity отрицательная)

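Популярность слабо отрицательно связана со спокойной/акустической музыкой: корреляции лучших признаков с popularity отрицательны, но по модулю не превышают 0.07.

В среднем чем меньше acousticness, тем выше популярность, однако эффект мал: итоговая линейная модель объясняет лишь около 5.6% дисперсии (R^2 = 0.0560).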