In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_df = pd.read_csv("../input/song-popularity-prediction/train.csv")

In [3]:
features = [
    'song_duration_ms',
    'acousticness',
    'danceability',
    'energy',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'audio_valence'
]

In [4]:
features = train_df.columns.tolist()
features.remove('song_popularity')

In [5]:
pipeline = Pipeline([
    ('impute', SimpleImputer()),
    ('scale', StandardScaler()),
    ('classifier', RandomForestClassifier(class_weight={0:1,1:2}, min_samples_leaf=2, n_estimators=400)),
])

In [6]:
X = train_df[features]
y = train_df['song_popularity']

In [7]:
kfold = StratifiedKFold(n_splits=5)

In [8]:
param_grid = {
    'classifier__n_estimators' : [100, 150, 200, 250, 300, 350, 400],
    'classifier__min_samples_leaf' : [2, 3, 4]
}

In [9]:
pipeline.fit(X, y)

Pipeline(steps=[('impute', SimpleImputer()), ('scale', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(class_weight={0: 1, 1: 2},
                                        min_samples_leaf=2,
                                        n_estimators=400))])

In [10]:
test_df = pd.read_csv('../input/song-popularity-prediction/test.csv')

In [11]:
X_sub = test_df[features]
test_df['song_popularity'] = pipeline.predict(X_sub)

In [12]:
test_df[['id', 'song_popularity']].to_csv('submission.csv', index=False)