In [24]:
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,  cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
# Single random seed, passed as random_state to the train/test split and the random forest.
SEED = 42

In [14]:
# Load the training CSV and use its `id` column as the row index in one step.
path_train = 'data/train.csv'
df = pd.read_csv(path_train, index_col='id')

# EDA

In [15]:
# Show the last five rows of the training frame.
df.tail(n=5)

Unnamed: 0_level_0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
39995,237799.0,,0.748942,0.511234,0.002913,0.0,0.173803,-8.853673,0,0.07806,168.262924,3,0.178159,0
39996,191119.0,0.067488,0.67283,0.889685,0.001225,5.0,0.122924,-7.798993,0,0.188607,110.684544,3,0.790626,0
39997,160879.0,0.877431,0.409065,0.292671,,10.0,0.110664,-15.378585,0,0.031294,99.556074,3,0.177947,1
39998,193918.0,,0.365738,,0.000339,1.0,0.356308,-4.661977,1,0.054096,139.857384,3,0.772978,0
39999,196475.0,0.007116,0.354585,0.994883,0.002191,1.0,0.2009,-4.875249,0,0.080549,101.974949,3,0.588549,0


In [6]:
# Inspect the dtype of each column.
df.dtypes

id                    int64
song_duration_ms    float64
acousticness        float64
danceability        float64
energy              float64
instrumentalness    float64
key                 float64
liveness            float64
loudness            float64
audio_mode            int64
speechiness         float64
tempo               float64
time_signature        int64
audio_valence       float64
song_popularity       int64
dtype: object

In [22]:
# Separate the feature matrix from the target column.
target_col = 'song_popularity'
X = df.drop(columns=target_col).copy()
y = df[target_col].copy()

# Train/Test Split

In [25]:
# Hold out 1/folds of the rows as the test set; the seed makes the split repeatable.
folds = 5
holdout_fraction = 1 / folds
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    random_state=SEED,
)

# Pipeline

In [27]:
# Preprocessing: replace NaN entries with the median of each column.
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
num_preprocessing = make_pipeline(median_imputer)

# Baseline classifier, seeded so repeated fits are reproducible.
rf = RandomForestClassifier(random_state=SEED)

# Baseline pipeline: imputation followed by the random forest.
baseline_steps = [
    ('Impute missing values', num_preprocessing),
    ('Model', rf),
]
pipe_baseline = Pipeline(steps=baseline_steps)


In [28]:
# Fit the imputer and the classifier on the training split.
pipe_baseline.fit(X=X_train, y=y_train)

Pipeline(steps=[('Impute missing values',
                 Pipeline(steps=[('simpleimputer',
                                  SimpleImputer(strategy='median'))])),
                ('Model', RandomForestClassifier(random_state=42))])

# Model Evaluation

In [42]:
# Run the two fitted pipeline stages by hand: impute the hold-out features,
# then have the classifier predict on the imputed matrix.
fitted_steps = pipe_baseline.named_steps
X_pred = fitted_steps["Impute missing values"].transform(X_test)
y_pred = fitted_steps["Model"].predict(X_pred)

In [46]:
# Accuracy of the classifier step alone, scored on the already-imputed hold-out features.
classifier = pipe_baseline.named_steps["Model"]
classifier.score(X_pred, y_test)

0.62625

In [51]:
# End-to-end accuracy: the pipeline imputes the raw hold-out features itself before scoring.
pipe_baseline.score(X=X_test, y=y_test)

0.62625

# Model & Results Saving

In [59]:
import datetime
from pathlib import Path

import joblib

# Create the output directory on demand so a fresh checkout does not fail on dump.
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

# `now` stays a datetime object because the submission cell formats it as well.
now = datetime.datetime.now()
# str(now) contains spaces and colons, which are invalid in Windows file names
# and awkward in shells, so build a filesystem-safe stamp instead.
run_stamp = now.strftime("%Y%m%d_%H%M%S")

# Persist a timestamped copy for history, then overwrite the stable "latest" file.
joblib.dump(pipe_baseline, f"{artifacts_dir.as_posix()}/pipe_baseline_{run_stamp}.joblib")
joblib.dump(pipe_baseline, f"{artifacts_dir.as_posix()}/pipe_baseline.joblib")

['artifacts/pipe_baseline_2022-01-18 20:38:17.024131.joblib']

In [69]:
from pathlib import Path

# Predict on data/test.csv with the fitted baseline pipeline; `id` becomes the index
# at read time instead of through an in-place set_index.
date_test = pd.read_csv("data/test.csv", index_col='id')
pred = pipe_baseline.predict(date_test)

# Assemble the submission frame: one row per test id with its predicted label.
ids = date_test.index
data = {'id': ids, 'song_popularity': pred}
df_submission = pd.DataFrame(data)

# Create the output directory on demand and use a filesystem-safe timestamp:
# str(now) contains spaces and colons, which are invalid in Windows file names.
submissions_dir = Path("submissions")
submissions_dir.mkdir(parents=True, exist_ok=True)
submission_stamp = now.strftime("%Y%m%d_%H%M%S")
df_submission.to_csv(submissions_dir / f"data_{submission_stamp}.csv", index=False)