### Importing libraries

In [1]:
# Numerical python library
import numpy as np
# Python DataFrame library
import pandas as pd
# Python Visualisation library
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
#Ignoring warnings
import warnings
warnings.filterwarnings('ignore')
# Stratified Sample split
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import  LinearRegression
from sklearn.cluster import KMeans

from sklearn.metrics import roc_auc_score, roc_curve

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import PolynomialFeatures

from imblearn.over_sampling import SMOTE

In [2]:
seed = 47
np.random.seed(47)

### Importing the datasets

In [3]:
#Train dataset
train = pd.read_csv('train.csv', index_col = 'id')
#Test dataset
test = pd.read_csv('test.csv', index_col = 'id') 
#Sample Submission dataset
sample_sub = pd.read_csv('sample_submission.csv')##Sample Submission dataset

### Splitting Dataset

In [4]:
X = train.drop('song_popularity', axis = 1).copy()
y = train['song_popularity'].copy()


X['song_duration_ms'] =  X['song_duration_ms']/60000
X['tempo'] =  X['tempo']/100
X['NUll_count'] = train.isnull().sum(axis = 1)

test['song_duration_ms'] =  test['song_duration_ms']/60000
test['tempo'] =  test['tempo']/100
test['NUll_count'] = test.isnull().sum(axis = 1)

### Filling Missing values

In [5]:
columns = test.columns

In [6]:
it_imputer = IterativeImputer(max_iter = 50, random_state = seed)

X_it = pd.DataFrame(it_imputer.fit_transform(X), columns = columns)
                            
test_it = pd.DataFrame(it_imputer.transform(test), columns = columns)

### Feature Binning

In [7]:
bin_col = ['energy','liveness','acousticness']
for col in bin_col:
    X_it[f'{col}_bin'] = pd.cut(X_it[col],5, labels=['A','B','C','D','E'])
    test_it[f'{col}_bin'] = pd.cut(test_it[col],5, labels=['A','B','C','D','E'])
    
X_it = pd.get_dummies(X_it, drop_first= True)
test_it = pd.get_dummies(test_it, drop_first= True)

### Clustering

In [8]:
Cluster_1 = ['instrumentalness', 'key', 'liveness','loudness', 'audio_mode']
Cluster_2 = ['song_duration_ms', 'acousticness', 'danceability']
Cluster_3 = ['acousticness', 'danceability', 'energy']
Cluster_4 = ['speechiness', 'tempo', 'time_signature', 'audio_valence']

cluster = KMeans(n_clusters=5, random_state= 2022)

for (clus_name, clus) in [('Cluster_1',Cluster_1),('Cluster_2',Cluster_2),('Cluster_3',Cluster_3),('Cluster_4',Cluster_4)]:
    X_it[clus_name] = cluster.fit_predict(X_it[clus])
    test_it[clus_name] = cluster.fit_predict(test_it[clus])

### Feature Interaction

### SMOTE

### Data Transformation

imputer = SimpleImputer(missing_values = np.nan, strategy = "median", add_indicator= True)
X_filled = pd.DataFrame(imputer.fit_transform(X_train_))

### CROSS VALIDATION FUNCTION

### SEARCHING FOR THE BEST FEATURE INTERACTIONS

# Trying different Models

In [9]:
scale_pos_weight = len(train['song_popularity'][train['song_popularity'].eq(0)])/ len(train['song_popularity'][train['song_popularity'].eq(1)])
skfold = StratifiedKFold(n_splits = 10, random_state = seed, shuffle = True)

### LGBM1

In [10]:
model_LGBM1 = LGBMClassifier(n_estimators=1000,learning_rate=0.01,subsample_freq=5,
                            colsample_bytree=0.8,min_child_samples=10,random_state=465,
                            subsample=0.7,num_leaves=100,metric='auc',
                           max_depth=7,scale_pos_weight= scale_pos_weight, use_missing = True)

train_roc_lgbm1, val_roc_lgbm1 = [], []

train_pred_lgbm1 = np.zeros(len(X_it))
val_pred_lgbm1 = np.zeros(len(X_it))
test_pred_lgbm1 = np.zeros(len(test))
fold = 0


for train_index, val_index in skfold.split(X_it, y):
    X_train = X_it.iloc[train_index] ; y_train = y.iloc[train_index].values
    X_val = X_it.iloc[val_index] ; y_val = y.iloc[val_index].values
    fold += 1
    print(f'***************************Fold :{fold}***********************************************')
    
    model_LGBM1.fit(X_train, y_train, early_stopping_rounds = 200, eval_metric="auc",
                       eval_set=[(X_val, y_val)],verbose=100)
    
    train_pred = model_LGBM1.predict_proba(X_train)[:,1]
    val_pred = model_LGBM1.predict_proba(X_val)[:,1]

    train_pred_lgbm1[train_index] = train_pred
    val_pred_lgbm1[val_index] = val_pred
    
    test_pred_lgbm1 += model_LGBM1.predict_proba(test_it)[:,1]
    
    print(f'Train score : {roc_auc_score(y_train, train_pred)}')
    print(f'Validation score : {roc_auc_score(y_val, val_pred)}\n')
    
   
    train_roc_lgbm1.append(roc_auc_score(y_train, train_pred)); val_roc_lgbm1.append(roc_auc_score(y_val,val_pred))
    
test_pred_lgbm1 = test_pred_lgbm1 / 10
print(f'Training ROC score : {np.mean(train_roc_lgbm1)}')
print(f'Testing ROC score : {np.mean(val_roc_lgbm1)} +/- {np.std(val_roc_lgbm1)}')

***************************Fold :1***********************************************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's auc: 0.566752
[200]	valid_0's auc: 0.568751
[300]	valid_0's auc: 0.570987
[400]	valid_0's auc: 0.572315
[500]	valid_0's auc: 0.570888
[600]	valid_0's auc: 0.569415
Early stopping, best iteration is:
[403]	valid_0's auc: 0.572522
Train score : 0.7820554804878882
Validation score : 0.572522415415728

***************************Fold :2***********************************************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's auc: 0.570091
[200]	valid_0's auc: 0.577319
[300]	valid_0's auc: 0.576774
[400]	valid_0's auc: 0.57496
Early stopping, best iteration is:
[262]	valid_0's auc: 0.57863
Train score : 0.7509601763784859
Validation score : 0.5786303985991395

***************************Fold :3***********************************************
Training until validation scores don't improve for 200 roun

### LGBM 2

In [11]:
model_LGBM2 = LGBMClassifier(n_estimators= 200,learning_rate=0.05,subsample_freq=5,
                            colsample_bytree=0.5,min_child_samples=20,random_state=2022,tree_learner = 'serial',
                            subsample=0.7,num_leaves=20,metric='auc',
                           max_depth=7,scale_pos_weight= scale_pos_weight, use_missing = True, reg_alpha = 0.08, lambda_l1 = 1.04, lambda_l2 = 1)

train_roc_lgbm2, val_roc_lgbm2 = [], []

train_pred_lgbm2 = np.zeros(len(X_it))
val_pred_lgbm2 = np.zeros(len(X_it))
test_pred_lgbm2 = np.zeros(len(test))
fold = 0


for train_index, val_index in skfold.split(X_it, y):
    X_train = X_it.iloc[train_index] ; y_train = y.iloc[train_index].values
    X_val = X_it.iloc[val_index] ; y_val = y.iloc[val_index].values
    fold += 1
    print(f'***************************Fold :{fold}***********************************************')
    
    model_LGBM2.fit(X_train, y_train, early_stopping_rounds = 200, eval_metric="auc",
                       eval_set=[(X_val, y_val)],verbose=100)
    
    train_pred = model_LGBM2.predict_proba(X_train)[:,1]
    val_pred = model_LGBM2.predict_proba(X_val)[:,1]

    train_pred_lgbm2[train_index] = train_pred
    val_pred_lgbm2[val_index] = val_pred
    
    test_pred_lgbm2 += model_LGBM2.predict_proba(test_it)[:,1]
    
    print(f'Train score : {roc_auc_score(y_train, train_pred)}')
    print(f'Validation score : {roc_auc_score(y_val, val_pred)}\n')
    
   
    train_roc_lgbm2.append(roc_auc_score(y_train, train_pred)); val_roc_lgbm2.append(roc_auc_score(y_val,val_pred))
    
test_pred_lgbm2 = test_pred_lgbm2 / 10
print(f'Training ROC score : {np.mean(train_roc_lgbm2)}')
print(f'Testing ROC score : {np.mean(val_roc_lgbm2)} +/- {np.std(val_roc_lgbm2)}')

***************************Fold :1***********************************************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's auc: 0.570381
[200]	valid_0's auc: 0.569246
Did not meet early stopping. Best iteration is:
[90]	valid_0's auc: 0.571446
Train score : 0.6462731365931154
Validation score : 0.5714458061223416

***************************Fold :2***********************************************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's auc: 0.576545
[200]	valid_0's auc: 0.572959
Did not meet early stopping. Best iteration is:
[57]	valid_0's auc: 0.57978
Train score : 0.6314944288370924
Validation score : 0.5797804192055871

***************************Fold :3***********************************************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's auc: 0.573925
[200]	valid_0's auc: 0.572034
Did not meet early stopping. Best iteration is:
[88]	valid_0's auc: 0.574759
Train score : 

### XGBOOST

In [12]:
model_xgb = XGBClassifier(learning_rate=0.01,subsample=0.7,colsample_bytree=0.9,reg_alpha=10,
               n_jobs=-1,n_estimators=1500,max_depth= 5,random_state=34)

train_roc_xgb, val_roc_xgb = [], []

train_pred_xgb = np.zeros(len(X_it))
val_pred_xgb = np.zeros(len(X_it))
test_pred_xgb = np.zeros(len(test))
fold = 0

for train_index, val_index in skfold.split(X_it, y):
    X_train = X_it.iloc[train_index] ; y_train = y.iloc[train_index].values
    X_val = X_it.iloc[val_index] ; y_val = y.iloc[val_index].values
    fold += 1
    print(f'***************************Fold :{fold}***********************************************')
    
    model_xgb.fit(X_train, y_train, early_stopping_rounds = 200, eval_metric="auc",
                       eval_set=[(X_val, y_val)],verbose=250)
    
    train_pred = model_xgb.predict_proba(X_train, ntree_limit = model_xgb.get_booster().best_ntree_limit)[:,1]
    val_pred = model_xgb.predict_proba(X_val, ntree_limit = model_xgb.get_booster().best_ntree_limit)[:,1]

    train_pred_xgb[train_index] = train_pred
    val_pred_xgb[val_index] = val_pred
    
    test_pred_xgb += model_xgb.predict_proba(test_it)[:,1]
    
    print(f'Train score : {roc_auc_score(y_train, train_pred)}')
    print(f'Validation score : {roc_auc_score(y_val, val_pred)}\n')
    
   
    train_roc_xgb.append(roc_auc_score(y_train, train_pred)); val_roc_xgb.append(roc_auc_score(y_val,val_pred))
    
test_pred_xgb = test_pred_xgb / 10
print(f'Training ROC score : {np.mean(train_roc_xgb)}')
print(f'Testing ROC score : {np.mean(val_roc_xgb)} +/- {np.std(val_roc_xgb)}')

***************************Fold :1***********************************************
[0]	validation_0-auc:0.53648
[250]	validation_0-auc:0.56496
[500]	validation_0-auc:0.57039
[750]	validation_0-auc:0.57123
[1000]	validation_0-auc:0.57140
[1250]	validation_0-auc:0.57081
[1375]	validation_0-auc:0.57007
Train score : 0.696283459042818
Validation score : 0.5715710371857989

***************************Fold :2***********************************************
[0]	validation_0-auc:0.53401
[250]	validation_0-auc:0.57864
[500]	validation_0-auc:0.58032
[634]	validation_0-auc:0.58000
Train score : 0.6434232203478575
Validation score : 0.5807882053929786

***************************Fold :3***********************************************
[0]	validation_0-auc:0.53621
[250]	validation_0-auc:0.56966
[500]	validation_0-auc:0.57236
[746]	validation_0-auc:0.57231
Train score : 0.6505152285091139
Validation score : 0.5727426493549116

***************************Fold :4*******************************************

### CATBOOST

In [13]:
model_catboost =  CatBoostClassifier(random_seed=34,bootstrap_type='Bayesian',max_depth=6,learning_rate=0.007,
                          iterations=8000,silent=True,eval_metric='AUC')

train_roc_cat, val_roc_cat = [], []

train_pred_cat = np.zeros(len(X_it))
val_pred_cat = np.zeros(len(X_it))
test_pred_cat = np.zeros(len(test))
fold = 0

for train_index, val_index in skfold.split(X_it, y):
    X_train = X_it.iloc[train_index] ; y_train = y.iloc[train_index].values
    X_val = X_it.iloc[val_index] ; y_val = y.iloc[val_index].values
    fold += 1
    print(f'***************************Fold :{fold}***********************************************')
    
    model_catboost.fit(X_train, y_train, early_stopping_rounds = 200,
                       eval_set=[(X_val, y_val)],verbose=250,use_best_model=True)
    
    train_pred = model_catboost.predict_proba(X_train)[:,1]
    val_pred = model_catboost.predict_proba(X_val)[:,1]

    train_pred_cat[train_index] = train_pred
    val_pred_cat[val_index] = val_pred
    
    test_pred_cat += model_catboost.predict_proba(test_it)[:,1]
    
    print(f'Train score : {roc_auc_score(y_train, train_pred)}')
    print(f'Validation score : {roc_auc_score(y_val, val_pred)}\n')
    
   
    train_roc_cat.append(roc_auc_score(y_train, train_pred)); val_roc_cat.append(roc_auc_score(y_val,val_pred))
    
test_pred_cat = test_pred_cat / 10
print(f'Training ROC score : {np.mean(train_roc_cat)}')
print(f'Testing ROC score : {np.mean(val_roc_cat)} +/- {np.std(val_roc_cat)}')

***************************Fold :1***********************************************
0:	test: 0.5257092	best: 0.5257092 (0)	total: 101ms	remaining: 13m 28s
250:	test: 0.5602347	best: 0.5602676 (247)	total: 8.13s	remaining: 4m 11s
500:	test: 0.5656277	best: 0.5656401 (499)	total: 16.3s	remaining: 4m 3s
750:	test: 0.5683215	best: 0.5683215 (750)	total: 24.4s	remaining: 3m 55s
1000:	test: 0.5699201	best: 0.5699368 (999)	total: 32.6s	remaining: 3m 48s
1250:	test: 0.5707948	best: 0.5709446 (1205)	total: 40.8s	remaining: 3m 40s
1500:	test: 0.5714671	best: 0.5714671 (1500)	total: 49s	remaining: 3m 31s
1750:	test: 0.5718212	best: 0.5719851 (1643)	total: 57.1s	remaining: 3m 23s
2000:	test: 0.5718177	best: 0.5723294 (1917)	total: 1m 5s	remaining: 3m 15s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.5723294408
bestIteration = 1917

Shrink model to first 1918 iterations.
Train score : 0.6768828386617752
Validation score : 0.572329440824409

***************************Fold :2***

### BLENDING

In [14]:
## TRAIN Sdf
Train_stack = pd.concat([
    pd.DataFrame(train_pred_lgbm1),pd.DataFrame(train_pred_lgbm2),
    pd.DataFrame(train_pred_xgb),pd.DataFrame(train_pred_cat)
],axis = 1)
Train_stack.columns = ['LGBM(1)','LGBM(2)','XGB','CATBOOST']

## TEST STACK
Test_stack = pd.concat([
    pd.DataFrame(test_pred_lgbm1),pd.DataFrame(test_pred_lgbm2),
    pd.DataFrame(test_pred_xgb),pd.DataFrame(test_pred_cat)
],axis = 1)
Test_stack.columns = ['LGBM(1)','LGBM(2)','XGB','CATBOOST']


In [15]:
weighted_value = (3*Train_stack['LGBM(1)'] + 3*Train_stack['LGBM(2)'] + 5*Train_stack['XGB'] + 7*Train_stack['CATBOOST']) / 18
roc_auc_score(y,weighted_value)

0.6723150990377728

### Submission file

In [16]:
sample_sub['song_popularity'] = (3*Test_stack['LGBM(1)'] + 3*Test_stack['LGBM(2)'] + 5*Test_stack['XGB'] + 7*Test_stack['CATBOOST']) / 18
sample_sub.to_csv('Blend_1.csv',index= False)