**Important**: As mentioned in the README, to run this notebook make sure files named `train.csv` and `test.csv` are present in the base folder of the runtime environment, so that the Google Colab code can read the training and test data from the relative paths `train.csv` and `test.csv`.

In [None]:
!pip install scikit-learn
!pip install numpy
!pip install pandas



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import sklearn

# Initial data exploration and removing rows with missing values

In [None]:
def get_data_frames() :
  """Read the training and test sets from the current working directory.

  Returns:
    A tuple ``(train_frame, test_frame)`` holding the contents of
    ``train.csv`` and ``test.csv`` as pandas DataFrames.
  """
  train_frame, test_frame = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
  return (train_frame, test_frame)

def clean_data(data, is_data_to_predict) :
  """Split off the ``Id`` column and, for training data, drop incomplete rows.

  Args:
    data: DataFrame containing an ``Id`` column.
    is_data_to_predict: when truthy the rows are kept as they are; otherwise
      every row containing at least one missing value is removed.

  Returns:
    A tuple ``(frame_without_id, Ids)``. ``Ids`` is read before any row is
    dropped, so it always has one entry per input row.
  """
  Ids = data['Id']
  without_id = data.drop(columns=['Id'])
  if is_data_to_predict :
    return without_id, Ids
  return without_id.dropna(), Ids

def get_X_y(data) :
  """Separate the features from the ``Concert Enjoyment`` target column.

  Args:
    data: DataFrame that contains the target column.

  Returns:
    A tuple ``(X, y)``: ``X`` is ``data`` without the target column and
    ``y`` is the target column itself.
  """
  target_variable = 'Concert Enjoyment'
  features = data.drop(target_variable, axis=1)
  return features, data[target_variable]


In [None]:
train_data, test_data = get_data_frames()
number_of_row = train_data.shape[0]

# A row counts as invalid as soon as one of its columns is missing.
na_rows = train_data.isna().any(axis=1)
number_of_invalid_row = na_rows.sum()

print('% of invalid lines', (number_of_invalid_row/ number_of_row) * 100, ' %')


% of invalid lines 8.221764705882354  %


In [None]:
# Column dtypes and non-null counts of the training set, followed by the
# raw values of the 'Rain' column.
train_data.info()
print(train_data['Rain'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170000 entries, 0 to 169999
Data columns (total 19 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Id                              170000 non-null  object 
 1   Band Name                       169141 non-null  object 
 2   Band Genre                      169116 non-null  object 
 3   Band Country of Origin          169210 non-null  object 
 4   Band Debut                      169143 non-null  float64
 5   Concert ID                      169130 non-null  float64
 6   Concert Attendance              169105 non-null  float64
 7   Inside Venue                    169162 non-null  object 
 8   Rain                            169139 non-null  object 
 9   Seated                          169168 non-null  object 
 10  Personnality Trait 1            169148 non-null  float64
 11  Personnality Trait 2            169151 non-null  float64
 12  Personnality Tra

In [None]:
x = ['Band Name', 'Band Genre', 'Band Country of Origin', 'Concert Goer Country of Origin','Concert ID', 'Concert Goer ID']
# Number of distinct values per column; a missing value counts as one value.
for col in x :
  print(train_data[col].nunique(dropna=False))


55
9
5
153
1001
2001


In [None]:
# Column dtypes and non-null counts of the test set.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Id                              30000 non-null  object 
 1   Band Name                       29864 non-null  object 
 2   Band Genre                      29862 non-null  object 
 3   Band Country of Origin          29843 non-null  object 
 4   Band Debut                      29843 non-null  float64
 5   Concert ID                      29842 non-null  float64
 6   Concert Attendance              29848 non-null  float64
 7   Inside Venue                    29834 non-null  object 
 8   Rain                            29857 non-null  object 
 9   Seated                          29858 non-null  object 
 10  Personnality Trait 1            29865 non-null  float64
 11  Personnality Trait 2            29847 non-null  float64
 12  Personnality Trait 3            

In [None]:
# Distinct values of the three identifier columns (a missing value counts as one value).
number_of_unique_goer_id = train_data['Concert Goer ID'].nunique(dropna=False)
number_of_unique_id = train_data['Id'].nunique(dropna=False)
number_of_unique_concert_id = train_data['Concert ID'].nunique(dropna=False)

print(number_of_unique_goer_id)
print(number_of_unique_id)
print(number_of_unique_concert_id)



2001
170000
1001


In [None]:
# Class distribution of the target in the training data.
train_data['Concert Enjoyment'].value_counts()

Enjoyed               68026
Did Not Enjoy         67945
Best Concert Ever     17027
Worst Concert Ever    17002
Name: Concert Enjoyment, dtype: int64

In [None]:
# clean_data returns a (frame, Ids) tuple: keep only the frame. Passing False
# marks this as training data, so rows with missing values are dropped.
train_data = clean_data(train_data, False)[0]


In [None]:
# Class distribution of the target after the clean_data call above.
train_data['Concert Enjoyment'].value_counts()

Enjoyed               62514
Did Not Enjoy         62327
Worst Concert Ever    15596
Best Concert Ever     15586
Name: Concert Enjoyment, dtype: int64

# 1 - Data Preprocessing


In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

def preprocess_target_data(y_data) :
  """Encode the enjoyment labels as ordinal numbers, worst to best.

  Args:
    y_data: pandas Series of labels taken from 'Worst Concert Ever',
      'Did Not Enjoy', 'Enjoyed' and 'Best Concert Ever'.

  Returns:
    A tuple ``(encoded, encoder)``: the 1-D array of encoded labels and the
    fitted OrdinalEncoder (needed later to decode predictions).
  """
  order = ['Worst Concert Ever','Did Not Enjoy','Enjoyed','Best Concert Ever']
  encoder = OrdinalEncoder(categories=[order])
  # The encoder expects a 2-D input: one column holding the labels.
  label_column = y_data.to_numpy().reshape(-1,1)
  encoded = encoder.fit_transform(label_column)
  # Flatten the single encoded column back to one value per sample.
  return encoded.reshape(len(encoded)), encoder

def decode_predictions(y_predicted, encoder) :
  """Map encoded predictions back to their original labels.

  Args:
    y_predicted: encoded predictions, in the layout ``encoder`` expects.
    encoder: fitted encoder exposing ``inverse_transform``.

  Returns:
    Whatever ``encoder.inverse_transform(y_predicted)`` returns.
  """
  return encoder.inverse_transform(y_predicted)

def get_features_preprocessor() :
  """Build the (unfitted) ColumnTransformer used to preprocess the features.

  Two groups of columns are handled:
    * numeric columns: mean imputation, standard scaling, then a PCA
      configured with ``n_components=0.95``;
    * categorical / identifier columns: most-frequent imputation, then
      one-hot encoding.

  Columns that belong to neither group are not given to any transformer.
  NOTE(review): 'Inside Venue', 'Rain' and 'Seated' are in neither list, so
  with ColumnTransformer's default ``remainder`` they are presumably
  dropped - confirm this is intended.

  Returns:
    The unfitted ``ColumnTransformer``.
  """
  numeric_features = ["Band Debut", "Concert Attendance", "Personnality Trait 1","Personnality Trait 2","Personnality Trait 3", "Personnality Trait 4","Concert Goer Age","Height (cm)"]
  numeric_transformer = Pipeline(
      steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler()), ("pca", PCA(n_components=0.95))]
  )

  # Categorical and identifier columns are all one-hot encoded, including the
  # high-cardinality 'Concert ID' and 'Concert Goer ID' columns.
  ohe_features = ['Band Name', 'Band Genre', 'Band Country of Origin', 'Concert Goer Country of Origin','Concert ID', 'Concert Goer ID']
  ohe_transformer = Pipeline(
      steps=[
          ("imputer", SimpleImputer(strategy="most_frequent")),
          # handle_unknown="ignore": a category that only appears in the data
          # to predict is encoded as all zeros instead of raising at transform time.
          ("OneHotEncoder", OneHotEncoder(handle_unknown="ignore")),
      ]
  )

  preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("ohe", ohe_transformer, ohe_features),
    ])

  return preprocessor

def find_svd_components(preprocessed_X, n_components=1000, step=50, target_variance=0.90, random_state=None) :
  """Fit a TruncatedSVD that explains at least ``target_variance`` of the variance.

  The search starts at ``n_components`` and adds ``step`` components until the
  summed explained-variance ratio reaches ``target_variance`` or every feature
  is used. With the defaults the first attempt is the same as before
  (1000 components, 0.90 target).

  Args:
    preprocessed_X: 2-D (dense or sparse) feature matrix exposing ``shape``.
    n_components: number of components of the first attempt; capped at the
      number of features of ``preprocessed_X``.
    step: number of components added between two attempts (must be >= 1).
    target_variance: explained-variance ratio at which the search stops.
    random_state: forwarded to ``TruncatedSVD``; set it for reproducible fits.

  Returns:
    The last fitted ``TruncatedSVD``.

  Raises:
    ValueError: if ``step`` is lower than 1.
  """
  if step < 1 :
    raise ValueError('step must be >= 1')

  # TruncatedSVD cannot extract more components than there are features.
  max_components = preprocessed_X.shape[1]
  n_components = min(n_components, max_components)

  while True :
    svd = TruncatedSVD(n_components = n_components, random_state = random_state)
    svd.fit(preprocessed_X)
    explained_variance = svd.explained_variance_ratio_.sum()
    print('ncomponent', n_components, 'explained ', explained_variance)
    if explained_variance >= target_variance or n_components >= max_components :
      break
    n_components = min(n_components + step, max_components)

  # Report the size of the SVD that is actually returned.
  print('svd components :', n_components)
  return svd


def preprocess_features(X, X_to_predict) :
  """Encode both feature sets, then reduce them with the same SVD.

  The column preprocessor and the SVD are fitted on ``X`` only;
  ``X_to_predict`` is only transformed.

  Args:
    X: training features.
    X_to_predict: features of the samples to predict.

  Returns:
    A tuple ``(X, X_to_predict)`` of the reduced feature matrices.
  """
  column_preprocessor = get_features_preprocessor()
  encoded_train = column_preprocessor.fit_transform(X)
  encoded_to_predict = column_preprocessor.transform(X_to_predict)

  svd = find_svd_components(encoded_train)
  return svd.transform(encoded_train), svd.transform(encoded_to_predict)



In [None]:

def preprocess_data(data, data_to_predict) :
  """Run the whole preprocessing chain on the training and prediction frames.

  Args:
    data: raw training DataFrame (features, target and ``Id``).
    data_to_predict: raw DataFrame of the samples to predict.

  Returns:
    A tuple ``((X, y), (X_to_predict, Ids), y_encoder)`` where ``Ids`` are the
    identifiers of the samples to predict and ``y_encoder`` is the fitted
    target encoder.
  """
  # Only the training frame loses its incomplete rows.
  train_frame, _ = clean_data(data, False)
  predict_frame, Ids = clean_data(data_to_predict, True)

  raw_X, raw_y = get_X_y(train_frame)
  X, X_to_predict = preprocess_features(raw_X, predict_frame)
  y, y_encoder = preprocess_target_data(raw_y)

  return (X, y), (X_to_predict, Ids), y_encoder



In [None]:
def get_preprocessed_data() :
  """Load both CSV files and return the result of ``preprocess_data`` on them."""
  return preprocess_data(*get_data_frames())

data, data_to_predict, y_encoder = get_preprocessed_data()
# Both results are tuples: unpack them into the arrays used by the model cells.
(X, y), (X_to_predict, Ids) = data, data_to_predict

for array in (X, y, X_to_predict) :
  print(array.shape)

ncomponent 1000 explained  0.9174150281677615
svd components : 1040
(156023, 1000)
(156023,)
(30000, 1000)


# 2 - Model Selection

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set. The rows are shuffled before the
# split (shuffle is True by default); random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

def grid_search(configurations, X_train, y_train, X_test, y_test, is_base_models) :
  """Fit every configured classifier and print its micro-F1 score on the test split.

  Args:
    configurations: iterable of ``(classifier_class, params)`` pairs.
    X_train, y_train: data used to fit the models.
    X_test, y_test: held-out data used for the printed test score.
    is_base_models: when truthy, ``params`` are constructor keyword arguments
      and the classifier is fitted once. Otherwise ``params`` is a parameter
      grid explored by a 3-fold ``GridSearchCV`` scored with ``f1_micro``.

  Returns:
    The list of fitted objects, in the order of ``configurations``: plain
    classifiers for base models, fitted ``GridSearchCV`` objects otherwise.
  """
  best_trained_models = []
  for clf, h_parameter in configurations :
    print(100*'-')
    print("Grid search for model :  ",clf, '\n')

    if is_base_models :
      model = clf(**h_parameter)
      model.fit(X_train, y_train)
    else :
      model = GridSearchCV(estimator = clf(),param_grid = h_parameter, cv=3, scoring='f1_micro', verbose = 10, refit = True)# n_jobs = -1)
      model.fit(X_train, y_train)
      print('Val score : ', model.best_score_)
      # best_params_ is the winning combination; get_params() would only echo
      # the GridSearchCV constructor arguments.
      print('Best params : ', model.best_params_)

    # With refit=True the search object predicts with its refitted best estimator.
    best_model_predictions = model.predict(X_test)
    best_trained_models.append(model)

    test_score = f1_score(y_test, best_model_predictions, average='micro')
    print("Retrained model with all training set Test score : ", test_score)

  return best_trained_models

## 2.1 Study models with base params

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Every classifier is first fitted with its default hyper-parameters
# (an empty dict of constructor arguments) to get a baseline score.
base_params = {}
classifiers_config = [
    (RandomForestClassifier, base_params),
    (AdaBoostClassifier, base_params),
    (GradientBoostingClassifier, base_params),
    (MLPClassifier, base_params)
]

best_trained_models = grid_search(classifiers_config, X_train, y_train, X_test, y_test, True )

----------------------------------------------------------------------------------------------------
Grid search for model :   <class 'sklearn.ensemble._forest.RandomForestClassifier'> 



KeyboardInterrupt: ignored

## 2.2 MLP params tuning


In [None]:
from sklearn.neural_network import MLPClassifier
# Each entry of 'hidden_layer_sizes' is a tuple with one size per hidden layer.
# Single-layer networks need the trailing comma: (1000) is just the int 1000.
mlp_params = {
    'hidden_layer_sizes': [
        (1000,), (500,), (50,50,50), (30,30,30,30,30),
        (100,), (200,), (300,), (100,100), (200,200), (300,300),
    ],
    'early_stopping' : [True],
    'alpha' : [0.0001],
    'solver': ['adam'],
    'max_iter': [1000],
}

classifiers_config = [
    (MLPClassifier, mlp_params)
]

best_trained_models = grid_search(classifiers_config, X_train, y_train, X_test, y_test, False)

----------------------------------------------------------------------------------------------------
Grid search for model :   <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'> 

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3; 1/10] START alpha=0.0001, early_stopping=True, hidden_layer_sizes=1000, max_iter=1000, solver=adam
[CV 1/3; 1/10] END alpha=0.0001, early_stopping=True, hidden_layer_sizes=1000, max_iter=1000, solver=adam;, score=0.648 total time= 4.5min
[CV 2/3; 1/10] START alpha=0.0001, early_stopping=True, hidden_layer_sizes=1000, max_iter=1000, solver=adam
[CV 2/3; 1/10] END alpha=0.0001, early_stopping=True, hidden_layer_sizes=1000, max_iter=1000, solver=adam;, score=0.644 total time= 4.3min
[CV 3/3; 1/10] START alpha=0.0001, early_stopping=True, hidden_layer_sizes=1000, max_iter=1000, solver=adam
[CV 3/3; 1/10] END alpha=0.0001, early_stopping=True, hidden_layer_sizes=1000, max_iter=1000, solver=adam;, score=0.645 total time= 4.3min

## 2.3 Gradient Boosting params tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Grid explored for gradient boosting: number of estimators x tree depth.
gbc_params = dict(
    n_estimators=[100, 300, 500, 1000],
    max_depth=[3, 5, 6],
)

classifiers_config = [(GradientBoostingClassifier, gbc_params)]

best_trained_models = grid_search(classifiers_config, X_train, y_train, X_test, y_test, False)

----------------------------------------------------------------------------------------------------
Grid search for model :   <class 'sklearn.ensemble._gb.GradientBoostingClassifier'> 

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3; 1/12] START max_depth=3, n_estimators=100..............................
[CV 1/3; 1/12] END max_depth=3, n_estimators=100;, score=0.626 total time=416.6min
[CV 2/3; 1/12] START max_depth=3, n_estimators=100..............................


In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# Random forest is tuned on its number of trees; SVC gets an empty grid,
# i.e. only its default settings go through cross-validation.
rfc_params = dict(n_estimators=[50, 75, 100, 150, 200, 300])

classifiers_config = [
    (RandomForestClassifier, rfc_params),
    (SVC, dict()),
]

best_trained_models = grid_search(classifiers_config, X_train, y_train, X_test, y_test, False)

----------------------------------------------------------------------------------------------------
Grid search for model :   <class 'sklearn.ensemble._forest.RandomForestClassifier'> 

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3; 1/6] START n_estimators=50.............................................
[CV 1/3; 1/6] END ..............n_estimators=50;, score=0.626 total time= 3.8min
[CV 2/3; 1/6] START n_estimators=50.............................................
[CV 2/3; 1/6] END ..............n_estimators=50;, score=0.622 total time= 3.9min
[CV 3/3; 1/6] START n_estimators=50.............................................
[CV 3/3; 1/6] END ..............n_estimators=50;, score=0.622 total time= 3.8min
[CV 1/3; 2/6] START n_estimators=75.............................................
[CV 1/3; 2/6] END ..............n_estimators=75;, score=0.628 total time= 5.9min
[CV 2/3; 2/6] START n_estimators=75.............................................
[CV 2/3; 2/6] END ......

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Second random-forest search, this time with larger numbers of trees.
rfc_params = dict(n_estimators=[500, 750, 1000, 1500])

classifiers_config = [(RandomForestClassifier, rfc_params)]

best_trained_models = grid_search(classifiers_config, X_train, y_train, X_test, y_test, False)

----------------------------------------------------------------------------------------------------
Grid search for model :   <class 'sklearn.ensemble._forest.RandomForestClassifier'> 

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3; 1/4] START n_estimators=500............................................
[CV 1/3; 1/4] END .............n_estimators=500;, score=0.637 total time=33.0min
[CV 2/3; 1/4] START n_estimators=500............................................
[CV 2/3; 1/4] END .............n_estimators=500;, score=0.630 total time=32.8min
[CV 3/3; 1/4] START n_estimators=500............................................
[CV 3/3; 1/4] END .............n_estimators=500;, score=0.632 total time=32.9min
[CV 1/3; 2/4] START n_estimators=750............................................
[CV 1/3; 2/4] END .............n_estimators=750;, score=0.636 total time=49.0min
[CV 2/3; 2/4] START n_estimators=750............................................
[CV 2/3; 2/4] END ......

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest search with class_weight='balanced' for every candidate.
rfc_params = dict(
    n_estimators=[100, 300, 500, 600],
    class_weight=['balanced'],
)

classifiers_config = [(RandomForestClassifier, rfc_params)]

best_trained_models = grid_search(classifiers_config, X_train, y_train, X_test, y_test, False)

----------------------------------------------------------------------------------------------------
Grid search for model :   <class 'sklearn.ensemble._forest.RandomForestClassifier'> 

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3; 1/4] START class_weight=balanced, n_estimators=100.....................
[CV 1/3; 1/4] END class_weight=balanced, n_estimators=100;, score=0.631 total time= 5.7min
[CV 2/3; 1/4] START class_weight=balanced, n_estimators=100.....................
[CV 2/3; 1/4] END class_weight=balanced, n_estimators=100;, score=0.625 total time= 5.7min
[CV 3/3; 1/4] START class_weight=balanced, n_estimators=100.....................
[CV 3/3; 1/4] END class_weight=balanced, n_estimators=100;, score=0.627 total time= 5.7min
[CV 1/3; 2/4] START class_weight=balanced, n_estimators=300.....................
[CV 1/3; 2/4] END class_weight=balanced, n_estimators=300;, score=0.632 total time=17.2min
[CV 2/3; 2/4] START class_weight=balanced, n_estimators=300......

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours search over the neighbourhood size.
knn_params = dict(n_neighbors=[25, 50, 150, 200, 250, 500])

classifiers_config = [(KNeighborsClassifier, knn_params)]

best_trained_models = grid_search(classifiers_config, X_train, y_train, X_test, y_test, False)

----------------------------------------------------------------------------------------------------
Grid search for model :   <class 'sklearn.neighbors._classification.KNeighborsClassifier'> 

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3; 1/6] START n_neighbors=25..............................................
[CV 1/3; 1/6] END ...............n_neighbors=25;, score=0.604 total time= 2.1min
[CV 2/3; 1/6] START n_neighbors=25..............................................
[CV 2/3; 1/6] END ...............n_neighbors=25;, score=0.602 total time= 2.0min
[CV 3/3; 1/6] START n_neighbors=25..............................................
[CV 3/3; 1/6] END ...............n_neighbors=25;, score=0.604 total time= 2.1min
[CV 1/3; 2/6] START n_neighbors=50..............................................
[CV 1/3; 2/6] END ...............n_neighbors=50;, score=0.605 total time= 2.1min
[CV 2/3; 2/6] START n_neighbors=50..............................................
[CV 2/3; 2/6] END

# Report Generation

In [None]:
from google.colab import files
def generate_predictions_report(best_trained_models, data_to_predict, y_encoder) :
  """Write one CSV of decoded predictions per model and trigger its download.

  Args:
    best_trained_models: fitted models exposing ``predict``.
    data_to_predict: tuple ``(X_to_predict, Ids)``.
    y_encoder: fitted target encoder used to turn predictions back into labels.

  For the model at position ``i`` the report is saved as ``{i}_report.csv``
  with the columns ``Id`` and ``Predicted``, then handed to
  ``files.download``.
  """
  features, Ids = data_to_predict
  for position, fitted_model in enumerate(best_trained_models) :
    encoded = fitted_model.predict(features)
    # The encoder works on a 2-D column; flatten again once decoded.
    labels = decode_predictions(encoded.reshape(-1,1), y_encoder).reshape(len(encoded))

    report_path = f'{position}_report.csv'
    pd.DataFrame({'Id': Ids, 'Predicted' : labels }).to_csv(report_path, index = False)
    files.download(report_path)

# Write and download one CSV report per model currently held in best_trained_models.
generate_predictions_report(best_trained_models, data_to_predict, y_encoder)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>