# Initial Model Build-Up: The Max Ensemble Model
<hr>
Note: This notebook builds the initial model and generates the initial solution CSV for the test dataset.<br><hr>

**Importing All Libraries**

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm
    
from xgboost import XGBRegressor
from scipy import stats
import dask
from dask.distributed import Client
import dill

**Loading Datasets**

In [None]:
# Load the raw train/test CSVs for a first look.
# Paths match the "./Datasets/" location used by the preprocessing cells
# further down, so the notebook reads its data from a single place on a
# fresh Restart & Run All.
df = pd.read_csv("./Datasets/train.csv")
df_test = pd.read_csv("./Datasets/test.csv")
df.head()

**Converting a Column to a One-Hot Encoding**

In [6]:
def get_one_hot_df(df,column):
    """
    Replace `column` in `df` with its one-hot (dummy) columns.

    The dummy columns come from `pd.get_dummies`; the last one it returns is
    dropped, so a column with k distinct values yields k-1 new columns. The
    new columns are named "<column>_<value lower-cased>" and are inserted at
    the position the original column occupied.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with `column` replaced by its dummy columns.
    """
    # Position of the column being replaced, so the dummies land in its place.
    position = df.columns.tolist().index(column)

    # Build the dummies and discard the last level.
    dummies = pd.get_dummies(df.loc[:, column])
    dummies = dummies.iloc[:, :-1]
    dummies.columns = [column + "_" + str(level).lower() for level in dummies.columns]

    # Stitch together: columns before, the dummies, columns after.
    before = df.iloc[:, :position]
    after = df.iloc[:, (position + 1):]
    return pd.concat([before, dummies, after], axis=1)

In [7]:
# Reload the training data and one-hot encode the time-of-day column.
df = pd.read_csv("./Datasets/train.csv")
df = get_one_hot_df(df, "times_of_day")

# Revising binary features: any is_price value above 1 is blanked out to NaN
# so that the dropna() below removes those rows (along with any other row
# that contains a missing value).
invalid_price = df["is_price"] > 1
df.loc[invalid_price, "is_price"] = np.nan

# campaign_id and is_timer are not used as features.
df = df.drop(columns=["campaign_id", "is_timer"]).dropna()
df.head()

Unnamed: 0,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day_evening,times_of_day_morning,category,product,...,mean_CTA_len,is_image,is_personalised,is_quote,is_emoticons,is_discount,is_price,is_urgency,target_audience,click_rate
0,3,76,10439,39,5,1,0,0,6,26,...,29,0,0,0,0,0,0.0,0,14,0.103079
1,3,54,2570,256,5,1,0,1,2,11,...,22,0,0,0,0,0,0.0,0,10,0.7
2,3,59,12801,16,5,1,0,0,2,11,...,23,1,0,1,0,0,0.0,0,16,0.002769
3,3,74,11037,30,4,0,1,0,15,9,...,24,0,0,0,0,0,0.0,0,10,0.010868
4,3,80,10011,27,5,1,0,0,6,26,...,31,0,0,1,0,0,0.0,0,14,0.142826


**Writing the Custom Model Class**

<hr>Note: This is an ensemble model based on our manual feature analysis, performed by inspecting the data in Excel.<br><hr><br>

In [9]:
class MaxEnsemble():
    """
    Ensemble that routes each row to a model chosen by its "product" value.

    Rows whose "product" is 9, 34, 1 or 13 are predicted by a dedicated
    RandomForestRegressor trained only on rows of that product. Every other
    row is predicted by a general AdaBoost-over-RandomForest model that is
    trained on the whole dataset.
    """

    def __init__(self):
        """
        Generate The MaxEnsemble Class Object which carries ensemble of models.
        """
        # Model for Product==9 Mask
        self.product_9 = RandomForestRegressor(n_estimators=100,min_samples_leaf=3,min_samples_split=3,max_depth=20,max_features=5)
        # Model for Product==34 Mask
        self.product_34 = RandomForestRegressor(n_estimators=100,min_samples_leaf=3,min_samples_split=3,max_depth=20,max_features=5)
        # Model for Product==1 Mask
        self.product_1 = RandomForestRegressor(n_estimators=100,min_samples_leaf=3,min_samples_split=3,max_depth=10,max_features=5)
        # Model for Product==13 Mask
        self.product_13 = RandomForestRegressor(n_estimators=100,min_samples_leaf=3,min_samples_split=3,max_depth=10,max_features=3)
        # Model for All Other Mask.
        # The wrapped estimator is passed positionally: scikit-learn renamed
        # the keyword from `base_estimator` to `estimator` (1.2) and later
        # removed the old name, so a positional argument works on both sides.
        self.product_all = AdaBoostRegressor(RandomForestRegressor(n_estimators=100),n_estimators=20)

    def _specialised_models(self):
        """Map each specially-handled product id to its dedicated model."""
        return {
            9: self.product_9,
            34: self.product_34,
            1: self.product_1,
            13: self.product_13,
        }

    def fit(self,x,y):
        """
        Train The Ensemble of Models with the Dataset using Dask Framework.

        Parameters
        ----------
        x : pandas.DataFrame
            Feature frame; must contain a "product" column.
        y : pandas.Series
            Target values aligned with `x`.

        Returns
        -------
        MaxEnsemble
            This instance, with the general model and every dedicated model
            that had training rows fitted.
        """
        product = x.loc[:,"product"]

        # The general model always trains on the full dataset.
        tasks = [dask.delayed(self.product_all.fit)(x,y)]

        trained = set()
        for product_id, model in self._specialised_models().items():
            mask = (product == product_id)
            if not mask.any():
                # No rows for this product: fitting would raise, so leave
                # the model untrained and let predict() use the general one.
                continue
            tasks.append(dask.delayed(model.fit)(x[mask],y[mask]))
            trained.add(product_id)

        # compute() on the threaded scheduler blocks until every fit is done
        # and runs the bound `fit` methods in this process, so the estimators
        # held by `self` are the ones that end up trained, even when a
        # distributed Client is registered as the default scheduler.
        dask.compute(*tasks, scheduler="threads")

        self._trained_products = trained
        return self

    def predict(self,x):
        """
        Get Predictions based on Input from all the Ensemble Models

        Parameters
        ----------
        x : pandas.DataFrame
            Feature frame with the same columns used in `fit`; not modified.

        Returns
        -------
        pandas.Series
            Predictions named "click_rate", indexed like `x`.
        """
        # Start from the general model's prediction for every row...
        predictions = np.array(self.product_all.predict(x), dtype=float)

        product = x.loc[:,"product"]
        # Instances whose models were fitted without going through fit()
        # carry no record of trained products; treat all of them as trained.
        trained = getattr(self, "_trained_products", None)

        # ...then overwrite the rows that belong to a dedicated model.
        for product_id, model in self._specialised_models().items():
            if trained is not None and product_id not in trained:
                continue
            mask = (product == product_id).to_numpy()
            if mask.any():
                predictions[mask] = model.predict(x[mask])

        return pd.Series(predictions, index=x.index, name="click_rate")

**Get the Validation R2 Score of MaxEnsemble (Repeated Random 80/20 Hold-Out)**

In [7]:
def get_score(df, train_frac=0.8, random_state=None):
    """
    Score a freshly trained MaxEnsemble on one random hold-out split.

    The rows of `df` are shuffled, the first `train_frac` share is used for
    training and the remainder for validation. The last column of `df` is
    taken as the target and all other columns as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed data with the target in the last column.
    train_frac : float, default 0.8
        Share of rows used for training; must be strictly between 0 and 1.
    random_state : int or None, default None
        Seed forwarded to `DataFrame.sample` for the shuffle. This fixes the
        split only; the models themselves are still randomly initialised.

    Returns
    -------
    float
        R2 score of the predictions on the validation rows.

    Raises
    ------
    ValueError
        If `train_frac` is not strictly between 0 and 1.
    """
    if not 0 < train_frac < 1:
        raise ValueError("train_frac must be strictly between 0 and 1")

    model = MaxEnsemble()

    # Train Test Split...
    shuffled = df.sample(frac=1.0, random_state=random_state)
    split = int(train_frac*len(shuffled))
    x_train,y_train = shuffled.iloc[:split,:-1],shuffled.iloc[:split,-1]
    x_val,y_val = shuffled.iloc[split:,:-1],shuffled.iloc[split:,-1]

    # Training The Model...
    model.fit(x_train,y_train)

    ypred = model.predict(x_val).values
    ytrue = y_val.values
    return r2_score(ytrue,ypred)

In [8]:
# Repeat the randomised hold-out evaluation ten times and report the mean R2.
scores = [get_score(df) for _ in range(10)]
print(np.mean(scores))

0.5853389827281978


**Training The Model on Entire Dataset**

In [11]:
%%time
model = MaxEnsemble()
model.fit(df.iloc[:,:-1],df.iloc[:,-1])

CPU times: user 16.9 s, sys: 122 ms, total: 17 s
Wall time: 16.4 s


**Saving The Solutions**

In [14]:
# Preprocessing For Testing Data....
# Same steps as for the training data, except that campaign_id is kept
# because it is needed for the solution file.
# NOTE(review): the dummy columns are derived from the test file on its own;
# this assumes it contains the same times_of_day values as the training file
# - confirm.
df_test = pd.read_csv("./Datasets/test.csv")
df_test = get_one_hot_df(df_test, "times_of_day").drop(columns=["is_timer"])
df_test.head()

Unnamed: 0,campaign_id,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day_evening,times_of_day_morning,category,...,no_of_CTA,mean_CTA_len,is_image,is_personalised,is_quote,is_emoticons,is_discount,is_price,is_urgency,target_audience
0,1889,3,61,12871,11,6,1,0,0,6,...,3,37,1,0,0,1,0,0,0,14
1,1890,3,54,2569,256,5,1,0,1,2,...,0,22,0,0,0,0,0,0,0,10
2,1891,3,88,1473,78,4,0,1,0,2,...,0,22,1,0,1,0,0,0,0,10
3,1892,3,88,1473,78,3,0,1,0,2,...,0,22,1,0,1,0,0,0,0,10
4,1893,3,78,9020,29,3,0,0,0,6,...,2,40,0,0,0,0,0,0,0,14


In [15]:
# Saving Output to a CSV File

# Pick the feature columns by name rather than by position. This keeps the
# cell re-runnable: on a second run df_test already carries the "click_rate"
# column added below, and it must not be fed back to the model as a feature.
test_features = df_test.drop(columns=["campaign_id", "click_rate"], errors="ignore")

df_test.loc[:,"click_rate"] = model.predict(test_features)
solution = df_test.loc[:,["campaign_id","click_rate"]]
solution.to_csv("./Datasets/solution_max_ensemble.csv",index=False)