# Final Model : Taking Solution File From Previous MaxEnsemble Model
<hr><br>

**Importing Important Libraries**

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor

from xgboost import XGBRegressor
from scipy import stats
import dask
import dill

ModuleNotFoundError: No module named 'numpy'

**Importing Train and Test Datasets with Solution File From Max Ensemble Model**

In [3]:
# Raw competition data plus the predictions written by the max-ensemble model.
df_train = pd.read_csv("./train.csv")
sample = pd.read_csv("./solution_max_ensemble.csv")

# Attach the ensemble's predictions to the test rows by campaign_id
# (inner join, the pandas default).
df_test = pd.read_csv("./test.csv").merge(sample, on=['campaign_id'])

df_train.head()

Unnamed: 0,campaign_id,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day,category,product,...,is_image,is_personalised,is_quote,is_timer,is_emoticons,is_discount,is_price,is_urgency,target_audience,click_rate
0,1,3,76,10439,39,5,1,Noon,6,26,...,0,0,0,0,0,0,0,0,14,0.103079
1,2,3,54,2570,256,5,1,Morning,2,11,...,0,0,0,0,0,0,0,0,10,0.7
2,3,3,59,12801,16,5,1,Noon,2,11,...,1,0,1,0,0,0,0,0,16,0.002769
3,4,3,74,11037,30,4,0,Evening,15,9,...,0,0,0,0,0,0,0,0,10,0.010868
4,5,3,80,10011,27,5,1,Noon,6,26,...,0,0,1,0,0,0,0,0,14,0.142826


**Generating Mask and Adding into DF**
<hr>

<b>Note:</b> <font color="red">We now append some test-set rows to the training data, labelled with the predictions from the max-ensemble solution file. The rows are chosen by analysing which subsets give an R2 close to or above 0.90.</font>

<b>Mask with R2 Values : </b><br>
<hr>
1. is_weekend == 1 (0.88)<br>
2. day_of_week==2 (0.90)<br>
3. day_of_week==6 (0.91)<br>
4. category==15 (0.91)<br>

In [4]:
# Select the test rows whose ensemble predictions are reused as training labels.
mask = (
    (df_test.is_weekend == 1)
    | df_test.day_of_week.isin([2, 6])
    # NOTE(review): category == 2 is selected here but has no R2 value in the
    # list in the markdown cell above -- confirm it is intentional.
    | df_test.category.isin([15, 2])
)

# Stack the selected test rows under the training rows, shuffle, and renumber.
df = (
    pd.concat([df_train, df_test.loc[mask]], axis=0)
    .sample(frac=1.0)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,campaign_id,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day,category,product,...,is_image,is_personalised,is_quote,is_timer,is_emoticons,is_discount,is_price,is_urgency,target_audience,click_rate
0,2554,3,55,16046,16,6,1,Evening,15,9,...,2,0,0,0,0,0,0,1,12,0.028443
1,112,3,80,17347,15,0,0,Evening,1,5,...,0,0,0,0,1,0,0,0,7,0.0049
2,239,3,65,12934,19,2,0,Evening,2,11,...,1,0,1,0,0,0,0,0,10,0.017537
3,1230,3,60,11379,21,6,1,Noon,9,34,...,1,0,0,0,0,0,0,0,12,0.029618
4,620,3,92,12689,23,1,0,Evening,1,5,...,0,0,1,0,2,0,0,0,12,0.012182


**Data Preprocessing**

In [309]:
def get_one_hot_df(df,column):
    """Replace ``column`` with one-hot indicator columns at the same position.

    The indicator columns are named ``<column>_<level>`` with the level
    lower-cased. The last level (in the order produced by ``pd.get_dummies``)
    is dropped, so ``k`` distinct levels yield ``k - 1`` indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``column`` replaced by its indicator columns.

    Raises
    ------
    ValueError
        If ``column`` is not one of ``df.columns``.
    """
    # Position of the column, so the indicators are inserted where it was.
    column_id = list(df.columns).index(column)

    # dtype is pinned to uint8 (the default before pandas 2.0) so the indicators
    # are 0/1 integers on every pandas version rather than booleans on >= 2.0.
    one_hot = pd.get_dummies(df.loc[:, column], dtype="uint8").iloc[:, :-1]
    one_hot.columns = [column + "_" + str(level).lower() for level in one_hot.columns]

    return pd.concat(
        [df.iloc[:, :column_id], one_hot, df.iloc[:, (column_id + 1):]],
        axis=1,
    )

In [310]:
# Converting Column to One Hot Encoding...
# One-hot encode the time-of-day label in place of the original column.
df = get_one_hot_df(df,"times_of_day")

# Revising binary features: rows whose is_price exceeds 1 are discarded.
# They are filtered out directly instead of being overwritten with NaN first,
# which would upcast the integer column (an implicit cast pandas deprecates).
# Rows where is_price is already missing pass this filter and are removed by
# dropna() below, so the surviving rows are the same as with the NaN approach.
df = df.loc[~(df.is_price > 1)]
df = df.drop(columns=['campaign_id','is_timer'])
df = df.dropna()
# Disabled experiment: z-score outlier filter.
#df = df[(np.abs(stats.zscore(df)) < 3.5).all(axis=1)]
df.head()

Unnamed: 0,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day_evening,times_of_day_morning,category,product,...,mean_CTA_len,is_image,is_personalised,is_quote,is_emoticons,is_discount,is_price,is_urgency,target_audience,click_rate
0,3,110,10593,42,0,0,1,0,15,9,...,27,0,0,0,1,0,0.0,0,16,0.005396
1,10,49,10279,36,1,0,1,0,13,21,...,23,1,0,1,0,0,0.0,0,6,0.052405
2,10,57,9053,39,0,0,1,0,15,1,...,10,1,1,3,0,0,0.0,0,12,0.003136
3,3,63,1086,167,5,1,0,1,15,1,...,19,0,0,0,0,0,0.0,0,2,0.074946
4,3,54,1576,87,3,0,1,0,9,34,...,0,0,1,3,0,0,0.0,0,6,0.0


**Applying Cross-Validation**

In [311]:
@dask.delayed
def check_cross_val(model,df):
    """Score a fresh copy of ``model`` on one random 80/20 hold-out split.

    The frame is shuffled, the first 80% of rows are used for fitting and the
    remaining rows for scoring. The last column of ``df`` is used as the
    target and every other column as a feature.

    Because of the ``dask.delayed`` decorator, calling this returns a lazy
    task; the score is only produced when the task is computed.

    Returns
    -------
    The value of ``model.score`` on the held-out rows.
    """
    # Work on an unfitted clone so the caller's estimator is left untouched.
    estimator = clone(model)

    shuffled = df.sample(frac=1.0)
    cutoff = int(0.8 * len(shuffled))

    features, target = shuffled.iloc[:, :-1], shuffled.iloc[:, -1]
    estimator.fit(features.iloc[:cutoff], target.iloc[:cutoff])
    return estimator.score(features.iloc[cutoff:], target.iloc[cutoff:])

**Building Model & Train Test Split**

In [312]:
# Base learner: a small, depth-limited extra-trees forest.
base_model = ExtraTreesRegressor(
    n_estimators=10,
    max_depth=25,
    min_samples_leaf=4,
    min_samples_split=5,
)

# The base learner is passed positionally on purpose: the keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` afterwards (the
# old name was removed in 1.4), while the first positional slot is the same
# in both.
model = AdaBoostRegressor(base_model, n_estimators=10)

In [313]:
# Reshuffle, then hold out the last 20% of rows for validation.
df = df.sample(frac=1.0)
split = int(0.8 * len(df))

# The last column is the target; everything before it is a feature.
features, target = df.iloc[:, :-1], df.iloc[:, -1]
x_train, y_train = features.iloc[:split], target.iloc[:split]
x_val, y_val = features.iloc[split:], target.iloc[split:]

model.fit(x_train, y_train)

In [314]:
# Score on the rows the model was fitted on (compare with the validation score).
model.score(X=x_train, y=y_train)

0.9496803649529061

In [315]:
# Score on the held-out 20% of rows.
model.score(X=x_val, y=y_val)

0.6966718616180428

**Cross-Validation Score using Dask Framework**

In [None]:
%%time
scores=[ check_cross_val(model,df) for i in range(20)]
scores = [score.compute() for score in dask.persist(*scores)]
print(np.mean(scores))

**Training The Model on Entire Corpus**

In [317]:
# Refit on every available row: all columns but the last are features,
# the last column is the target.
model.fit(X=df.iloc[:, :-1], y=df.iloc[:, -1])

**Saving The Outputs**

In [318]:
# Reload the raw test set: the df_test built earlier was merged with the
# max-ensemble predictions. The path matches the one used when the data was
# first loaded, so a fresh top-to-bottom run reads a single location.
df_test = pd.read_csv("./test.csv")

# Apply the same column transformations that were applied to the training frame.
df_test = get_one_hot_df(df_test,"times_of_day")
df_test = df_test.drop(columns=['is_timer'])

df_test.head()

Unnamed: 0,campaign_id,sender,subject_len,body_len,mean_paragraph_len,day_of_week,is_weekend,times_of_day_evening,times_of_day_morning,category,...,no_of_CTA,mean_CTA_len,is_image,is_personalised,is_quote,is_emoticons,is_discount,is_price,is_urgency,target_audience
0,1889,3,61,12871,11,6,1,0,0,6,...,3,37,1,0,0,1,0,0,0,14
1,1890,3,54,2569,256,5,1,0,1,2,...,0,22,0,0,0,0,0,0,0,10
2,1891,3,88,1473,78,4,0,1,0,2,...,0,22,1,0,1,0,0,0,0,10
3,1892,3,88,1473,78,3,0,1,0,2,...,0,22,1,0,1,0,0,0,0,10
4,1893,3,78,9020,29,3,0,0,0,6,...,2,40,0,0,0,0,0,0,0,14


In [319]:
# Select the features by name, in the exact order the model was trained on
# (every training column except the trailing target). Unlike a positional
# slice, this fails loudly with a KeyError if the test frame lacks a training
# feature, and it stays correct when the cell is re-run after click_rate has
# already been added to df_test.
feature_columns = list(df.columns[:-1])
df_test.loc[:,"click_rate"] = model.predict(df_test.loc[:, feature_columns])

# Submission file: one predicted click_rate per campaign_id.
solution = df_test.loc[:,["campaign_id","click_rate"]]
solution.to_csv("./solution.csv",index=False)