In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
def clean_value(value):
    if value != 0:
        return 1
    return 0

In [4]:
train['is_quote'].unique()

array([0, 1, 2, 3, 4, 6, 5], dtype=int64)

In [5]:
train['sender'] = train['sender'].map(train['sender'].value_counts().to_dict())
train['times_of_day'] = train['times_of_day'].replace({'Morning': 0, 'Noon':1, 'Evening': 2})
train['is_image'] = train['is_image'].apply(clean_value)
# train['is_emoticons'] = train['is_emoticons'].map(train['is_emoticons'].value_counts().to_dict())
train['is_emoticons'] = train['is_emoticons'].apply(clean_value)
train['is_price'] = train['is_price'].apply(clean_value)
train['is_quote'] = train['is_quote'].apply(clean_value)

In [6]:
dum1=pd.get_dummies(train['sender'],prefix='sender')
dum2=pd.get_dummies(train['times_of_day'],prefix='times_of_day')
dum3=pd.get_dummies(train['is_image'],prefix='is_image')
dum4=pd.get_dummies(train['is_emoticons'],prefix='is_emoticons')
dum5=pd.get_dummies(train['is_price'],prefix='is_price')
dum6=pd.get_dummies(train['day_of_week'],prefix='day_of_week')
dum7=pd.get_dummies(train['is_weekend'],prefix='is_weekend')
dum8=pd.get_dummies(train['category'],prefix='category')
dum9=pd.get_dummies(train['product'],prefix='product')
dum10=pd.get_dummies(train['is_personalised'],prefix='is_personalised')
dum11=pd.get_dummies(train['is_quote'],prefix='is_quote')

train = pd.concat([train,dum1,dum2,dum3,dum4, dum5, dum6, dum7, dum8, dum9, dum10, dum11],axis=1)

In [18]:
test1=pd.get_dummies(train['sender'],prefix='sender')
test2=pd.get_dummies(train['times_of_day'],prefix='times_of_day')
test3=pd.get_dummies(train['is_image'],prefix='is_image')
test4=pd.get_dummies(train['is_emoticons'],prefix='is_emoticons')
test5=pd.get_dummies(train['is_price'],prefix='is_price')
test6=pd.get_dummies(train['day_of_week'],prefix='day_of_week')
test7=pd.get_dummies(train['is_weekend'],prefix='is_weekend')
test8=pd.get_dummies(train['category'],prefix='category')
test9=pd.get_dummies(train['product'],prefix='product')
test10=pd.get_dummies(train['is_personalised'],prefix='is_personalised')
test11=pd.get_dummies(train['is_quote'],prefix='is_quote')

test = pd.concat([train,test1,test2,test3,test4, test5, test6, test7, test8, test9, test10, test11],axis=1)

In [19]:
pprint(train.columns[0:22])

Index(['campaign_id', 'sender', 'subject_len', 'body_len',
       'mean_paragraph_len', 'day_of_week', 'is_weekend', 'times_of_day',
       'category', 'product', 'no_of_CTA', 'mean_CTA_len', 'is_image',
       'is_personalised', 'is_quote', 'is_timer', 'is_emoticons',
       'is_discount', 'is_price', 'is_urgency', 'target_audience',
       'click_rate'],
      dtype='object')


In [20]:
pprint(train.columns[22:])

Index(['sender_1', 'sender_2', 'sender_3', 'sender_7', 'sender_13',
       'sender_22', 'sender_89', 'sender_181', 'sender_1559', 'times_of_day_0',
       'times_of_day_1', 'times_of_day_2', 'is_image_0', 'is_image_1',
       'is_emoticons_0', 'is_emoticons_1', 'is_price_0', 'is_price_1',
       'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
       'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'is_weekend_0',
       'is_weekend_1', 'category_0', 'category_1', 'category_2', 'category_3',
       'category_4', 'category_5', 'category_6', 'category_7', 'category_8',
       'category_9', 'category_10', 'category_11', 'category_12',
       'category_13', 'category_14', 'category_15', 'product_0', 'product_1',
       'product_2', 'product_3', 'product_4', 'product_5', 'product_6',
       'product_7', 'product_8', 'product_9', 'product_10', 'product_11',
       'product_12', 'product_13', 'product_14', 'product_15', 'product_16',
       'product_17', 'product_18', 'pr

In [21]:
feature_columns = ['subject_len', 'body_len',
       'mean_paragraph_len', 'no_of_CTA', 'mean_CTA_len', 'target_audience',
              'is_urgency'] + train.columns[22:].tolist()

target_column = 'click_rate'

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor as LGR
from xgboost import XGBRegressor as XGR
from pprint import pprint
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import r2_score

In [23]:
## make the model as per specifications
def make_model(xtr,ytr,model_type):
    if model_type == 'GBR':
        clf = GBR(random_state=42)
    elif model_type == 'RFR':
        clf = RFR(random_state=42)
    elif model_type == 'LR':
        clf = LR()
    elif model_type == 'XGBOOST':
        clf = XGR(random_state=42)
    elif model_type == 'LIGHTGBM':
        clf = LGR(random_state=42)
    elif model_type == 'DTR':
        clf = DTR()
    
    clf.fit(xtr,ytr)
    return clf

In [24]:
def get_basemodels(X_train, X_test, y_train, y_test):
    
    models = ['LR','GBR','LIGHTGBM', 'RFR', 'XGBOOST', 'DTR']
    
    model_names = []
    trained_models = []
    r2_train_scores = []
    r2_test_scores = []
    ## iterate over model type and the dependent variable
    for model in models:
        clf = make_model(X_train, y_train, model)
        
        y_train_predictions = clf.predict(X_train)
        model_names.append(model)
        trained_models.append(clf)
        r2_train_scores.append(r2_score(y_train_predictions, y_train))
        
        y_test_predictions = clf.predict(X_test)
        r2_test_scores.append(r2_score(y_test_predictions, y_test ))
    ## collating results
    error_df = pd.DataFrame()
    error_df['model_type'] = model_names
    error_df['trained_models'] = trained_models
    error_df['train_r2_score'] = r2_train_scores
    error_df['test_r2_score'] = r2_test_scores
    
    return trained_models, error_df

In [28]:
X_train, X_test, y_train, y_test = train_test_split(train[feature_columns], train[target_column], 
                                    test_size=0.30, random_state=42)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_new_test_scaled = scaler.transform(test[feature_columns])

ValueError: X has 187 features, but StandardScaler is expecting 97 features as input.

In [26]:
models, r2_scores = get_basemodels(X_train_scaled, X_test_scaled, y_train, y_test)
r2_scores

Unnamed: 0,model_type,trained_models,train_r2_score,test_r2_score
0,LR,LinearRegression(),-1.172825,-0.000332
1,GBR,([DecisionTreeRegressor(criterion='friedman_ms...,0.487294,-0.04018
2,LIGHTGBM,LGBMRegressor(random_state=42),0.75742,0.412647
3,RFR,"(DecisionTreeRegressor(max_features='auto', ra...",0.893201,0.258049
4,XGBOOST,"XGBRegressor(base_score=0.5, booster='gbtree',...",0.994734,0.324036
5,DTR,DecisionTreeRegressor(),1.0,0.213711


In [27]:
test[feature_columns]

Unnamed: 0,subject_len,body_len,mean_paragraph_len,no_of_CTA,mean_CTA_len,target_audience,is_urgency,sender_1,sender_1.1,sender_2,...,product_43,product_43.1,is_personalised_0,is_personalised_0.1,is_personalised_1,is_personalised_1.1,is_quote_0,is_quote_0.1,is_quote_1,is_quote_1.1
0,76,10439,39,3,29,14,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
1,54,2570,256,0,22,10,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
2,59,12801,16,3,23,16,0,0,0,0,...,0,0,1,1,0,0,0,0,1,1
3,74,11037,30,4,24,10,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
4,80,10011,27,3,31,14,0,0,0,0,...,0,0,1,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1883,88,1451,75,0,22,10,0,0,0,0,...,0,0,1,1,0,0,0,0,1,1
1884,58,10537,40,5,27,11,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
1885,89,11050,26,4,28,6,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0
1886,58,10537,40,5,27,16,0,0,0,0,...,0,0,1,1,0,0,1,1,0,0


In [None]:
predictions = pd.read_csv('sample_submission.csv')
predictions['click_rate'] = models[-2].predict(X_unknown_scaled)
predictions.to_csv('2022 08 07 submission_v2.csv', index=False)

In [None]:
import keras
from keras.models import Sequential 
from keras.layers import Dense
from keras.layers import LeakyReLU, PReLU, ELU
from keras.layers import Dropout

In [None]:
model = Sequential()
model.add(Dense(200, input_dim=17, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(1, activation='linear'))

In [None]:
keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(loss='mean_squared_error', optimizer='RMSprop', metrics=['mean_absolute_percentage_error'])

In [None]:
#To visualize neural network
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=2000, batch_size=32,validation_split=0.15,validation_data=None,verbose=1)

In [None]:
regressor = Sequential()

regressor.add(Dense(units=50, activation='relu', input_dim=17))

regressor.add(Dense(units=25, activation='relu'))

regressor.add(Dense(units=50, activation='relu'))

regressor.add(Dense(units=1 ))

regressor.compile(loss=r2_score, optimizer = 'Adamax')

model_history = regressor.fit(X_train_scaled, y_train,
                             validation_split=0.20, batch_size=10, epochs=1000)
