In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Loading Training and Testing Data

## Train Data

In [None]:
# Show every column whenever a DataFrame is rendered in this notebook
pd.set_option('display.max_columns', None)

# Read the training set and preview its first rows
df_train = pd.read_csv("../data/Email/Email_train.csv")
df_train.head()

In [None]:
df_train.shape

In [None]:
df_train.info()

In [None]:
# Type of data present
df_train.dtypes

In [None]:
df_train.describe()

In [None]:
sns.pairplot(df_train)

In [None]:
df_train.ndim

In [None]:
df_train.columns

In [None]:
df_train.size

In [None]:
# Null values
df_train.isna().sum()

### No null values are present, so we can proceed with feature engineering

In [None]:
df_train

Removing the column campaign_id, as it is just an identifier of the campaign and is not needed for the analysis.

In [None]:
# Rebind instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell does not raise a KeyError.
df_train = df_train.drop(columns='campaign_id', errors='ignore')

In [None]:
df_train

There is only one column with categorical data; we convert it to numerical values using an encoding method.

In [None]:
# LabelEncoder maps each category of a 1-D column to a single integer code,
# which is what the next cell needs when it writes the result back into one
# column. OneHotEncoder rejects 1-D input and yields a multi-column sparse
# matrix, so it cannot be assigned to a single DataFrame column.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
df_train['times_of_day'] = le.fit_transform(df_train['times_of_day'])

In [None]:
df_train

## Analysing

In [None]:
# Click rate against the encoded time-of-day value.
# The x axis holds times_of_day, so label it as such rather than 'Day'.
plt.scatter(df_train['times_of_day'], df_train['click_rate'])
plt.xlabel('Time of Day')
plt.ylabel('Click Rate');

In [None]:
# Scatter with fitted regression line: click rate vs. subject length
subject_ax = sns.regplot(data=df_train, x='subject_len', y='click_rate')
subject_ax.set_xlabel("Subject Text length")
subject_ax.set_ylabel("Click Rate")
subject_ax.set_title("Customer Click Rate according to the length of Subject");

As subject text length increases the click rate decreases

In [None]:
# Scatter with fitted regression line: click rate vs. body length
body_ax = sns.regplot(data=df_train, x='body_len', y='click_rate')
body_ax.set_xlabel("Body length")
body_ax.set_ylabel("Click Rate")
body_ax.set_title("Customer Click Rate with respect to Body length");

As body length increases the click rate decreases

In [None]:
# Scatter with fitted regression line: click rate vs. product
product_ax = sns.regplot(data=df_train, x='product', y='click_rate')
product_ax.set_xlabel("Product")
product_ax.set_ylabel("Click Rate")
product_ax.set_title("Customer Click Rate with respect to Product");

There is an increase in click rate as more products are advertised.

In [None]:
# Scatter with fitted regression line: click rate vs. encoded time of day
time_ax = sns.regplot(data=df_train, x='times_of_day', y='click_rate')
time_ax.set_xlabel("Time of Day")
time_ax.set_ylabel("Click Rate")
time_ax.set_title("Customer Click Rate with respect to Time of Day");

In the evening, users are more likely to click on the advertised mail.

In [None]:
# Scatter with fitted regression line: click rate vs. category
category_ax = sns.regplot(data=df_train, x='category', y='click_rate')
category_ax.set_xlabel("Category")
category_ax.set_ylabel("Click Rate")
category_ax.set_title("Customer Click Rate with respect to Category");

Likewise, as the number of categories increases, the click rate decreases.

In [None]:
# Scatter with fitted regression line: click rate vs. the weekend flag
weekend_ax = sns.regplot(data=df_train, x='is_weekend', y='click_rate')
weekend_ax.set_xlabel("Weekend")
weekend_ax.set_ylabel("Click Rate")
weekend_ax.set_title("Customer Click Rate with respect to Weekend");

On weekdays, users are more likely to click on the email.

In [None]:
# Scatter with fitted regression line: click rate vs. target audience.
# The x-axis label previously read "Audenice" (typo in the rendered figure).
sns.regplot(x='target_audience', y='click_rate', data=df_train)
plt.xlabel("Audience")
plt.ylabel("Click Rate")
plt.title("Customer Click Rate with respect to Audience");

There is an increase in click rate as our target audience increases

From the above graphs we can conclude that we can increase the email click rate if we increase our target audience, send more mails on weekdays, send more mails in the evening, increase the number of products advertised, and reduce the number of categories.

## Modelling

In [None]:
# Separate independent features (every column except the last)
# from the dependent feature (the last column)
X, y = df_train.iloc[:, :-1], df_train.iloc[:, -1]

In [None]:
X

In [None]:
# Train Test Split: hold out 30% of the rows for evaluation.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X_train

In [None]:
X_test

In [None]:
# Standardize the dataset
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fitting and transforming the X_train
X_train = scaler.fit_transform(X_train)

In [None]:
# Transforming X_test with the scaler that was fitted on X_train
# (transform only, so no statistics are learned from the test split)
X_test = scaler.transform(X_test)

In [None]:
import pickle

# Persist the fitted scaler. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
X_train

In [None]:
X_test

### Model Training

In [None]:
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor

In [None]:
# The estimators below live in sklearn.linear_model and were never imported
# in this notebook, so the cell raised NameError on a fresh kernel.
from sklearn.linear_model import (ARDRegression, HuberRegressor, LinearRegression,
                                  PoissonRegressor, SGDRegressor)

# Candidate regressors to compare. Each key names the estimator it maps to
# (the previous "adaboost"/"bagging" keys pointed at LinearRegression and
# ARDRegression, and "posionregression" was misspelled).
models = {"linearregression": LinearRegression(),
          "ardregression": ARDRegression(),
          "huberregression": HuberRegressor(),
          "poissonregression": PoissonRegressor(),
          "sgdregression": SGDRegressor()}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Train every model on the training split and score it on the test split.

    models : dict mapping a display name to an estimator object that
             provides ``fit(X, y)`` and ``score(X, y)``.
    X_train: training features (no labels)
    X_test : testing features (no labels)
    y_train: training labels
    y_test : testing labels

    Returns a dict with the same keys as ``models`` whose values are the
    results of each estimator's ``score(X_test, y_test)``.
    """
    # Seed NumPy's global RNG once, before any model is fitted
    np.random.seed(42)

    def _train_then_score(estimator):
        # Fit on the training split, then report the score on the test split
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    # Key order follows the iteration order of `models`
    return {label: _train_then_score(estimator)
            for label, estimator in models.items()}


In [None]:
model_score = fit_and_score(models = models,
                           X_train = X_train,
                           X_test = X_test,
                           y_train = y_train,
                           y_test = y_test)

model_score

In [None]:
# Coefficient
# `poisson` was never assigned anywhere in this notebook, so this cell (and
# the ones after it) failed with NameError on a fresh kernel. Fit a dedicated
# PoissonRegressor with default settings on the scaled training split.
from sklearn.linear_model import PoissonRegressor
poisson = PoissonRegressor().fit(X_train, y_train)
poisson.coef_

In [None]:
print(poisson.intercept_)

In [None]:
## On which parameters model is trained
poisson.get_params()

In [None]:
## Prediction with test data
reg_pred = poisson.predict(X_test)

In [None]:
reg_pred

In [None]:
## Scatter plot for prediction
# Actual values on x, predictions on y; labelled so the figure stands alone.
plt.scatter(y_test, reg_pred)
plt.xlabel("Actual Click Rate")
plt.ylabel("Predicted Click Rate")
plt.title("Predicted vs Actual Click Rate");

In [None]:
# Errors
residual = y_test - reg_pred
residual

In [None]:
## Distribution of the residuals (kernel density estimate)
sns.displot(residual, kind="kde");

In [None]:
# reg_pred Scatter plot with respect to residual
plt.scatter(reg_pred, residual);

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Print MAE, MSE and RMSE of the test predictions, in that order
mae = mean_absolute_error(y_test, reg_pred)
mse = mean_squared_error(y_test, reg_pred)
for metric_value in (mae, mse, np.sqrt(mse)):
    print(metric_value)

In [None]:
# R^2 (coefficient of determination) of the predictions against y_test
from sklearn.metrics import r2_score
score = r2_score(y_test, reg_pred)
print(score)