# 1 import necessary lib

In [1]:
import os
import random
import pandas as pd
import numpy as np
import datetime
import setuptools
import time
from scipy.stats import kurtosis, skew  # it's to explore some statistics of numerical value
import json  # to convert json in df
from pandas.io.json import json_normalize

import matplotlib.pyplot as plt  # to graphics plot
import seaborn as sns  # a good library to graphic plots
color = sns.color_palette()
import squarify  # to better understand proportion of categorys - it's a treemap layout algorithm

# Importing librarys to use on interactive graphs
from plotly.offline import init_notebook_mode, iplot, plot
%matplotlib inline
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# machine learning models
from sklearn import model_selection, preprocessing, metrics
import lightgbm as lgb
from scipy.stats import kurtosis, skew  # it's to explore some statistics of numerical value

plt.style.use('fivethirtyeight') 

### constant

In [2]:
DATA_PATH = "../input/"
PROCESS_DATA_PERCENT = 1  # fractional number to skip rows and read just a random sample of the our dataset.
JSON_COLS = ['device', 'geoNetwork', 'totals', 'trafficSource']  # Columns that have json format

# 2. Data Pre-processing

## 2.1 load csv, parse JSON

### sub functions: read json file, process json data, drop data, change data type, encode category to number

In [3]:
def json_read(df):
    # joining the [ path + df received]
    data_frame = DATA_PATH + df

    df = pd.read_csv(data_frame,
                     converters={column: json.loads for column in JSON_COLS},  # loading the json columns properly
                     dtype={'fullVisitorId': 'str'},  # transforming this column to string
                     skiprows=lambda i: i > 0 and random.random() > PROCESS_DATA_PERCENT,
                     )  # Number of rows that will be imported randomly
    return df

def process_json_data(df):
    for column in JSON_COLS:
        # It will normalize and set the json to a table
        column_as_df = json_normalize(df[column])
        # here will be set the name using the category and subcategory of json columns
        column_as_df.columns = [f'{column}{subcolumn}' for subcolumn in column_as_df.columns]
        column_as_df.columns = [subcolumn.replace('.', '') for subcolumn in column_as_df.columns]
        # after extracting the values, let drop the original columns
        df = df.drop(column, axis=1, errors='ignore').merge(column_as_df, right_index=True, left_index=True)
        # print(list(column_as_df.columns.values))
    # print(f'Shape: {df.shape}')
    return df


def missing_value_info(data):
    columns = data.columns[data.isnull().any()].tolist()
    # getting the sum of null values and ordering
    total = data.isnull().sum().sort_values(ascending=False)
    # getting the percent and order of null
    percent = (data.isnull().sum() / data.isnull().count() * 100).sort_values(ascending=False)
    percent_list = percent.tolist()
    percent_label_list = percent.index.tolist()

    df = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    # print('Total columns at least one Values: ')
    # print(df[~(df['Total'] == 0)])  # Returning values of nulls different of 0
    #
    # print('\n Total of Sales % of Total: ', round((df_train[df_train['totalstransactionRevenue'] != np.nan][
    #      'totalstransactionRevenue'].count() / len(df_train['totalstransactionRevenue']) * 100), 4))

    tuple_data = [(i, j) for (i, j) in zip(percent_list, percent_label_list) if i > 0]
    percent_list, percent_label_list = [list(c) for c in zip(*tuple_data)]
    percent_list_size = len(percent_list)
    ind = np.arange(percent_list_size)
    plt.bar(ind, percent_list, width=0.3)
    plt.xticks(ind, percent_label_list, rotation=90)
    plt.show()

    return columns

def constant_value_info(train_df):
    const_cols = [c for c in train_df.columns if train_df[c].nunique(dropna=False) == 1]
    return const_cols


def drop_features(df):
    to_drop = ['sessionId', 'socialEngagementType', 'devicebrowserVersion', 'devicebrowserSize', 'deviceflashVersion',
               'devicelanguage',
               'devicemobileDeviceBranding', 'devicemobileDeviceInfo', 'devicemobileDeviceMarketingName',
               'devicemobileDeviceModel',
               'devicemobileInputSelector', 'deviceoperatingSystemVersion', 'devicescreenColors',
               'devicescreenResolution',
               'geoNetworkcityId', 'geoNetworklatitude', 'geoNetworklongitude', 'geoNetworknetworkLocation',
               'trafficSourceadwordsClickInfocriteriaParameters', 'trafficSourceadwordsClickInfogclId',
               'trafficSourcecampaign',
               'trafficSourceadwordsClickInfopage', 'trafficSourcereferralPath',
               'trafficSourceadwordsClickInfoslot',
               'trafficSourceadContent', 'trafficSourcekeyword', 'trafficSourceadwordsClickInfoadNetworkType',
               'totalsbounces', 'totalsnewVisits', 'totalsvisits',
               'trafficSourceisTrueDirect',
               'trafficSourceadwordsClickInfoisVideoAd', 'totalsvisits']
    df.drop(to_drop, axis=1, errors='ignore', inplace=True)
    if 'trafficSourcecampaignCode' in df.columns:
        df.drop(['trafficSourcecampaignCode'], axis=1, errors='ignore', inplace=True)
    return df


def change_feature_type_and_fill_na(df):
    if 'geoNetworkcity' in df.columns:
        df.loc[df['geoNetworkcity'] == '(not set)', 'geoNetworkcity'] = np.nan
        df.loc[df['geoNetworkcity'] == 'not available in demo dataset', 'geoNetworkcity'] = np.nan
        df['geoNetworkcity'].fillna('NaN', inplace=True)

    if 'geoNetworkmetro' in df.columns:
        df.loc[df['geoNetworkmetro'] == '(not set)', 'geoNetworkmetro'] = np.nan
        df.loc[df['geoNetworkmetro'] == 'not available in demo dataset', 'geoNetworkmetro'] = np.nan
        df['geoNetworkmetro'].fillna('NaN', inplace=True)

    if 'geoNetworknetworkDomain' in df.columns:
        df.loc[df['geoNetworknetworkDomain'] == 'not available in demo dataset', 'geoNetworknetworkDomain'] = np.nan
        df['geoNetworknetworkDomain'].fillna('NaN', inplace=True)

    if 'geoNetworkregion' in df.columns:
        df.loc[df['geoNetworkregion'] == 'not available in demo dataset', 'geoNetworkregion'] = np.nan
        df['geoNetworkregion'].fillna('NaN', inplace=True)

    if 'totalshits' in df.columns:
        df['totalshits'] = df['totalshits'].astype(int)  # setting numerical to int

    if 'totalspageviews' in df.columns:
        df['totalspageviews'].fillna(1, inplace=True)  # filling NA's with 1
        df['totalspageviews'] = df['totalspageviews'].astype(int)  # setting numerical column as integer

    if 'totalstransactionRevenue' in df.columns:
        df['totalstransactionRevenue'] = df['totalstransactionRevenue'].fillna(0.0).astype(float)

    return df


def category_to_number(train, test):
    cat_cols = ['channelGrouping', 'devicebrowser',
                'devicedeviceCategory', 'deviceoperatingSystem',
                'geoNetworkcity', 'geoNetworkcontinent',
                'geoNetworkcountry', 'geoNetworkmetro',
                'geoNetworknetworkDomain', 'geoNetworkregion',
                'geoNetworksubContinent',
                'trafficSourcemedium',
                'trafficSourcesource']

    for col in cat_cols:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[col].values.astype('str')) + list(test[col].values.astype('str')))
        train[col] = lbl.transform(list(train[col].values.astype('str')))
        test[col] = lbl.transform(list(test[col].values.astype('str')))

    return train, test

def process_data(raw_data):
    print('raw_data shape ' + str(raw_data.shape))
    data_processed_json = process_json_data(raw_data)
    print('data_processed_json shape ' + str(data_processed_json.shape))
    
    # todo missing and constant
    
    
    data_dropped_feature = drop_features(data_processed_json)
    print('data_dropped_feature shape ' + str(data_dropped_feature.shape))
    data_filled_na = change_feature_type_and_fill_na(data_dropped_feature)
    print('data_filled_na shape ' + str(data_filled_na.shape))
    return data_filled_na




## train_csv

In [4]:
data_train_raw = json_read('train.csv')
processed_train_data = process_data(data_train_raw)
print(processed_train_data.info())
processed_train_data.to_csv('train_concise.csv', index=False)
del data_train_raw, processed_train_data

raw_data shape (903653, 12)
data_processed_json shape (903653, 55)
data_dropped_feature shape (903653, 22)
data_filled_na shape (903653, 22)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 22 columns):
channelGrouping             903653 non-null object
date                        903653 non-null int64
fullVisitorId               903653 non-null object
visitId                     903653 non-null int64
visitNumber                 903653 non-null int64
visitStartTime              903653 non-null int64
devicebrowser               903653 non-null object
devicedeviceCategory        903653 non-null object
deviceisMobile              903653 non-null bool
deviceoperatingSystem       903653 non-null object
geoNetworkcity              903653 non-null object
geoNetworkcontinent         903653 non-null object
geoNetworkcountry           903653 non-null object
geoNetworkmetro             903653 non-null object
geoNetworknetworkDomain     903653 non-n

## test.csv

In [5]:
data_test_raw = json_read('test.csv')
processed_test_data = process_data(data_test_raw)
print(processed_test_data.info())
processed_test_data.to_csv('test_concise.csv', index=False)
del data_test_raw, processed_test_data

raw_data shape (804684, 12)
data_processed_json shape (804684, 53)
data_dropped_feature shape (804684, 21)
data_filled_na shape (804684, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804684 entries, 0 to 804683
Data columns (total 21 columns):
channelGrouping            804684 non-null object
date                       804684 non-null int64
fullVisitorId              804684 non-null object
visitId                    804684 non-null int64
visitNumber                804684 non-null int64
visitStartTime             804684 non-null int64
devicebrowser              804684 non-null object
devicedeviceCategory       804684 non-null object
deviceisMobile             804684 non-null bool
deviceoperatingSystem      804684 non-null object
geoNetworkcity             804684 non-null object
geoNetworkcontinent        804684 non-null object
geoNetworkcountry          804684 non-null object
geoNetworkmetro            804684 non-null object
geoNetworknetworkDomain    804684 non-null object
geoN

## 2.2 Data Visualization

In [6]:
# prepare functions for visualization
def horizontal_bar_chart(cnt_srs, color):
    trace = go.Bar(
        y=cnt_srs.index[::-1],
        x=cnt_srs.values[::-1],
        showlegend=False,
        orientation='h',
        marker=dict(
            color=color,
        ),
    )
    return trace


def scatter_plot(cnt_srs, color):
    trace = go.Scatter(
        x=cnt_srs.index[::-1],
        y=cnt_srs.values[::-1],
        showlegend=False,
        marker=dict(
            color=color,
        ),
    )
    return trace


class Visualization:
    def __init__(self, train_df):
        self.train_df = train_df

    def plot_diff_traffic_importance(self):
        # Continent
        cnt_srs = self.train_df.groupby('trafficSourcesource')['totalstransactionRevenue'].agg(['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace1 = horizontal_bar_chart(cnt_srs["count"].head(10), 'green')
        trace2 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"].head(10), 'green')
        trace3 = horizontal_bar_chart(cnt_srs["mean"].head(10), 'green')

        # Sub-continent
        cnt_srs = self.train_df.groupby('trafficSourcemedium')['totalstransactionRevenue'].agg(['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace4 = horizontal_bar_chart(cnt_srs["count"], 'purple')
        trace5 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"], 'purple')
        trace6 = horizontal_bar_chart(cnt_srs["mean"], 'purple')

        # Creating two subplots
        fig = tools.make_subplots(rows=2, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                                  subplot_titles=["Traffic Source - Count", "Traffic Source - Non-zero Revenue Count",
                                                  "Traffic Source - Mean Revenue",
                                                  "Traffic Source Medium - Count",
                                                  "Traffic Source Medium - Non-zero Revenue Count",
                                                  "Traffic Source Medium - Mean Revenue"
                                                  ])

        fig.append_trace(trace1, 1, 1)
        fig.append_trace(trace2, 1, 2)
        fig.append_trace(trace3, 1, 3)
        fig.append_trace(trace4, 2, 1)
        fig.append_trace(trace5, 2, 2)
        fig.append_trace(trace6, 2, 3)

        fig['layout'].update(height=1000, width=1200, paper_bgcolor='rgb(233,233,233)', title="Traffic Source Plots")
        py.iplot(fig, filename='traffic-source-plots')

    def plot_diff_device_importance(self):
        # Device Browser
        cnt_srs = self.train_df.groupby('devicebrowser')['totalstransactionRevenue'].agg(['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace1 = horizontal_bar_chart(cnt_srs["count"].head(10), 'rgba(50, 171, 96, 0.6)')
        trace2 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"].head(10), 'rgba(50, 171, 96, 0.6)')
        trace3 = horizontal_bar_chart(cnt_srs["mean"].head(10), 'rgba(50, 171, 96, 0.6)')

        # Device Category
        cnt_srs = self.train_df.groupby('devicedeviceCategory')['totalstransactionRevenue'].agg(['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace4 = horizontal_bar_chart(cnt_srs["count"].head(10), 'rgba(71, 58, 131, 0.8)')
        trace5 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"].head(10), 'rgba(71, 58, 131, 0.8)')
        trace6 = horizontal_bar_chart(cnt_srs["mean"].head(10), 'rgba(71, 58, 131, 0.8)')

        # Operating system
        cnt_srs = self.train_df.groupby('deviceoperatingSystem')['totalstransactionRevenue'].agg(['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace7 = horizontal_bar_chart(cnt_srs["count"].head(10), 'rgba(246, 78, 139, 0.6)')
        trace8 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"].head(10), 'rgba(246, 78, 139, 0.6)')
        trace9 = horizontal_bar_chart(cnt_srs["mean"].head(10), 'rgba(246, 78, 139, 0.6)')

        # Creating two subplots
        fig = tools.make_subplots(rows=3, cols=3, vertical_spacing=0.04,
                                     subplot_titles=["Device Browser - Count",
                                                     "Device Browser - Non-zero Revenue Count",
                                                     "Device Browser - Mean Revenue",
                                                     "Device Category - Count",
                                                     "Device Category - Non-zero Revenue Count",
                                                     "Device Category - Mean Revenue",
                                                     "Device OS - Count", "Device OS - Non-zero Revenue Count",
                                                     "Device OS - Mean Revenue"])

        fig.append_trace(trace1, 1, 1)
        fig.append_trace(trace2, 1, 2)
        fig.append_trace(trace3, 1, 3)
        fig.append_trace(trace4, 2, 1)
        fig.append_trace(trace5, 2, 2)
        fig.append_trace(trace6, 2, 3)
        fig.append_trace(trace7, 3, 1)
        fig.append_trace(trace8, 3, 2)
        fig.append_trace(trace9, 3, 3)

        fig['layout'].update(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Device Plots")
        py.iplot(fig, filename='device-plots')

    def plot_diff_geo_importance(self):
        # Continent
        cnt_srs = self.train_df.groupby('geoNetworkcontinent')['totalstransactionRevenue'].agg(['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace1 = horizontal_bar_chart(cnt_srs["count"].head(10), 'rgba(58, 71, 80, 0.6)')
        trace2 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"].head(10), 'rgba(58, 71, 80, 0.6)')
        trace3 = horizontal_bar_chart(cnt_srs["mean"].head(10), 'rgba(58, 71, 80, 0.6)')

        # Sub-continent
        cnt_srs = self.train_df.groupby('geoNetworksubContinent')['totalstransactionRevenue'].agg(
            ['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace4 = horizontal_bar_chart(cnt_srs["count"], 'orange')
        trace5 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"], 'orange')
        trace6 = horizontal_bar_chart(cnt_srs["mean"], 'orange')

        # Network domain
        cnt_srs = self.train_df.groupby('geoNetworknetworkDomain')['totalstransactionRevenue'].agg(
            ['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace7 = horizontal_bar_chart(cnt_srs["count"].head(10), 'blue')
        trace8 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"].head(10), 'blue')
        trace9 = horizontal_bar_chart(cnt_srs["mean"].head(10), 'blue')

        # Creating two subplots
        fig = tools.make_subplots(rows=3, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                                     subplot_titles=["Continent - Count", "Continent - Non-zero Revenue Count",
                                                     "Continent - Mean Revenue",
                                                     "Sub Continent - Count", "Sub Continent - Non-zero Revenue Count",
                                                     "Sub Continent - Mean Revenue",
                                                     "Network Domain - Count",
                                                     "Network Domain - Non-zero Revenue Count",
                                                     "Network Domain - Mean Revenue"])

        fig.append_trace(trace1, 1, 1)
        fig.append_trace(trace2, 1, 2)
        fig.append_trace(trace3, 1, 3)
        fig.append_trace(trace4, 2, 1)
        fig.append_trace(trace5, 2, 2)
        fig.append_trace(trace6, 2, 3)
        fig.append_trace(trace7, 3, 1)
        fig.append_trace(trace8, 3, 2)
        fig.append_trace(trace9, 3, 3)

        fig['layout'].update(height=1500, width=1200, paper_bgcolor='rgb(233,233,233)', title="Geography Plots")
        py.iplot(fig, filename='geo-plots.html')

    def plot_revenue_count_with_time(self):

        self.train_df['date'] = self.train_df['date'].apply(
            lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))
        self.train_df["totalstransactionRevenue"] = self.train_df["totalstransactionRevenue"].astype \
            (float)


        # size includes NaN values, count does not:
        cnt_srs = self.train_df.groupby('date')['totalstransactionRevenue'].agg(['size', self.count_nonzero])
        cnt_srs.columns = ["count", "count of non-zero revenue"]
        cnt_srs = cnt_srs.sort_index()
        # cnt_srs.index = cnt_srs.index.astype('str')
        trace1 = scatter_plot(cnt_srs["count"], 'red')
        trace2 = scatter_plot(cnt_srs["count of non-zero revenue"], 'blue')

        fig = tools.make_subplots(rows=2, cols=1, vertical_spacing=0.08,
                                     subplot_titles=["Date - Count", "Date - Non-zero Revenue count"])
        fig.append_trace(trace1, 1, 1)
        fig.append_trace(trace2, 2, 1)
        fig['layout'].update(height=800, width=800, paper_bgcolor='rgb(233,233,233)', title="Date Plots")
        py.iplot(fig, filename='date-plots')

    def plot_visit_importance(self):
        # Page views
        cnt_srs = self.train_df.groupby('totalspageviews')['totalstransactionRevenue'].agg(['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", "mean"]
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace1 = horizontal_bar_chart(cnt_srs["count"].head(60), 'cyan')
        trace2 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"].head(60), 'cyan')
        trace5 = horizontal_bar_chart(cnt_srs["mean"].head(60), 'cyan')

        # Hits
        cnt_srs = self.train_df.groupby('totalshits')['totalstransactionRevenue'].agg(['size', self.count_nonzero, 'mean'])
        cnt_srs.columns = ["count", "count of non-zero revenue", 'mean']
        cnt_srs = cnt_srs.sort_values(by="count", ascending=False)
        trace3 = horizontal_bar_chart(cnt_srs["count"].head(60), 'black')
        trace4 = horizontal_bar_chart(cnt_srs["count of non-zero revenue"].head(60), 'black')
        trace6 = horizontal_bar_chart(cnt_srs["mean"].head(60), 'black')

        # Creating two subplots
        fig = tools.make_subplots(rows=2, cols=3, vertical_spacing=0.08, horizontal_spacing=0.15,
                                  subplot_titles=["Total Pageviews - Count", "Total Pageviews - Non-zero Revenue Count",
                                                  "Total Pageviews - Mean Revenue",
                                                  "Total Hits - Count", "Total Hits - Non-zero Revenue Count",
                                                  "Total Hits - Mean Revenue"])

        fig.append_trace(trace1, 1, 1)
        fig.append_trace(trace2, 1, 2)
        fig.append_trace(trace5, 1, 3)
        fig.append_trace(trace3, 2, 1)
        fig.append_trace(trace4, 2, 2)
        fig.append_trace(trace6, 2, 3)

        fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Visitor Profile Plots")
        py.iplot(fig, filename='visitor-profile-plots')

    def count_nonzero(self, x):
        return np.count_nonzero(x)


In [7]:
df_train = pd.read_csv("train_concise.csv")
vis = Visualization(df_train)

# a) with time
vis.plot_revenue_count_with_time()
# b) difference of device
vis.plot_diff_device_importance()
# c) traffic source
vis.plot_diff_traffic_importance()
# d) geo distribution
vis.plot_diff_geo_importance()
# e) visit profile
vis.plot_visit_importance()
del df_train


Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.



This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]



This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]



This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]



This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]
[ (3,1) x7,y7 ]  [ (3,2) x8,y8 ]  [ (3,3) x9,y9 ]



This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]  [ (1,3) x3,y3 ]
[ (2,1) x4,y4 ]  [ (2,2) x5,y5 ]  [ (2,3) x6,y6 ]



# 3 Train model

## 3.1 split data

In [8]:
def separate_data(train, test):
    features = list(train.columns.values.tolist())
    features.remove("totalstransactionRevenue")
    features.remove("fullVisitorId")
    features.remove("date")

    # Split the train dataset into development and valid based on time
    train['date'] = train['date'].apply(
        lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))
    test['date'] = test['date'].apply(
        lambda x: datetime.date(int(str(x)[:4]), int(str(x)[4:6]), int(str(x)[6:])))

    dev_df = train[train['date'] <= datetime.date(2017, 5, 31)]
    val_df = train[train['date'] > datetime.date(2017, 5, 31)]
    dev_y = np.log1p(dev_df["totalstransactionRevenue"].values)
    val_y = np.log1p(val_df["totalstransactionRevenue"].values)

    dev_X = dev_df[features]
    val_X = val_df[features]
    test_X = test[features]

    return dev_X, dev_y, val_X, val_y, test_X, dev_df, val_df

In [9]:
# category to number
def category_to_number(train, test):
    cat_cols = ['channelGrouping', 'devicebrowser',
                'devicedeviceCategory', 'deviceoperatingSystem',
                'geoNetworkcity', 'geoNetworkcontinent',
                'geoNetworkcountry', 'geoNetworkmetro',
                'geoNetworknetworkDomain', 'geoNetworkregion',
                'geoNetworksubContinent',
                'trafficSourcemedium',
                'trafficSourcesource']

    for col in cat_cols:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[col].values.astype('str')) + list(test[col].values.astype('str')))
        train[col] = lbl.transform(list(train[col].values.astype('str')))
        test[col] = lbl.transform(list(test[col].values.astype('str')))

    return train, test

## 3.2 models

In [10]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Bidirectional, Dropout
from keras.callbacks import ReduceLROnPlateau

from keras.layers import Input
from keras.models import Model
#lstm
#Using TensorFlow
def run_lstm(train_X, train_y, val_X, val_y, test_X):
    X_train = train_X
    X_val = val_X
    y_train = train_y
    y_val = val_y
    print("----shape----")
    print(X_train.shape)
    X_train = np.array(X_train).reshape(X_train.shape[0],1,X_train.shape[1])
    X_val = np.array(X_val).reshape(X_val.shape[0],1,X_val.shape[1])
    
    inputs = Input(shape=(1,19))
    x = Bidirectional(LSTM(200,recurrent_dropout=0.2, kernel_initializer='lecun_normal', return_sequences=True))(inputs)
    x = Bidirectional(LSTM(120,recurrent_dropout=0.2, kernel_initializer='lecun_normal'))(x)
    x = Dense(50, activation='sigmoid')(x)
    x = Dropout(0.1)(x)
    x = Dense(20,activation='elu')(x)
    output = Dense(1,activation='linear')(x)

    model2 = Model(inputs=inputs, outputs=output)
    model2.compile(loss='mse', optimizer='adam')
    model2.fit(X_train, y_train, epochs=4, batch_size=64, validation_data=(X_val, y_val), verbose=1, shuffle=False)
    
    test_X = test_X.values
    test_X = np.array(test_X).reshape((test_X.shape[0],1,test_X.shape[1]))

    pred_train_y = model2.predict(X_train)
    pred_val_y = model2.predict(X_val)
    pred_test_y = model2.predict(test_X)
    print("RMSE val: ")
    print(np.sqrt(metrics.mean_squared_error(y_val, pred_val_y)))
    #print("Rmse train:")
    #print(np.sqrt(metrics.mean_squared_error(y_train, pred_val_y)))
    return pred_test_y, model2, pred_val_y
    

Using TensorFlow backend.


In [11]:
# LightGBM
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective": "regression",
        "metric": "rmse",
        # max number of leaves in one tree
        "num_leaves": 30,
        # minimal number of data in one lea
        "min_child_samples": 100,
        "learning_rate": 0.1,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.5,
        "bagging_freq": 5,
        "bagging_seed": 2018,
        "verbosity": -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)

    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=100)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_test_y[pred_test_y < 0] = 0

    pred_val_y = model.predict(val_X, num_iteration=model.best_iteration)
    pred_val_y[pred_val_y < 0] = 0
    return pred_test_y, model, pred_val_y

In [12]:
# XGB
def run_xgb(train_X, train_y, val_X, val_y, test_X):

    # fit model no training data
    model = XGBClassifier()
    model.fit(train_X, train_y)

    y_pred_val = model.predict(val_X)
    y_pred_val = [round(value) for value in y_pred_val]
    y_pred_val = [0 if i < 0 else i for i in y_pred_val]

    y_pred_test = model.predict(test_X)
    y_pred_test = [round(value) for value in y_pred_test]
    y_pred_test = [0 if i < 0 else i for i in y_pred_test]
    return y_pred_test, model, y_pred_val

## 3.3 Evaluation

In [13]:
# show feaure importance
def show_feature_importance(model):
    fig, ax = plt.subplots(figsize=(12, 18))
    lgb.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
    ax.grid(False)
    plt.title("LightGBM - Feature Importance", fontsize=15)
    plt.show()

In [14]:
# validation
def validate(val_df, pred_val):
    val_pred_df = pd.DataFrame({"fullVisitorId": val_df["fullVisitorId"].values})
    val_pred_df["transactionRevenue"] = val_df["totalstransactionRevenue"].values
    val_pred_df["PredictedRevenue"] = np.expm1(pred_val)
    val_pred_df = val_pred_df.groupby('fullVisitorId')[['transactionRevenue', 'PredictedRevenue']].sum().reset_index()
    print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df['transactionRevenue'].values),
                                             np.log1p(val_pred_df['PredictedRevenue'].values))))

## 3.4 Run model

### choose model

In [15]:
mod = 'LSTM' #'LGBM' / 'LSTM' / 'XGBOOST'

In [16]:
 # 1. load data to df, after parsing jason
df_train = pd.read_csv("train_concise.csv")
df_test = pd.read_csv("test_concise.csv")

# encode category to number
df_train, df_test = category_to_number(df_train, df_test)

print(df_train.info())
print(df_test.info())


Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 22 columns):
channelGrouping             903653 non-null int64
date                        903653 non-null int64
fullVisitorId               903653 non-null object
visitId                     903653 non-null int64
visitNumber                 903653 non-null int64
visitStartTime              903653 non-null int64
devicebrowser               903653 non-null int64
devicedeviceCategory        903653 non-null int64
deviceisMobile              903653 non-null bool
deviceoperatingSystem       903653 non-null int64
geoNetworkcity              903653 non-null int64
geoNetworkcontinent         903653 non-null int64
geoNetworkcountry           903653 non-null int64
geoNetworkmetro             903653 non-null int64
geoNetworknetworkDomain     903653 non-null int64
geoNetworkregion            903653 non-null int64
geoNetworksubContinent      903653 non-null int64
totalshits                  903653 non-

In [17]:
# group data frame by fullVisitorId
# gdf = revenue_customers(df_train, df_test)

# separate labels and split data
train_X, train_y, val_X, val_y, test_X, dev_df, val_df = separate_data(df_train, df_test)
print('==========final data==========')
print(train_X.shape)
print(train_y.shape)
print(val_X.shape)
print(val_y.shape)
print(test_X.shape)

# build and train model
if mod == 'LGBM':
    pred_test, model, pred_val = run_lgb(train_X, train_y, val_X, val_y, test_X)
    # validate the model
    validate(val_df, pred_val)
    # feature importance
    show_feature_importance(model)
    print('LGB done')
elif mod == 'LSTM':
     pred_test, model, pred_val = run_lstm(train_X, train_y, val_X, val_y, test_X)
     print('NN done')
elif mod == 'XGBOOST':
    pred_test, model, pred_val = run_xgb(train_X, train_y, val_X, val_y, test_X)
    validate(val_df, pred_val)
    print('XGBOOST done')

(765707, 19)
(765707,)
(137946, 19)
(137946,)
(804684, 19)
----shape----
(765707, 19)
Train on 765707 samples, validate on 137946 samples
Epoch 1/4