In [1]:
import pyspark
from pyspark.sql import functions as f, Window
import pandas as pd
from kedro.pipeline import *
from kedro.io import *
from kedro.runner import *

import pickle
import os
from pyspark.sql import SparkSession, DataFrame, functions as f
from typing import Dict

In [2]:
def create_spark_session() -> None:
    """
    Placeholder function to create the spark session for a run.

    """
    SparkSession.builder.config("spark.driver.memory", "16g").config(
        "spark.executor.memory", "16g"
    ).config("spark.driver.maxResultSize", "8g").master("local[*]").config(
        "spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"
    ).config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.1.1,com.crealytics:spark-excel_2.11:0.11.1"
    ).config(
        "fs.s3a.access.key", ""
    ).config(
        "fs.s3a.secret.key", ""
    ).config(
        "fs.s3a.maxConnections", "5000"
    ).config(
        "spark.sql.execution.arrow.enabled", "true"
    ).config(
        "spark.debug.maxToStringFields", "100"
    ).config(
        "fs.s3a.connection.maximum", "5000"
    ).config(
        "spark.sql.shuffle.partitions", "8"
    ).config(
        "spark.sql.codegen.wholeStage", "false"
    ).appName(
        "comm-analytics"
    ).getOrCreate()

In [3]:
create_spark_session()
spark = SparkSession.builder.getOrCreate()

In [4]:
def load_orbit_df(path):
    bucket = "s3a://ucb-qb-ca-eu-west-1-data/npvu-de-data/physician_model"
    return spark.read.csv(path.format(bucket=bucket), header=True, inferSchema=True, sep=';')

In [5]:
raw_orbit_calls_market_access = load_orbit_df('{bucket}/raw/Orbit data/Activity data/Calls/Calls_Market Access.csv')

In [6]:
raw_orbit_calls_sales_medical = load_orbit_df('{bucket}/raw/Orbit data/Activity data/Calls/Calls_Sales + Medical.csv')

In [26]:
raw_orbit_calls_market_access.printSchema()

root
 |-- Call ID: string (nullable = true)
 |-- SFDC18 ID: string (nullable = true)
 |-- Call Name: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- External ID: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Parent Address: Brick 1: Name: string (nullable = true)
 |-- Territory: string (nullable = true)
 |-- Record Type: string (nullable = true)
 |-- Subtype: string (nullable = true)
 |-- Channel: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Detail Priority: integer (nullable = true)
 |-- Product: Product Name: string (nullable = true)
 |-- Current 4i: string (nullable = true)
 |-- Current Potential: string (nullable = true)



In [7]:
def prm_crm_call(
    raw_crm_call_sales_medical: DataFrame, raw_crm_call_market_access: DataFrame
) -> DataFrame:

    medical_renamed = (
        raw_crm_call_sales_medical.withColumnRenamed("External ID", "external_id")
        .withColumnRenamed("Call ID", "call_id")
        .withColumnRenamed("Date", "date")
        .withColumnRenamed("Channel", "channel")
    )

    market_renamed = (
        raw_crm_call_market_access.withColumnRenamed("External ID", "external_id")
        .withColumnRenamed("Call ID", "call_id")
        .withColumnRenamed("Date", "date")
        .withColumnRenamed("Channel", "channel")
    )

    unioned_calls = (
        only_relevant_fields(market_renamed)
        .unionByName(only_relevant_fields(medical_renamed))
        .distinct()
    )

    formatted_dates = unioned_calls.withColumn(
        "date", f.to_date(f.col("date"), "dd.MM.yyyy")
    )
    with_month = formatted_dates.withColumn("month", f.last_day(f.col("date")))

    return with_month


def only_relevant_fields(df: DataFrame) -> DataFrame:
    return df.select("external_id", "call_id", "date", "channel")

In [27]:
def prm_crm_call_1(
    raw_crm_call_sales_medical: DataFrame, raw_crm_call_market_access: DataFrame
) -> DataFrame:

    medical_renamed = (
        raw_crm_call_sales_medical.withColumnRenamed("External ID", "external_id")
        .withColumnRenamed("City", "city")
        .withColumnRenamed("Date", "date")
        .withColumnRenamed("Channel", "channel")
    )

    market_renamed = (
        raw_crm_call_market_access.withColumnRenamed("External ID", "external_id")
        .withColumnRenamed("City", "city")
        .withColumnRenamed("Date", "date")
        .withColumnRenamed("Channel", "channel")
    )

    unioned_calls = (
        only_relevant_fields(market_renamed)
        .unionByName(only_relevant_fields(medical_renamed))
        .distinct()
    )

    formatted_dates = unioned_calls.withColumn(
        "date", f.to_date(f.col("date"), "dd.MM.yyyy")
    )
    with_month = formatted_dates.withColumn("month", f.last_day(f.col("date")))

    return with_month


def only_relevant_fields(df: DataFrame) -> DataFrame:
    return df.select("external_id", "city", "date", "channel")

In [28]:
prm_crm_call_1 = prm_crm_call_1(raw_orbit_calls_sales_medical, raw_orbit_calls_market_access)

In [29]:
prm_crm_call_1.show(20)

+------------+-----------------+----------+------------+----------+
| external_id|             city|      date|     channel|     month|
+------------+-----------------+----------+------------+----------+
|WDEM02720311|          Hamburg|2017-08-25|Face to Face|2017-08-31|
|WDEA01483719|           Berlin|2017-08-09|Face to Face|2017-08-31|
|WDEM05574478|      Hattersheim|2017-08-25|Face to Face|2017-08-31|
|WDEM05686284|          Hamburg|2017-08-28|       Phone|2017-08-31|
|WDEP00007240|      Schweinfurt|2017-03-09|       Phone|2017-03-31|
|WDER00005461|             Kiel|2017-03-09|Face to Face|2017-03-31|
|WDEM08768303|Karlsruhe , Baden|2017-03-10|       Phone|2017-03-31|
|WDEA00030408|          Dresden|2017-03-03|      E-Mail|2017-03-31|
|WDEA01481607|           Berlin|2017-03-16|Face to Face|2017-03-31|
|WDEM02442927|          Leipzig|2017-02-10|      E-Mail|2017-02-28|
|WDEM07344311|         N�rnberg|2017-02-14|      E-Mail|2017-02-28|
|WDEM07344311|         N�rnberg|2017-02-17|     

In [8]:
prm_crm_call = prm_crm_call(raw_orbit_calls_sales_medical, raw_orbit_calls_market_access)

In [9]:
prm_crm_call.show(5)

+------------+---------------+----------+------------+----------+
| external_id|        call_id|      date|     channel|     month|
+------------+---------------+----------+------------+----------+
|WDEA01480207|a044A00001Drv6t|2017-08-22|Face to Face|2017-08-31|
|WDEM06966375|a044A00001Ds2nc|2017-08-24|       Phone|2017-08-31|
|WDEM07336892|a044A00001DsFc8|2017-08-25|      E-Mail|2017-08-31|
|WDEM08549267|a044A00001DsHmf|2017-08-24|       Phone|2017-08-31|
|WDEM00000964|a044A00001DsfnI|2017-08-25|Face to Face|2017-08-31|
+------------+---------------+----------+------------+----------+
only showing top 5 rows



In [23]:
def feat_crm_number_of_recent_activity(
    prm_crm_call: DataFrame) -> DataFrame:
    
    filters = ['Face to Face']
    filtered_activity = prm_crm_call.where(f.col('channel').isin(filters))
    
    
    calls_per_month = filtered_activity.groupBy("month", "external_id").agg(
        f.countDistinct(f.col("call_id")).alias("num_calls")
    )
    
    calls_per_month_copy = (
        calls_per_month.withColumnRenamed("num_calls", "calls_copy")
        .withColumnRenamed("external_id", "id_copy")
        .withColumnRenamed("month", "month_copy")
    )

    unaggregated_calls = calls_per_month.join(
        calls_per_month_copy,
        how="left",
        on=[
            calls_per_month.external_id == calls_per_month_copy.id_copy,
            f.datediff(calls_per_month.month, calls_per_month_copy.month_copy)
            <= 93,
        ],
    )

    aggregated_calls = unaggregated_calls.groupBy("external_id", "month").agg(
        f.sum("num_calls").alias("num_calls"), f.sum("calls_copy").alias("calls_copy")
    )

    combined_calls = aggregated_calls.withColumn(
        "number_of_recent_facetoface", f.col("num_calls") + f.col("calls_copy")
    )    

    return combined_calls.select("external_id", "month", "number_of_recent_facetoface")

In [30]:
def feat_crm_number_of_recent_activity_1(
    prm_crm_call: DataFrame) -> DataFrame:
    
    filters = ['Face to Face']
    filtered_activity = prm_crm_call.where(f.col('channel').isin(filters))
    
    
    calls_per_month = filtered_activity.groupBy("month", "external_id").agg(
        f.countDistinct(f.col("city")).alias("distinct_workplaces")
    )
    
    calls_per_month_copy = (
        calls_per_month.withColumnRenamed("distinct_workplaces", "distinct_workplaces_copy")
        .withColumnRenamed("external_id", "id_copy")
        .withColumnRenamed("month", "month_copy")
    )

    unaggregated_calls = calls_per_month.join(
        calls_per_month_copy,
        how="left",
        on=[
            calls_per_month.external_id == calls_per_month_copy.id_copy,
            f.datediff(calls_per_month.month, calls_per_month_copy.month_copy)
            <= 366,
        ],
    )

    aggregated_calls = unaggregated_calls.groupBy("external_id", "month").agg(
        f.sum("distinct_workplaces").alias("distinct_workplaces"),
        f.sum("distinct_workplaces_copy").alias("distinct_workplaces_copy")
    )

    combined_calls = aggregated_calls.withColumn(
        "number_of_distinct_workplaces", f.col("distinct_workplaces") + f.col("distinct_workplaces_copy")
    )    

    return combined_calls.select("external_id", "month", "number_of_distinct_workplaces")

In [31]:
feat_crm_number_of_recent_facetoface_1 = feat_crm_number_of_recent_activity_1(prm_crm_call_1)

In [32]:
feat_crm_number_of_recent_facetoface_1.show(20)

+--------------------+----------+-----------------------------+
|         external_id|     month|number_of_distinct_workplaces|
+--------------------+----------+-----------------------------+
|          A-04275216|2018-11-30|                            0|
|          A-04302573|2017-09-30|                            2|
|TP-PA-DE-2017-08-...|2018-01-31|                            2|
|        WDEA00001339|2015-11-30|                            4|
|        WDEA00001339|2016-11-30|                            4|
|        WDEA00001828|2018-05-31|                            4|
|        WDEA00001828|2018-04-30|                            4|
|        WDEA00004750|2015-07-31|                            0|
|        WDEA00005322|2014-12-31|                            1|
|        WDEA00005322|2016-08-31|                            2|
|        WDEA00010846|2016-01-31|                            2|
|        WDEA00024209|2018-12-31|                            4|
|        WDEA00024209|2019-03-31|       

In [24]:
feat_crm_number_of_recent_facetoface = feat_crm_number_of_recent_activity(prm_crm_call)

In [25]:
feat_crm_number_of_recent_facetoface.show(5)

+--------------------+----------+---------------------------+
|         external_id|     month|number_of_recent_facetoface|
+--------------------+----------+---------------------------+
|          A-04275216|2018-11-30|                          2|
|          A-04302573|2017-09-30|                          2|
|TP-PA-DE-2017-08-...|2018-01-31|                          2|
|        WDEA00001339|2015-11-30|                          4|
|        WDEA00001339|2016-11-30|                          2|
+--------------------+----------+---------------------------+
only showing top 5 rows



In [33]:
import sys
print(sys.executable)

/opt/miniconda/envs/ca-3.6/bin/python


In [5]:
import os
os.getcwd()

'/root'