## Setup

In [1]:
# Connect using pyodbc, sqlalchemy, and pandas
import sqlalchemy
import numpy as np
import pandas as pd

server = "sqlsvr-0092-mdp-02.85f8a2f57eaf.database.windows.net"
database = "Staging"
username = "pisrc-inkoo"
password = input("Enter database password: ")
driver = "ODBC Driver 17 for SQL Server"

engine = sqlalchemy.create_engine(
    f"mssql+pyodbc://{username}:{password}@{server}/{database}?driver={driver.replace(' ', '+')}"
)


def query_db(query, params=None):
    return pd.read_sql(query, engine, params=params)


df = query_db("SELECT @@version;")
print(df[""][0])


Microsoft SQL Azure (RTM) - 12.0.2000.8 
	Apr 18 2022 13:01:43 
	Copyright (C) 2021 Microsoft Corporation



# Pathfactory Content Scoring

In [4]:
# ContentJourney is a varchar that lists the content viewed
# Query all pathfactory visitors that are in Eloqua and have an assigned leadstatus
# Aggregate their pathfactory visit information

df = query_db(
    """
    SELECT
        e.EmailAddress,
        MAX(l.ra_leadstage) AS ra_leadstage,
        SUM(p.AssetsViewed) AS AssetsViewed,
        STRING_AGG(CONVERT(NVARCHAR(max), p.ContentJourney), '') AS ContentJourney,
        SUM(p.EngagementScore) as EngagementScore,
        SUM(p.EngagementTime) as EngagementTime,
        STRING_AGG(CONVERT(NVARCHAR(max), p.ExperienceName), '; ') AS ExperienceName
    FROM
        Staging.elq.Contact AS e,
        (   
            SELECT emailaddress1, MAX(ra_leadstage) AS ra_leadstage
            FROM crm.Lead
            GROUP BY emailaddress1
        ) AS l,
        Staging.elq.PathFactory AS p
    WHERE e.EmailAddress = l.emailaddress1
        AND e.EloquaContactId = p.EloquaContactId
    GROUP BY
        e.EmailAddress;
    """
)
df


Unnamed: 0,EmailAddress,ra_leadstage,AssetsViewed,ContentJourney,EngagementScore,EngagementTime,ExperienceName
0,4lifefreedoms@gmail.com,3,1,[iSuite-ebook],0,17,InnovationSuite
1,565542979@qq.com,5,38,[data-ingest-demo][ThingWorx_8_Video][thingwor...,34,4968,InnovationSuite; Getting to know PTC; HMI; Dat...
2,a.baffoejames@yahoo.com,3,4,[LNS-Connected-Worker][HarborResearchFull][Pra...,8,88,Connected Worker; Digital Transformation; IIoT...
3,a.desai@seksaria.com,3,4,[3AgilePrinciplesDX][Caterpillar-Unearths][3Ag...,18,606,Industrial Analytics
4,a.espinoza31@yahoo.com,3,1,[georgia-pacific-connected-workforce],3,102,Connected Worker
...,...,...,...,...,...,...,...
3585,zimmerd557@gmail.com,2,1,[liveworxkeynote],0,24,LiveWorx
3586,zjna@novonordisk.com,5,3,[INFO-BR006A-EN-P][INFO-BR006A-EN-P][INFO-BR00...,2,32,FactoryTalk Quality Application; FactoryTalk Q...
3587,zrolnik@racontrols.pl,2,9,[NaturalSearchDemo][CMMs-Software-platform][CM...,10,1958,LiveWorx ; Fiix; Fiix; Fiix; Fiix; Is FactoryT...
3588,zwang@quantumscape.com,3,2,[PractitionerGuide-DeployingIIoT][Practitioner...,8,91,IIoT; IIoT


In [107]:
# Get all content items
pathfactory_content = set()
for index, row in df.iterrows():
    content_journey = row["ContentJourney"]
    content_list = content_journey[1:-2].split("][")

    for content_item in content_list:
        pathfactory_content.add(content_item)

len(pathfactory_content)


666

In [5]:
is_qualified_list = list()  # is a user a qualified lead or not
content_journey_list = list()  # split content journey items
for index, row in df.iterrows():
    # instead of for loop, use pandas df apply or map or something similar
    is_qualified = 1 if row["ra_leadstage"] >= 5 else 0  # see mapping above
    is_qualified_list.append(is_qualified)

    content_journey = row["ContentJourney"]
    content_list = content_journey[1:-1].split("][")
    content_set = set(content_list)
    content_journey_list.append(content_set)


# one-hot encode the content_journey
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
content_one_hot = pd.DataFrame(
    mlb.fit_transform(content_journey_list), columns=mlb.classes_
)
content_one_hot  # for each row, a 1 indicates that the user viewed that content
# if we have duration viewed for each content, replace one-hot with duration viewed
# for additional features on each content item, using "embedding" (ask Wei)
# consider adding engagement time (possibly normalize)


Unnamed: 0,0119-000163-3-d-vufo,1gs-w8w-h,1rx-ee-fd,24f3f0734389d0785593,3-agile-principles-ES,3-agile-principles-digital-transformation,3-ways-blog,3-ways-to-save-paperless,33seconds,3AgilePrinciplesDX,...,value-workshop,video-marketing-sale,videohillshirebrandquality,vp-operations,webinar-how-to-get-s,what-is-preventative-maintenance-fiix,whoisleadingdigitaltransformation,whoisleadingdigitaltransmation,why-register,window-into-extruder-ops-article
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3588,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# is_qualified_list is the target (Y)
# content_one_host is the input (X)
# currently not using any other features

# split into train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    content_one_hot, is_qualified_list, test_size=1 / 3, random_state=None, shuffle=True
)


In [None]:
# check feature correlation for outliers
corr_train = pd.DataFrame(
    np.column_stack((y_train, x_train)),
    columns=(["qualified"] + x_train.columns.to_list()),
)
corr = corr_train.corr()
corr.iloc[0].dropna().sort_values()


In [51]:
from sklearn.naive_bayes import MultinomialNB

mnb_model = MultinomialNB()
mnb_model.fit(x_train, y_train)

mnb_model.score(x_test, y_test)


0.5472013366750209

In [57]:
# conditional probability for each content item
mnb_probs = np.exp(mnb_model.feature_log_prob_[0]) * 100
mbn_scores = pd.DataFrame(np.column_stack((mlb.classes_, mnb_probs)))
mbn_scores.to_csv("mnb_scores.csv")
mbn_scores


Unnamed: 0,0,1
0,0119-000163-3-d-vufo,0.02324
1,1gs-w8w-h,0.02324
2,1rx-ee-fd,0.046479
3,24f3f0734389d0785593,0.092958
4,3-agile-principles-ES,0.046479
...,...,...
395,what-is-preventative-maintenance-fiix,0.046479
396,whoisleadingdigitaltransformation,0.255636
397,whoisleadingdigitaltransmation,0.511271
398,why-register,0.162677


In [53]:
from sklearn.linear_model import LogisticRegression

lg_model = LogisticRegression(solver="newton-cg", penalty="l2")
lg_model.fit(x_train, y_train)

lg_model.score(x_test, y_test)
# gives suspiciously accurate result
# consider cross validation


0.5463659147869674

In [54]:
lg_scores = pd.DataFrame(np.column_stack((mlb.classes_, lg_model.coef_[0])))
lg_scores.to_csv("lg_scores.csv")
lg_scores


Unnamed: 0,0,1
0,0119-000163-3-d-vufo,0.0
1,1gs-w8w-h,0.20337
2,1rx-ee-fd,0.101769
3,24f3f0734389d0785593,0.601723
4,3-agile-principles-ES,0.350292
...,...,...
395,what-is-preventative-maintenance-fiix,-0.361093
396,whoisleadingdigitaltransformation,-0.100448
397,whoisleadingdigitaltransmation,-0.100636
398,why-register,0.344465


In [2]:
import pandas as pd
a = pd.read_csv("./data/aemRaw_keyColumns_20220401-20220415_p1v1.csv.gz", compression="gzip")

FileNotFoundError: [Errno 2] No such file or directory: 'engagement_scoring/data/aemRaw_keyColumns_20220401-20220415_p1v1.csv.gz'

## Notes

- Use different models to generate content scores, then calculate correlation between scores to compare models
- Consider how to limit the amount of data processed when calculating new scores (elbow method)
    - Perhaps limit to newest 10000 entries in the eloqua/adobe analytics
    - Consider how often to calculate and updates scores
- Store engagement scores in DB:
    - Table with columns: Binge Experience ID/name, content item name, engagement score
    - When we run data processing, then update engagement score for existing items, add new rows for new content as needed
    - Consider whether we should calculate score from scratch or adjust previous score
- Write calculated scores to AEM via endpoint


In [9]:
import numpy as np
import pandas as pd

from Analyzer import Analyzer
import configparser
from glob import glob
from decimal import getcontext, Decimal
import json

class DecimalEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Decimal):
            return str(obj)
        return json.JSONEncoder.default(self, obj)
    
if __name__ == '__main__':
    
    config = configparser.ConfigParser()
    config.read('config.txt')
    snapshots_files = sorted(glob(config["snapshot-export"]["path"]+"*.csv"))
    key = config["analyzer"]["key"]
    min_count = config["analyzer"].getint("min_count")

    print("Start aggregating and analyzing...")
    pageAnalyzer = Analyzer(key, snapshots_files, config["report-export"]["path"])
    panel_snapshot = pageAnalyzer.load_accumulated_snapshot(filter_columns=["lead-Good", "lead-Bad"], min_count=min_count)
    pageAnalyzer.save_accumulated_snapshot() # store the cumulative result
    panel_report, bayesian_metrics, labelProportion = Analyzer.calc_metrics(panel_snapshot, key_column=key)
    
    # obtain export required metrics
    getcontext().prec = 10
    # Pandas default precision
    
    labelProportion = {k: Decimal(v) for k, v in labelProportion.items()}
    
    # pathMetrics = pd.concat([np.log(bayesian_metrics), panel_report["traffic"]], axis=1) # log version 
    pathMetrics = pd.concat([bayesian_metrics, panel_report["traffic"]], axis=1)
    columnSchema = list(pathMetrics.columns)

    pathMetrics_dense = pathMetrics.applymap(Decimal).T.to_dict("list")

    json_export = {
        "exp": Decimal(1)/Decimal(3),
        "labelProportion": labelProportion,
        "columnSchema": columnSchema,
        "pathMetrics": pathMetrics_dense
    }

    with open("page_scores-100.json", 'wt', encoding='UTF-8') as f:
        json.dump(json_export, f, indent=4, cls=DecimalEncoder)
        
    print("Finished data analyzing and store the json data")



Start aggregating and analyzing...
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220401-20220415_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220415-20220430_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220501-20220515_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220515-20220531_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220601-20220615_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220615-20220630_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220701-20220715_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220715-20220731_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220801-20220815_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220815-20220831_p1v1.csv
loading /home/samuel/src/engagement_scoring/snapshot/snapshot_20220901-2022

In [7]:
pathMetrics_dense

{'.html': [Decimal('0.1831513359541606933422741576578118838369846343994140625'),
  Decimal('0.72751780670680499252256367981317453086376190185546875'),
  Decimal('0.01432497609924904095890507704780247877351939678192138671875'),
  Decimal('0.17073822864240073560182509027072228491306304931640625'),
  Decimal('0.243524593687233725791685401418362744152545928955078125'),
  Decimal('0.5331214078017818547294837117078714072704315185546875'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0.06590586419753086122597096618846990168094635009765625'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0.263623456790123444903883864753879606723785400390625'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0.06590586419753086122597096618846990168094635009765625'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0.1318117283950617224519419323769398033618927001953125'),
  Decimal('0'),
  Decimal('0'),
  Decimal('0')