In [1]:
import ast
import configparser
import json
from decimal import getcontext, Decimal
from glob import glob

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from Analyzer import Analyzer

class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``Decimal`` values as their string form."""

    def default(self, obj):
        """Return ``str(obj)`` for a Decimal; defer anything else to the base encoder."""
        if not isinstance(obj, Decimal):
            # The base implementation raises TypeError for unsupported types.
            return super().default(obj)
        return str(obj)
    
if __name__ == '__main__':
    # Batch pipeline: aggregate the snapshots, export the page scores, then
    # score every visitor's request list. Inside a notebook this guard is
    # always true, so the names assigned here stay in the kernel namespace and
    # are reused by the cells further down.

    config = configparser.ConfigParser()
    config.read('config.txt')
    snapshots_files = sorted(glob(config["snapshot-export"]["path"]+"*.csv"))
    key = config["analyzer"]["key"]
    min_count = config["analyzer"].getint("min_count")
    version = config["report-export"]["report_version"]

    print("Start aggregating and analyzing...")
    pageAnalyzer = Analyzer(key, snapshots_files, config["report-export"]["path"])
    panel_snapshot = pageAnalyzer.load_accumulated_snapshot(filter_columns=["lead-Good", "lead-Bad"], min_count=min_count)
    pageAnalyzer.save_accumulated_snapshot() # store the cumulative result
    panel_report, bayesian_metrics, labelProportion = Analyzer.calc_metrics(panel_snapshot, key_column=key)

    # obtain export required metrics
    # getcontext().prec = 30
    # Pandas default precision

    labelProportion_Decimal = {k: Decimal(v) for k, v in labelProportion.items()}

    # pathMetrics = pd.concat([np.log(bayesian_metrics), panel_report["traffic"]], axis=1) # log version 
    pathMetrics = pd.concat([bayesian_metrics, panel_report["traffic"]], axis=1)
    columnSchema = list(pathMetrics.columns)

    # One list of Decimals per page, in columnSchema order.
    pathMetrics_Decimal = pathMetrics.applymap(Decimal).T.to_dict("list")

    json_export = {
        "labelProportion": labelProportion_Decimal,
        "columnSchema": columnSchema,
        "pathMetrics": pathMetrics_Decimal
    }


    ################################
    # page_score json
    ################################
    with open("page_scores.json", 'wt', encoding='UTF-8') as f:
        json.dump(json_export, f, indent=4, cls=DecimalEncoder)

    print("Finished data analyzing and store the json data")

    ################################
    # valid url list
    ################################ 
    le = LabelEncoder()
    encode_urls = le.fit_transform(bayesian_metrics.index)
    np.save(f'./report/encode_urls_{version}.npy', le.classes_)
    print("largest code:", encode_urls.max())


    ################################
    # mcvisid report list
    ################################
    print("Processing probability report")
    preload = False
    unique_only = True
    if preload:
        # index_col=0 restores the index that to_csv wrote as the first,
        # unnamed column, so the reloaded frame matches the freshly built one.
        request_list = pd.read_csv(f"./report/mcvisid_request_list_{version}.csv", index_col=0)
        # List columns come back as text. literal_eval parses the literals
        # without executing arbitrary code the way eval would. Both list
        # columns are parsed because unique_page_code replaces page_code below.
        for list_column in ("page_code", "unique_page_code"):
            if list_column in request_list.columns:
                request_list[list_column] = request_list[list_column].apply(ast.literal_eval)
    else: 
        test_sample_size = None
        files = sorted(glob("./data/aemRaw_keyColumns_2022*p1v1.csv.gz"))
        if not files:
            raise FileNotFoundError("No raw files matched ./data/aemRaw_keyColumns_2022*p1v1.csv.gz")
        # Collect the per-file frames and concatenate once; concatenating
        # inside the loop re-copies everything loaded so far on each pass.
        frames = []
        for file in files:
            print("loading: ", file)
            df = pd.read_csv(file, compression="gzip", usecols=['mcvisid', 'clean_PageURL'], nrows=test_sample_size)
            frames.append(df)
        dfs = pd.concat(frames, axis=0)
        print("Finish loading")

        print("Aggregating user request list")    
        # Explicit copy so that adding page_code does not write into a slice
        # of dfs (the source of the SettingWithCopyWarning).
        filter_dfs = dfs[dfs["clean_PageURL"].isin(le.classes_)].copy()
        filter_dfs["page_code"] = le.transform(filter_dfs["clean_PageURL"])
        # One row per visitor holding the list of page codes they requested.
        request_list = (
            filter_dfs.groupby("mcvisid")["page_code"]
            .apply(lambda codes: codes.tolist())
            .rename("page_code")
            .reset_index()
        )
        request_list["unique_page_code"] = request_list["page_code"].apply(lambda x: list(set(x)))
        request_list["valid_request_count"] = request_list["unique_page_code"].apply(len)
        request_list.to_csv(f"./report/mcvisid_request_list_{version}.csv")
    print("Loaded request list report")


    if unique_only:
        # Score each page at most once per visitor.
        request_list = request_list.assign(page_code=request_list["unique_page_code"]).drop(columns=["unique_page_code"])

    print("Processing probability")

    request_list_prob = Analyzer.mcvisid_probs(request_list, le, panel_report, bayesian_metrics, labelProportion)
    request_list_prob.to_csv(f"./report/mcvisid_request_list_probs-server_{version}.csv")
    # request_list_prob.to_excel("./report/mcvisid_request_list_probs.xlsx")
    target_list = request_list_prob[(request_list_prob["valid_request_count"]>5) & (request_list_prob["valid_request_count"]<500)]
    target_list.to_excel(f"./report/mcvisid_request_list_probs_target_{version}.xlsx")

    ################################
    # validation - merge mcvisid label to compare
    ################################
    


Start aggregating and analyzing...
loading ./snapshot/snapshot_20220401-20220415_p1v1.csv
loading ./snapshot/snapshot_20220415-20220430_p1v1.csv
loading ./snapshot/snapshot_20220501-20220515_p1v1.csv
loading ./snapshot/snapshot_20220515-20220531_p1v1.csv
loading ./snapshot/snapshot_20220601-20220615_p1v1.csv
loading ./snapshot/snapshot_20220615-20220630_p1v1.csv
loading ./snapshot/snapshot_20220701-20220715_p1v1.csv
loading ./snapshot/snapshot_20220715-20220731_p1v1.csv
loading ./snapshot/snapshot_20220801-20220815_p1v1.csv
loading ./snapshot/snapshot_20220815-20220831_p1v1.csv
loading ./snapshot/snapshot_20220901-20220915_p1v1.csv
loading ./snapshot/snapshot_20220915-20220930_p1v1.csv
loading ./snapshot/snapshot_20221001-20221015_p1v1.csv
loading ./snapshot/snapshot_20221015-20221031_p1v1.csv
loading ./snapshot/snapshot_20221101-20221115_p1v1.csv
loading ./snapshot/snapshot_20221115-20221130_p1v1.csv
loading ./snapshot/snapshot_20221201-20221215_p1v1.csv
loading ./snapshot/snapshot_20

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Loaded request list report
Processing probability


In [5]:
# Keep visitors whose count of distinct valid pages lies strictly between 5 and 500.
target_list = request_list_prob.loc[
    lambda frame: frame["valid_request_count"].gt(5) & frame["valid_request_count"].lt(500)
]

## sample checking

In [40]:
# Use the full option key: the bare 'precision' alias is not accepted by every
# pandas release, whereas 'display.precision' is the documented name.
pd.set_option('display.precision', 100)

### calc code

In [24]:
# Restrict to visitors whose valid_request_count lies strictly between 5 and 100.
request_list_filter = request_list.loc[
    lambda frame: frame["valid_request_count"].gt(5) & frame["valid_request_count"].lt(100)
]

In [47]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Per-page, per-label term: bayesian_metrics * traffic / labelProportion,
# restricted to the encoded pages and re-indexed by page code.
conditional_parts = bayesian_metrics.multiply(panel_report["traffic"], axis="rows").div(labelProportion, axis=1)
conditional_parts = conditional_parts[conditional_parts.index.isin(le.classes_)]
conditional_parts.index = le.transform(conditional_parts.index)

## obtain TF Matrix
# The fixed vocabulary keeps the dtm columns in the same order as the rows of
# conditional_parts. The pattern is a raw string so "\d" is a regex class and
# not an invalid Python escape sequence.
vec = CountVectorizer(token_pattern=r"[\d]+", vocabulary=[str(i) for i in conditional_parts.index])
dtm = vec.fit_transform(request_list_filter["page_code"].astype(str))

## matrix calculation of Naive Bayesian
# Sum of log terms per visitor and label: (visitors x pages) @ (pages x labels).
# 1e-321 keeps log() finite when a term is exactly zero.
ratio1 = dtm @ np.log(conditional_parts + 1e-321).to_numpy()
const = np.log(np.array([labelProportion[col] for col in conditional_parts.columns]) + 1e-321)
nominators = np.exp(ratio1 + const)
nominators_df = pd.DataFrame(nominators, index=request_list_filter["mcvisid"], columns=conditional_parts.columns)

## aggregate by aspect
# Normalise the nominators within each aspect so its labels share one denominator.
probs = pd.DataFrame()
aspects = ["lead", "role", "industry"]
category_group = [[col for col in conditional_parts.columns if aspect in col ] for aspect in aspects]
for group in category_group:
    if not group:
        # No column for this aspect in the current metrics; nothing to normalise.
        continue
    probs[group] = nominators_df[group].div(nominators_df[group].sum(axis=1) + 1e-321, axis=0)


## wrap back to DF
mcvisid_probs = pd.DataFrame(probs, index=request_list_filter["mcvisid"], columns=conditional_parts.columns)        

result = pd.concat([request_list_filter.set_index("mcvisid"), mcvisid_probs], axis=1)


In [27]:
# Join the labels loaded from updated_labels_20230131.csv.gz onto the
# per-visitor probabilities, matching on mcvisid.
target_list = result
labels = pd.read_csv("./data/updated_labels_20230131.csv.gz", compression="gzip")
df = pd.merge(target_list, labels, on="mcvisid")

In [69]:
aspect = ["lead-Bad", "lead-Good"]
dff = df.dropna(subset=["lead-Good"]).sort_values("lead-Good")
# dff[(dff["label_lead"] == dff["lead-Good"]) & (dff["valid_request_count"]<10) & (dff["lead-Good"]>0.2) & (dff["lead-Good"]<0.9)]
# Rows where thresholding lead-Good at 0.5 agrees with label_lead, with fewer
# than 10 valid requests and a lead-Good score strictly inside (0.2, 0.9).
samples = (
    dff.loc[
        lambda frame: (frame["label_lead"] == (frame["lead-Good"] > 0.5))
        & (frame["valid_request_count"] < 10)
        & (frame["lead-Good"] > 0.2)
        & (frame["lead-Good"] < 0.9)
    ]
    .sort_values("lead-Good")
    .loc[:, ['mcvisid', 'page_code', 'lead-Good', 'lead-Bad']]
)

In [71]:
# Show the five rows with the highest lead-Good (samples is sorted ascending).
samples.iloc[-5:].style

Unnamed: 0,mcvisid,page_code,lead-Good,lead-Bad
56093,62189114061023640704252020270838166935,"[3080, 4267, 813, 240, 881, 243, 824, 4092]",0.8853535731941194,0.1146464268058805
18466,20528088904142157941257503168888003119,"[4320, 4289, 243, 4307, 4346, 4347]",0.8856363410118667,0.1143636589881332
30601,33901220597667739294452190890532306865,"[4097, 3080, 3099, 1068, 1069, 3536, 4085, 950, 1147]",0.8927875903717792,0.1072124096282207
58430,64815100308319927083853690183259639484,"[4002, 2980, 4004, 3181, 557, 243, 3988, 4085, 3639]",0.8935970094506887,0.1064029905493112
48682,53999771292848229630751307185745266414,"[4000, 3972, 3655, 4267, 814, 336, 881, 3988, 4085]",0.8976239128500264,0.1023760871499735


In [100]:
# good example
visited_codes = [3080, 4267, 813, 240, 881, 243, 824, 4092]
target_user = samples[samples["page_code"].map(set(visited_codes).issubset)]
print(target_user["mcvisid"].iloc[0], "visited pages: " , pathMetrics.iloc[visited_codes].index.tolist())

62189114061023640704252020270838166935 visited pages:  ['/products/hardware/allen-bradley.html', '/support/product/product-downloads.html', '/en-us.html', '/company/events.html', '/lang-selection.html', '/company/events/in-person-events/automation-fair.html', '/index.html', '/support.html']


In [90]:
# Look the pages up by URL label (decoded from the codes) rather than by row
# position, which is only right while pathMetrics is ordered like le.classes_.
pageKYields = pathMetrics.loc[le.inverse_transform(visited_codes), ["lead-Good", "lead-Bad", "traffic"]]
pageKYields.style

Unnamed: 0_level_0,lead-Good,lead-Bad,traffic
clean_PageURL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
/products/hardware/allen-bradley.html,0.3376659951040921,0.6623340048959077,0.0541177422276307
/support/product/product-downloads.html,0.3531431590068673,0.6468568409931325,0.0088771808268369
/en-us.html,0.3400936351247378,0.6599063648752621,0.1165669273429948
/company/events.html,0.4139344262295082,0.5860655737704918,0.0008581743747021
/lang-selection.html,0.3256191037735848,0.674380896226415,0.0039766768838656
/company/events/in-person-events/automation-fair.html,0.3887577585453274,0.6112422414546727,0.0067053665793837
/index.html,0.8461538461538461,0.1538461538461538,7.6204008682e-06
/support.html,0.3814886254042094,0.6185113745957905,0.0206653547853479


In [103]:
# conditional_parts is indexed by page code, so select the visited pages by
# label (.loc) rather than by row position. The running sum shows how each page
# moves the two lead log-scores.
np.log(conditional_parts.loc[visited_codes, ["lead-Good", "lead-Bad"]] + 1e-321).cumsum().style
# conditional_parts.iloc[visited_codes].iloc[:,0:2].cumprod().style

Unnamed: 0,lead-Good,lead-Bad
3080,-2.956224613480584,-2.8957675448868425
4267,-7.675310981788329,-7.622858105822551
813,-9.857068337748926,-9.754994157827676
240,-16.75375265820919,-16.91721003734748
881,-22.357021761318652,-22.405668038645626
243,-27.26060107733754,-27.46996610031802
824,-38.16627011106874,-40.69363889876584
4092,-41.96317438477478,-44.620564297496735


In [99]:
# target_mcvisid was not defined by any visible cell; derive it from the
# example visitor so this cell runs on a fresh kernel.
target_mcvisid = target_user["mcvisid"].iloc[0]
# Normalise the two lead nominators to reproduce the visitor's lead probabilities.
a = nominators_df.loc[target_mcvisid, ["lead-Good", "lead-Bad"]]
a/a.sum()

lead-Good    0.88535357319411955856480744841974228620529174...
lead-Bad     0.11464642680588053857970720628145500086247920...
Name: 62189114061023640704252020270838166935, dtype: float64

In [None]:
session variable
1. quite
2. not share cross


In [101]:
# Display the sample row(s) selected for the example visitor.
target_user

Unnamed: 0,mcvisid,page_code,lead-Good,lead-Bad
56093,62189114061023640704252020270838166935,"[3080, 4267, 813, 240, 881, 243, 824, 4092]",0.88535357319411955856480744841974228620529174...,0.11464642680588053857970720628145500086247920...


## classification report

In [3]:
from sklearn.metrics import classification_report
def evaluate_report(df, aspect, label_col, label_map, ignore_unknown=True):
    """Print a classification report for one aspect and return the rows used.

    Parameters
    ----------
    df : DataFrame holding the probability columns and the label column.
    aspect : list of probability column names; the prediction for a row is the
        name of the column with the largest value.
    label_col : name of the column holding the encoded true label.
    label_map : dict mapping a column name to its encoded label value.
    ignore_unknown : when True, rows whose label equals -1 are dropped first;
        when False, every row is evaluated.

    Returns
    -------
    The DataFrame of rows that were evaluated.
    """
    reverse_map = {v: k for k, v in label_map.items()}
    # Bind data on both branches; previously ignore_unknown=False left it unset.
    data = df[df[label_col] != -1] if ignore_unknown else df
    probs = data[aspect]
    pred = probs.columns[np.argmax(probs.values, axis=1)].tolist() # mapping into columns string
    label = data[label_col].map(reverse_map) # mapping into string 
    print(classification_report(label, pred, zero_division=1))
    return data


In [7]:
# Evaluate the lead aspect: column names double as class names, encoded 0/1.
label_col = "label_lead"
aspect = ["lead-Bad", "lead-Good"]
label_map = {name: code for code, name in enumerate(aspect)}
data = evaluate_report(df, aspect=aspect, label_col=label_col, label_map=label_map)

              precision    recall  f1-score   support

    lead-Bad       0.95      0.77      0.85     70611
   lead-Good       0.11      0.43      0.18      4727

    accuracy                           0.75     75338
   macro avg       0.53      0.60      0.52     75338
weighted avg       0.90      0.75      0.81     75338



In [8]:
# aspect = ["role-Other", "role-Engineer", "role-Manager", "role-Csuite"]
# label_col = "label_jobLevel"

# Standardized_JobLevel_Map = {
#         "role-Csuite": 4,
#         "role-Manager": 3,
#         "role-Engineer": 2,
#         "role-Marketing": -1,
#         "role-Unknown": 0,
#         "role-Other": -1,
#         }

# data = evaluate_report(df, aspect, label_col, Standardized_JobLevel_Map)

               precision    recall  f1-score   support

  role-Csuite       0.34      0.11      0.17      2308
role-Engineer       0.52      0.09      0.16     13413
 role-Manager       0.53      0.49      0.51     16006
   role-Other       0.00      1.00      0.00         0
 role-Unknown       1.00      0.00      0.00     16871

     accuracy                           0.19     48598
    macro avg       0.48      0.34      0.17     48598
 weighted avg       0.68      0.19      0.22     48598



In [105]:
# Encode every filtered page URL with the fitted LabelEncoder.
le.transform(filter_dfs["clean_PageURL"])

array([ 813,  813,  813, ..., 3346, 3346,  810])

In [118]:
# le_dict was not defined by any visible cell; build the URL -> code lookup
# from the fitted encoder (a class's code is its position in le.classes_).
le_dict = dict(zip(le.classes_, range(len(le.classes_))))
filter_dfs["clean_PageURL"].map(le_dict)

8           813
9           813
10          813
11          813
12          813
           ... 
1906079       1
1906080       1
1906083    3346
1906084    3346
1906085     810
Name: clean_PageURL, Length: 29114466, dtype: int64