### Libraries

In [1]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
from typing import Dict, Text
from tensorflow.keras import layers  
from tensorflow.keras import Model
import tensorflow_ranking as tfr
from tensorflow.keras import optimizers
import pycountry_convert as pc
from scipy.stats import kendalltau
import json

import warnings
warnings.filterwarnings('ignore')

c:\Users\vijay.rameshkumar\Anaconda3\envs\myenv\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\vijay.rameshkumar\Anaconda3\envs\myenv\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


### Weighted Harmonic Mean

In [2]:
def weighted_harmonic_mean(x, y, x_w=0.6, y_w=0.4):
    if x!=0.0 and y != 0.0:
        return 1 / ((x_w/x)+(y_w/y))
    elif x==0.0 and y <= 0.0:
        return 0.09
    elif y<=0:
        return 0
    elif x<=0:
        return 0

## Layer 1

### 1. Actual - continent_lang_study_subject

In [3]:
actual_layer1 = pd.read_csv('adj_matrx_v2/01_adjacency_continent_lang_study_subject_weighted_suppliers_average_profit.csv')
suppliers = actual_layer1.suppliers_info.values.tolist()
actual_layer1 = actual_layer1.set_index('suppliers_info')
actual_layer1 = actual_layer1.sort_index()
actual_layer1 = actual_layer1.stack().reset_index()
actual_layer1.columns = ['suppliers__ref', 'projects__study_types_subject_ids', 'positive_score']
actual_layer1['suppliers__ref'] = actual_layer1['suppliers__ref'].astype('str')
actual_layer1['positive_score'] = actual_layer1['positive_score'].astype('float32')
actual_layer1 = actual_layer1[actual_layer1.positive_score != 0.00]

def get_actual_layer1(supplier_key, panelist_key):
    return actual_layer1[(actual_layer1.suppliers__ref.str.startswith(supplier_key) & (actual_layer1.projects__study_types_subject_ids.str.startswith(panelist_key)))].sort_values(by='positive_score', ascending=False)[['suppliers__ref', 'positive_score']].values.tolist()

### Model1 - Ranking Prediction Model (RMSE)

In [3]:
with open('utils_v2/continent_lang_study_sub_unique_suppliers_subjects_lookup.json', 'r') as openfile:
    json_object = json.load(openfile)

unique_subjects = json_object['unique_subjects']
unique_suppliers = json_object['unique_suppliers']

class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 64

    # Compute embeddings for users.
    self.supplier_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_suppliers, mask_token=None),
      tf.keras.layers.Embedding(len(unique_suppliers) + 1, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.subject_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_subjects, mask_token=None),
      tf.keras.layers.Embedding(len(unique_subjects) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="leaky_relu"),
      tf.keras.layers.Dense(32, activation="leaky_relu"),
      tf.keras.layers.Dense(16, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    supplier_id, subject_id = inputs

    supplier_embedding = self.supplier_embeddings(supplier_id)
    subject_embedding = self.subject_embeddings(subject_id)

    return self.ratings(tf.concat([supplier_embedding, subject_embedding], axis=1))

class SupplierRecommender(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
        (features["supplier_id"], features["subject_id"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("score")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [4]:
model1 = SupplierRecommender()
model1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
model1.load_weights('models_2/final_model_weights')


def model1_predict(list_of_suppliers, subjects, model=model1):
    preds = []

    for i in list_of_suppliers:
        prediction = model({
            "supplier_id": np.array([i]),
            "subject_id": np.array([subjects])
            }).numpy().tolist()[0][0]
        
        preds.append([i, prediction])
    return preds

In [72]:
with open('utils_v2/layer1_suppliers_lookup.json', 'r') as openfile:
    layer1_suppliers_lookup = json.load(openfile)

with open('utils_v2/layer2_suppliers_lookup.json', 'r') as openfile:
    layer2_suppliers_lookup = json.load(openfile)

with open('Lookups/continent_level_ranking.json', 'r') as openfile:
    layer3_suppliers_lookup = json.load(openfile)

with open('Lookups/global_level_ranking.json', 'r') as openfile:
    layer4_suppliers_lookup = json.load(openfile)
layer4_suppliers_lookup = [list(i) for i in list(layer4_suppliers_lookup.items())]


def infer(supplier_key, panelist_key, top_k=10, suppliers=None):

    req = top_k

    key1 = supplier_key + "@" + panelist_key.split("@")[0]
    key2 = supplier_key.split("@")[0] + "@" + panelist_key.split("@")[0]
    key3 = supplier_key.split("@")[0]
    
    keys = supplier_key+"@"+panelist_key

    ############# layer 1 ##############
    supp = []
    if suppliers == None:
        for i in layer1_suppliers_lookup.keys():
            if i.startswith(key1):
                supp.extend(layer1_suppliers_lookup[i])

        for i in unique_suppliers:
            if i.startswith(key1):
                supp.extend(layer1_suppliers_lookup[i])
        
    elif suppliers:
        supp.extend(suppliers)
    
    supp = list(set(supp))

    model1_pred_order = model1_predict(supp, panelist_key)
    # model1_pred_order.sort(key=lambda x:x[1], reverse=True)
    suppliers1 = pd.DataFrame(model1_pred_order[:top_k], columns=['supplier_ref', 'score'])
    suppliers1 = suppliers1.sort_values(by='score')

    if suppliers == None:
        suppliers1['info'] = 'l1'
    elif suppliers != None:
        suppliers1['info'] = 'IR'
    
    suppliers1['key'] = key1
    req = len(suppliers1)
    final = suppliers1

    ############## layer 2 ##############
    if req < top_k:
        l2 = layer2_suppliers_lookup.get(key2, None)

        if l2 != None:
            suppliers2 = pd.DataFrame(l2.items(), columns=['supplier_ref', 'score'])
            suppliers2 = suppliers2.head(top_k)
            suppliers2['info'] = 'l2'
            suppliers2['key'] = key2
        else:
            suppliers2 = pd.DataFrame()
        
        final = pd.concat([suppliers1, suppliers2])
        final.drop_duplicates(subset='supplier_ref', inplace=True)
        final = final.head(top_k)
        req = req - len(final)
    
    # ############## layer 3 ##############
        if req <= 0:
            l3 = layer3_suppliers_lookup.get(key3, None)

            if l3 != None:
                suppliers3 = pd.DataFrame(l3.items(), columns=['supplier_ref', 'score'])
                suppliers3['info'] = 'l3'
                suppliers3['key'] = key3
                
            else:
                suppliers3 = pd.DataFrame()
            
            final = pd.concat([final, suppliers3])
            final.drop_duplicates(subset='supplier_ref', inplace=True)
            final = final.head(top_k)
            req = req - len(final)

    # ############# layer 4 ###############
            if req <= 0:
                suppliers4 = pd.DataFrame(layer4_suppliers_lookup[:top_k], columns=['supplier_ref', 'score'])
                suppliers4['info'] = 'l4'
                suppliers4['key'] = 'global'
                final = pd.concat([final, suppliers4])
                final.drop_duplicates(subset='supplier_ref', inplace=True)
                final = final.head(top_k)

    return final.reset_index(drop=True)

In [73]:
# import itertools

# cont_lang = actual_layer1.suppliers__ref.apply(lambda x: "@".join(x.split('@')[:-1])).unique().tolist()
# study_sub = actual_layer1.projects__study_types_subject_ids.unique().tolist()
# total = [cont_lang, study_sub]
# combinations = [p for p in itertools.product(*total)]

# for index, comb in enumerate(combinations):

#     if index == 1700:
#         final1 = infer(comb[0], comb[1])
#         final2 = infer(comb[0], comb[1], suppliers=final1.supplier_ref.values.tolist())
#         print(comb)
#         break

### Inference for query dataset

In [74]:
query = pd.read_csv('test/query.csv')

query['supplier_info'] = query['continents'] + "@" + query['sample_pulls__language']
query['study_types'] = query['projects__study_types_ids'] + "@" + query['projects__study_types_subject_ids']
query = query[['projects__name', 'supplier_info', 'study_types', 'suppliers_ref']]
query


Unnamed: 0,projects__name,supplier_info,study_types,suppliers_ref
0,Campaign Monitor- Spain post wave (P99272),europe@eng,consumer_study@other,271537113600
1,"18+ Department: IT, Operations/production, Pu...",oceania@eng,b2b@it_decision_maker,524301442574293
2,25-55 gen pop,europe@eng,consumer_study@other,271
3,30k+ Per month HHI,asia@eng,consumer_study@other,271
4,811802 - Australia,oceania@eng,consumer_study@other,271537601600
5,Advertising/Marketing/Media BDMs,asia@eng,b2b@marketing_advertising,537271
6,Age 18-45 YO - Drinkers,oceania@eng,consumer_study@other,124574391301442601537271
7,Apple TV First Look (Other # 9) MX,north-america@eng,consumer_study@other,601537271
8,"Australia, M&F, 18-44 & US, M&F, 18-65, homeow...",oceania@eng,consumer_study@other,438588574301
9,CA-Luxury Quebec French,north-america@eng,consumer_study@other,600601537271448500


In [75]:
# IR_rec = pd.read_csv('test/suggested_suppliers_final.csv')

# for col in IR_rec.columns:
#     IR_rec[col] = IR_rec[col].apply(lambda x: x.strip() if type(x) == 'str' else x)

# IR_rec['req_study_type'] = IR_rec['req_study_type'].str.lower().apply(lambda x: x.replace(" ", "_"))
# IR_rec['req_continent'] = IR_rec['req_continent'].str.lower().apply(lambda x: x.replace(" ", "-"))

# IR_rec['req_subject'] = IR_rec['req_subject'].str.lower()

# IR_rec['supplier_info'] = IR_rec['req_continent'] + "@" + IR_rec['req_lang']
# IR_rec['study_info'] = IR_rec['req_study_type'] + "@" + IR_rec['req_subject']

# IR_rec = IR_rec[['projects__name', 'supplier_info', 'study_info', 'suggested_rl']]


In [76]:
# project = []
# study_type = []
# supplier_info = []
# actual_supplier_info = []
# pred1_supplier_info = []
# pred2_supplier_info = []
# total_layers = []

# for j in IR_rec.itertuples():
#     j = j[1:]
#     supp = j[-1].strip('}{').split(',')
#     final1 = infer(j[-3], j[-2], suppliers=supp)
#     # final2 = infer2(j[-3], j[-2])
#     # final3 = infer3(j[-3], j[-2])

#     project.append(j[0])
#     study_type.append(j[-2])
#     supplier_info.append(j[-3])
#     # actual_supplier_info.append(j[-1])
#     pred1_supplier_info.append(final1.supplier_ref.values.tolist())
#     # pred2_supplier_info.append(final2.supplier_ref.values.tolist())
#     # total_layers.append(final3)

In [77]:
# test = pd.DataFrame()

# test['project_name'] = pd.Series(project)
# test['study_type'] = pd.Series(study_type)
# test['supplier_info'] = pd.Series(supplier_info)
# # test['actual_supplier_info'] = pd.Series(actual_supplier_info)
# test['pred_supplier_ref'] = pd.Series(pred1_supplier_info)
# test['continent'] = test.supplier_info.apply(lambda x: x.split("@")[0])
# test['languages'] = test.supplier_info.apply(lambda x: x.split("@")[1])
# # test['pred2_supplier_info'] = pd.Series(pred2_supplier_info)

# test[['project_name', 'continent', 'languages', 'study_type', 'pred_supplier_ref']].to_csv('test/IR_rec.csv', index=False)

In [78]:
project = []
study_type = []
supplier_info = []
actual_supplier_info = []
k10_pred1_supplier_info = []
k20_pred1_supplier_info = []
total_layers = []

for j in query.itertuples():
    j = j[1:]
    final1 = infer(j[-3], j[-2])
    final2 = infer(j[-3], j[-2], top_k=20)
    # final3 = infer3(j[-3], j[-2])

    project.append(j[0])
    study_type.append(j[-2])
    supplier_info.append(j[-3])
    actual_supplier_info.append([str(k) for k in j[-1].split(",")])
    k10_pred1_supplier_info.append(final1.supplier_ref.values.tolist())
    k20_pred1_supplier_info.append(final2.supplier_ref.values.tolist())
    # total_layers.append(final3)

In [79]:
test = pd.DataFrame()

test['project_name'] = pd.Series(project)
test['study_type'] = pd.Series(study_type)
test['supplier_info'] = pd.Series(supplier_info)
test['actual_supplier_info'] = pd.Series(actual_supplier_info)
test['k10@pred_supplier_ref'] = pd.Series(k10_pred1_supplier_info)
test['k20@pred_supplier_ref'] = pd.Series(k20_pred1_supplier_info)
test['continent'] = test.supplier_info.apply(lambda x: x.split("@")[0])
test['languages'] = test.supplier_info.apply(lambda x: x.split("@")[1])

test['k10_common'] = list(map(lambda x,y: len(set([str(i) for i in x]).intersection(y)), test.actual_supplier_info.values.tolist(), test['k10@pred_supplier_ref'].values.tolist()))
test['k10_difference'] = list(map(lambda x,y: len(set([str(i) for i in x]).difference(y)), test.actual_supplier_info.values.tolist(), test['k10@pred_supplier_ref'].values.tolist()))

test['k20_common'] = list(map(lambda x,y: len(set([str(i) for i in x]).intersection(y)), test.actual_supplier_info.values.tolist(), test['k20@pred_supplier_ref'].values.tolist()))
test['k20_difference'] = list(map(lambda x,y: len(set([str(i) for i in x]).difference(y)), test.actual_supplier_info.values.tolist(), test['k20@pred_supplier_ref'].values.tolist()))

test[['project_name', 'continent', 'languages', 'study_type', 'actual_supplier_info', 'k10@pred_supplier_ref', 'k20@pred_supplier_ref', 'k10_common', 'k10_difference', \
    'k20_common', 'k20_difference']].to_csv('test/standlaone_recsys.csv', index=False)
# test['pred2_supplier_info'] = pd.Series(pred2_supplier_info)

test

Unnamed: 0,project_name,study_type,supplier_info,actual_supplier_info,k10@pred_supplier_ref,k20@pred_supplier_ref,continent,languages,k10_common,k10_difference,k20_common,k20_difference
0,Campaign Monitor- Spain post wave (P99272),consumer_study@other,europe@eng,"[271, 537, 113, 600]","[160, 590, 496, 558, 549, 124, 574, 586, 566, ...","[160, 590, 496, 558, 549, 124, 574, 586, 566, ...",europe,eng,0,4,1,3
1,"18+ Department: IT, Operations/production, Pu...",b2b@it_decision_maker,oceania@eng,"[524, 301, 442, 574, 293]","[301, 229, 588, 72, 566, 543, 524, 359, 574, 528]","[301, 229, 588, 72, 566, 543, 524, 359, 574, 5...",oceania,eng,3,2,3,2
2,25-55 gen pop,consumer_study@other,europe@eng,[271],"[160, 590, 496, 558, 549, 124, 574, 586, 566, ...","[160, 590, 496, 558, 549, 124, 574, 586, 566, ...",europe,eng,0,1,0,1
3,30k+ Per month HHI,consumer_study@other,asia@eng,[271],"[594, 445, 141, 359, 341, 588, 471, 144, 566, ...","[594, 445, 141, 359, 341, 588, 471, 144, 566, ...",asia,eng,0,1,0,1
4,811802 - Australia,consumer_study@other,oceania@eng,"[271, 537, 601, 600]","[293, 144, 586, 566, 450, 496, 124, 600, 341, ...","[293, 144, 586, 566, 450, 496, 124, 600, 341, ...",oceania,eng,1,3,4,0
5,Advertising/Marketing/Media BDMs,b2b@marketing_advertising,asia@eng,"[537, 271]","[460, 574, 559, 183, 588, 72, 301, 388, 124, 592]","[460, 574, 559, 183, 588, 72, 301, 388, 124, 5...",asia,eng,0,2,1,1
6,Age 18-45 YO - Drinkers,consumer_study@other,oceania@eng,"[124, 574, 391, 301, 442, 601, 537, 271]","[293, 144, 586, 566, 450, 496, 124, 600, 341, ...","[293, 144, 586, 566, 450, 496, 124, 600, 341, ...",oceania,eng,1,7,5,3
7,Apple TV First Look (Other # 9) MX,consumer_study@other,north-america@eng,"[601, 537, 271]","[450, 410, 590, 578, 496, 124, 445, 72, 533, 586]","[450, 410, 590, 578, 496, 124, 445, 72, 533, 5...",north-america,eng,0,3,0,3
8,"Australia, M&F, 18-44 & US, M&F, 18-65, homeow...",consumer_study@other,oceania@eng,"[438, 588, 574, 301]","[293, 144, 586, 566, 450, 496, 124, 600, 341, ...","[293, 144, 586, 566, 450, 496, 124, 600, 341, ...",oceania,eng,1,3,2,2
9,CA-Luxury Quebec French,consumer_study@other,north-america@eng,"[600, 601, 537, 271, 448, 500]","[450, 410, 590, 578, 496, 124, 445, 72, 533, 586]","[450, 410, 590, 578, 496, 124, 445, 72, 533, 5...",north-america,eng,0,6,1,5


In [17]:
# for i in test[['actual_supplier_info', 'pred1_supplier_info', 'pred2_supplier_info', 'study_type', 'supplier_info']].values.tolist():
#     print("\n", i[3:])
#     print(i[0].split(','), i[1])

#     print(f"model1 : common suppliers : {set(i[0].split(',')).intersection(set(i[1]))}")
#     print(f"model1 : difference suppliers : {set(i[0].split(',')).difference(set(i[1]))}")

#     print(f"model2 : common suppliers : {set(i[0].split(',')).intersection(set(i[2]))}")
#     print(f"model2 : difference suppliers : {set(i[0].split(',')).difference(set(i[2]))}")

In [None]:
test

Unnamed: 0,project,study_type,supplier_info,actual_supplier_info,pred1_supplier_info,pred2_supplier_info
0,Campaign Monitor- Spain post wave (P99272),consumer_study@other,europe@eng,271537113600,"[271, 593, 590, 445, 588, 601, 458, 586, 574, ...","[160, 590, 496, 558, 549, 124, 574, 586, 566, ..."
1,"18+ Department: IT, Operations/production, Pu...",b2b@it_decision_maker,oceania@eng,524301442574293,"[271, 593, 445, 588, 542, 601, 229, 574, 572, ...","[301, 229, 588, 72, 566, 543, 524, 359, 574, 528]"
2,30k+ Per month HHI,consumer_study@other,asia@eng,271,"[271, 593, 471, 445, 588, 601, 458, 574, 572, ...","[594, 445, 141, 359, 341, 588, 471, 144, 566, ..."
3,811802 - Australia,consumer_study@other,oceania@eng,271537601600,"[271, 593, 588, 601, 458, 586, 574, 438, 359, ...","[293, 144, 586, 566, 450, 496, 124, 600, 341, ..."
4,Advertising/Marketing/Media BDMs,b2b@marketing_advertising,asia@eng,537271,"[271, 593, 445, 588, 542, 601, 586, 229, 574, ...","[460, 574, 559, 183, 588, 72, 301, 388, 124, 592]"
5,Age 18-45 YO - Drinkers,consumer_study@other,oceania@eng,124574391301442601537271,"[271, 593, 588, 601, 458, 586, 574, 438, 359, ...","[293, 144, 586, 566, 450, 496, 124, 600, 341, ..."
6,Apple TV First Look (Other # 9) MX,consumer_study@other,north-america@eng,271,"[271, 593, 527, 401, 458, 183, 46, 591, 533, 581]","[450, 410, 590, 578, 496, 124, 445, 72, 533, 586]"
7,"Australia, M&F, 18-44 & US, M&F, 18-65, homeow...",consumer_study@other,oceania@eng,438588574301,"[271, 593, 588, 601, 458, 586, 574, 438, 359, ...","[293, 144, 586, 566, 450, 496, 124, 600, 341, ..."
8,CA-Luxury Quebec French,consumer_study@other,north-america@eng,600601537271500,"[271, 593, 527, 401, 458, 183, 46, 591, 533, 581]","[450, 410, 590, 578, 496, 124, 445, 72, 533, 586]"
9,Construction and Trades people,b2b@other,europe@eng,293301442574588572,"[271, 593, 549, 445, 588, 542, 601, 458, 586, ...","[460, 141, 124, 301, 574, 566, 592, 588, 572, ..."


In [None]:
l1_matches = []
l1_mismatches = []

l2_matches = []
l2_mismatches = []

l3_matches = []
l3_mismatches = []

l4_matches = []
l4_mismatches = []

total_length = []

model1_matches = []
model1_mismatches = []

model2_matches = []
model2_mismatches = []

for idx, i in enumerate(actual_supplier_info):
    i = i.split(",")

    l1 = set(total_layers[idx][0].supplier_ref.values.tolist())
    l2 = set(total_layers[idx][1].supplier_ref.values.tolist())
    l3 = set(total_layers[idx][2].supplier_ref.values.tolist())
    l4 = set(total_layers[idx][3].supplier_ref.values.tolist())

    model1_matches.append(len(set(i).intersection(set(pred1_supplier_info[idx]))))
    model2_matches.append(len(set(i).intersection(set(pred2_supplier_info[idx]))))

    l1_matches.append(len(set(i).intersection(l1)))
    l2_matches.append(len(set(i).intersection(l2)))
    l3_matches.append(len(set(i).intersection(l3)))
    l4_matches.append(len(set(i).intersection(l4)))

    l1_mismatches.append(len(set(i).difference(l1)))
    l2_mismatches.append(len(set(i).difference(l2)))
    l3_mismatches.append(len(set(i).difference(l3)))
    l4_mismatches.append(len(set(i).difference(l4)))

    model1_mismatches.append(len(set(i).difference(set(pred1_supplier_info[idx]))))
    model2_mismatches.append(len(set(i).difference(set(pred2_supplier_info[idx]))))

    total_length.append(len(i))

In [None]:
result = pd.DataFrame()

result['key'] = test['study_type'] + "@" + test['supplier_info']
result['actual'] =  total_length

result['11_matches'] = l1_matches
result['12_matches'] = l2_matches
result['13_matches'] = l3_matches
result['14_matches'] = l4_matches
result['model1_matches'] = model1_matches
result['model2_matches'] = model2_matches

result['l1_mismatches'] = l1_mismatches
result['l2_mismatches'] = l2_mismatches
result['l3_mismatches'] = l3_mismatches
result['l3_mismatches'] = l3_mismatches
result['model1_mismatches'] = model1_mismatches
result['model2_mismatches'] = model2_mismatches

In [None]:
result.sort_values(by='key')

Unnamed: 0,key,actual,11_matches,12_matches,13_matches,14_matches,model1_matches,model2_matches,l1_mismatches,l2_mismatches,l3_mismatches,model1_mismatches,model2_mismatches
1,b2b@it_decision_maker@oceania@eng,5,1,3,1,0,1,3,4,2,4,4,2
4,b2b@marketing_advertising@asia@eng,2,1,0,0,0,1,0,1,2,2,1,2
16,b2b@marketing_advertising@europe@ger,6,2,2,1,1,2,2,4,4,5,4,4
12,b2b@other@asia@jpn,1,0,0,0,0,0,0,1,1,1,1,1
9,b2b@other@europe@eng,6,1,4,3,0,1,4,5,2,3,5,2
13,b2b@other@south-america@por,1,1,1,0,0,1,1,0,0,1,0,0
18,b2b@technology@north-america@eng,1,1,0,0,0,1,0,0,1,1,0,1
15,consumer_study@entertainment@europe@ger,2,1,0,0,0,1,0,1,2,2,1,2
11,consumer_study@household@asia@eng,4,2,1,1,1,2,1,2,3,3,2,3
2,consumer_study@other@asia@eng,1,1,0,0,0,1,0,0,1,1,0,1


In [None]:
result.to_csv('test/output_comparision.csv', index=False)

In [60]:
test = test[['project_name', 'continent', 'languages', 'study_type', 'pred_supplier_ref']]
test.columns = ['project_name', 'continent', 'languages', 'study_type', 'model1_recommendations']
# test.drop(columns=['key'], inplace=True)

In [61]:
test.head()

Unnamed: 0,project_name,continent,languages,study_type,model1_recommendations
0,NA Airport W11,north-america,eng,consumer_study@travel,"[534, 537, 273, 500, 271, 565, 549, 359, 600, ..."
1,Bid 14776 Venue DMs 6.14.22,europe,eng,b2b@other,"[537, 534, 273, 500, 271, 590, 549, 601, 588, ..."
2,"Australia, M&F, 18-44 & US, M&F, 18-65, homeow...",oceania,eng,consumer_study@other,"[537, 72, 534, 273, 500, 271, 590, 549, 601, 574]"
3,30k+ Per month HHI,asia,eng,consumer_study@other,"[537, 273, 500, 271, 448, 601, 600, 558, 113, ..."
4,Construction and Trades people,europe,eng,b2b@other,"[442, 293, 301, 500, 273, 574, 588, 572, 524, ..."


In [81]:
recsys = pd.read_csv('test/standlaone_recsys (Autosaved).csv')
recsys.columns = ['project_name', 'continent', 'languages', 'study_type', 'model2_recommendations']


In [82]:
new = list(set(test.project_name.unique().tolist()).difference(set(recsys.project_name.unique().tolist())))

project = []
study_type = []
supplier_info = []
actual_supplier_info = []
pred1_supplier_info = []
pred2_supplier_info = []
total_layers = []

for j in IR_rec[IR_rec.projects__name.isin(new)].itertuples():
    j = j[1:]
    supp = j[-1].strip('}{').split(',')
    final1 = infer(j[-3], j[-2])
    # final2 = infer2(j[-3], j[-2])
    # final3 = infer3(j[-3], j[-2])

    project.append(j[0])
    study_type.append(j[-2])
    supplier_info.append(j[-3])
    # actual_supplier_info.append(j[-1])
    pred1_supplier_info.append(final1.supplier_ref.values.tolist())
    # pred2_supplier_info.append(final2.supplier_ref.values.tolist())
    # total_layers.append(final3)

In [83]:
# recsys[recsys.project_name.isin(test.project_name.values.tolist())]
new_test = pd.DataFrame()

new_test['project_name'] = pd.Series(project)
new_test['study_type'] = pd.Series(study_type)
new_test['supplier_info'] = pd.Series(supplier_info)
new_test['actual_supplier_info'] = pd.Series(actual_supplier_info)
new_test['pred_supplier_ref'] = pd.Series(pred1_supplier_info)
new_test['continent'] = new_test.supplier_info.apply(lambda x: x.split("@")[0])
new_test['languages'] = new_test.supplier_info.apply(lambda x: x.split("@")[1])
# test['pred2_supplier_info'] = pd.Series(pred2_supplier_info)

new_result = dict(new_test[['project_name', 'pred_supplier_ref']].values.tolist())

In [97]:
new_result_df = test[test.project_name.isin(new)]
new_result_df['model2_recommendations'] = new_result_df.project_name.apply(lambda x:new_result[x])

new_result_df = new_result_df[['project_name', 'continent', 'languages', 'study_type',
       'model2_recommendations']].reset_index(drop=True)

pd.concat([recsys, new_result_df]).reset_index(drop=True).to_csv('test/standalone_recsys.csv', index=False)


# test.merge(recsys, on='project_name', how='left').dropna()

# test = test[['project_name', 'continent_x', 'languages_x', 'study_type_x', 'model1_recommendations', 'model2_recommendations']]
# test.columns = ['project_name', 'continent', 'languages', 'study_type', 'model1_recommendations', 'model2_recommendations']

In [99]:
recsys = pd.read_csv('test/standalone_recsys.csv')
recsys.columns

Index(['project_name', 'continent', 'languages', 'study_type',
       'model2_recommendations'],
      dtype='object')

In [104]:
test = test[['project_name', 'continent', 'languages', 'study_type',
       'pred_supplier_ref']]

test.columns = ['project_name', 'continent', 'languages', 'study_type',
       'model1_recommendations']

test

Unnamed: 0,project_name,continent,languages,study_type,model1_recommendations
0,NA Airport W11,north-america,eng,consumer_study@travel,"[534, 537, 273, 500, 271, 565, 549, 359, 600, ..."
1,Bid 14776 Venue DMs 6.14.22,europe,eng,b2b@other,"[537, 534, 273, 500, 271, 590, 549, 601, 588, ..."
2,"Australia, M&F, 18-44 & US, M&F, 18-65, homeow...",oceania,eng,consumer_study@other,"[537, 72, 534, 273, 500, 271, 590, 549, 601, 574]"
3,30k+ Per month HHI,asia,eng,consumer_study@other,"[537, 273, 500, 271, 448, 601, 600, 558, 113, ..."
4,Construction and Trades people,europe,eng,b2b@other,"[442, 293, 301, 500, 273, 574, 588, 572, 524, ..."
5,"18+ Department: IT, Operations/production, Pu...",oceania,eng,b2b@it_decision_maker,"[442, 293, 301, 273, 271, 565, 500, 588, 574, ..."
6,Age 18-30 - Pernod Ricard Winemakers-PAT00009939,oceania,eng,consumer_study@other,"[537, 273, 271, 500, 565, 601, 600, 558, 448, ..."
7,Marketcast_Apple TV First Look Other # 9 DE,europe,ger,consumer_study@entertainment,"[537, 273, 271, 500, 565, 601, 600, 558, 448, ..."
8,Campaign Monitor- Spain post wave (P99272),europe,eng,consumer_study@other,"[537, 341, 273, 271, 500, 566, 588, 574, 524, ..."
9,parents of children,europe,eng,consumer_study@other,"[537, 72, 341, 271, 500, 590, 566, 601, 574, 524]"


In [108]:
final = recsys.merge(test, on='project_name', how='left').fillna('-')

In [111]:
final = final[['project_name', 'continent_x', 'languages_x', 'study_type_x', 'model1_recommendations', 'model2_recommendations']]

final.columns = ['project_name', 'continent', 'languages', 'study_type', 'model1_recommendations', 'model2_recommendations']

In [117]:
final.sort_values(by='continent').to_csv('test/final_validation.csv', index=False)