In [1]:
#import relevant packages
import pandas as pd
import numpy as np
import pickle as pk
import pycountry_convert as pc

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

pd.options.display.max_colwidth = 500

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\junaid.shaikh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#INFERENCE parameters 
req_country = 'aus'
req_lang = 'eng'
req_study_type = 'B2B'
req_subject = 'other'

req_continent = pc.convert_continent_code_to_continent_name(pc.country_alpha2_to_continent_code(pc.country_name_to_country_alpha2(req_country.upper())))

In [3]:
#Data for matching result index
DATAPATH = 'dfs/final_transformed_data.csv' 

df_original = pd.read_csv(DATAPATH)
#df = df[~df.overview.isna()]
print(len(df_original))
print(df_original.info())
print(df_original.head(5))

13344
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13344 entries, 0 to 13343
Data columns (total 86 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   supplier_sample_pulls__created_at               13344 non-null  int64  
 1   supplier_sample_pulls__sampling_method          13344 non-null  object 
 2   sample_pulls__target_groups                     13344 non-null  object 
 3   sample_pulls__country                           13344 non-null  object 
 4   sample_pulls__language                          13344 non-null  object 
 5   sample_pulls__cost_per_interview                13344 non-null  float64
 6   sample_pulls__incident_rate                     13344 non-null  float64
 7   sample_pulls__length_of_interview               13344 non-null  int64  
 8   sample_pulls__completes_needed                  13344 non-null  int64  
 9   sample_pulls__start_date         

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
#filter wise list of sample projects to refer for requirement parameters for test inference.
df = df_original.query("sample_pulls__start_date in ('2022-06-20','2022-06-21','2022-06-22')")
df = df.query("sample_pulls__country != 'usa'")
df = df.query("projects__study_types_ids in ('B2B','Consumer Study')")
df_inference_project = df.drop_duplicates(subset=['projects__id'])
df_inference_project.info()
df_inference_project[['sample_pulls__start_date','projects__name','sample_pulls__country','continent','sample_pulls__language','projects__study_types_ids','projects__study_types_subject_ids','sentence_1','sentence_2']].to_csv('dfs/unique_project.csv', index=False)

In [4]:
#STOPWORDS = set(stopwords.words('english'))
MIN_WORDS = 4
MAX_WORDS = 200

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=None, lemmatize=False):
    """
    Lemmatize, tokenize, crop and remove stop words.
    """
    
    tokens = [w for w in word_tokenize(sentence)]
    print(tokens)
    '''
    token = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                                        and w not in stopwords)]
    '''
    return tokens

def extract_best_indices(m, topk, mask=None):
    """
    Use sum of the cosine distance over all tokens.
    m (np.array): cos matrix of shape (nb_in_tokens, nb_dict_tokens)
    topk (int): number of indices to return (from high to lowest in order)
    """
    # return the sum on all tokens of cosinus for each sentence
    if len(m.shape) > 1:
        cos_sim = np.mean(m, axis=0) 
    else: 
        cos_sim = m
    index = np.argsort(cos_sim)[::-1] # from highest idx to smallest score 
    if mask is not None:
        assert mask.shape == m.shape
        mask = mask[index]
    else:
        mask = np.ones(len(cos_sim))
    mask = np.logical_or(cos_sim[index] != 0, mask) #eliminate 0 cosine distance
    best_index = index[mask][:topk]  
    return best_index

def get_recommendations_tfidf(vectorizer, sentence, tfidf_mat):
    
    """
    Return the database sentences in order of highest cosine similarity relatively to each 
    token of the target sentence. 
    """
    # Embed the query sentence
    tokens = [str(tok) for tok in tokenizer(sentence)]
    vec = vectorizer.transform(tokens)
    # Create list with similarity between query and dataset
    mat = cosine_similarity(vec, tfidf_mat)
    # Best cosine distance for each token independantly
    print(mat.shape)
    best_index = extract_best_indices(mat, topk=20)
    return best_index

#Load saved model tfidf embedding
def load_tfidf_embedding(sentence):
    with open(f'model/tfidf_model_{sentence}.pkl', 'rb') as fn:
        model = pk.load(fn)
    return model

#Load saved model tfidf vectorizer
def load_tfidf_vectorizer(vectorizer):
    with open(f'model/tfidf_{vectorizer}.pkl', 'rb') as fn:
        model = pk.load(fn)
    return model

In [5]:
tfidf_embedding_1 = load_tfidf_embedding('sentence_1')
tfidf_vectorizer_1 = load_tfidf_vectorizer('vectorizer_1')
tfidf_embedding_2 = load_tfidf_embedding('sentence_2')
tfidf_vectorizer_2 = load_tfidf_vectorizer('vectorizer_2')

In [None]:
#Inference data transformation methods

def map_continent(df_infer):

    print(f'mapping country to continent...')
    df_infer['continent'] = df_infer.apply(lambda x: pc.convert_continent_code_to_continent_name(pc.country_alpha2_to_continent_code(pc.country_name_to_country_alpha2(x['sample_pulls__country'].upper()))), axis=1)

def combine_lang_study(df_infer):

    print(f'combining language and study columns...')
    df_infer['language_study_combine'] = df_infer['sample_pulls__language'].map(str) + ' ' + df_infer['projects__study_types_ids'].map(str)

def combine_continent_lang_study(df_infer):

    print(f'combining continent-language-study columns...')
    df_infer['continent_language_study_combine'] = df_infer['continent'].map(str) + ' ' + df_infer['sample_pulls__language'].map(str) + ' ' + df_infer['projects__study_types_ids'].map(str)


In [9]:
#Calculate match-prediction for sentence_1, for given req parameters
query_sentence_1 = req_continent + ' ' + req_lang + ' ' + req_study_type

best_tfidf_index_1 = get_recommendations_tfidf(tfidf_vectorizer_1, query_sentence_1, tfidf_embedding_1)
matched_df_1 = df_original[['rl','projects__name','sample_pulls__country','continent','sample_pulls__language','projects__study_types_ids','projects__study_types_subject_ids']].iloc[best_tfidf_index_1]
display(matched_df_1)

['Oceania', 'eng', 'B2B']
['oceania']
['eng']
['b2b']
(3, 13344)


Unnamed: 0,rl,projects__name,sample_pulls__country,continent,sample_pulls__language,projects__study_types_ids,projects__study_types_subject_ids
264,591,PR-148511.1_ISV Decision Makers,nzl,Oceania,eng,B2B,it_decision_maker
4141,601,SBOs and FDMs with the company size of 200 EE or less,nzl,Oceania,eng,B2B,banking_financial
4139,271,SBOs and FDMs with the company size of 200 EE or less,nzl,Oceania,eng,B2B,banking_financial
9226,271,Decision Makers in Australia,aus,Oceania,eng,B2B,other
9225,537,Decision Makers in Australia,aus,Oceania,eng,B2B,other
9224,600,Decision Makers in Australia,aus,Oceania,eng,B2B,other
9223,113,Decision Makers in Australia,aus,Oceania,eng,B2B,other
9222,528,Decision Makers in Australia,aus,Oceania,eng,B2B,other
9221,588,Decision Makers in Australia,aus,Oceania,eng,B2B,other
9220,72,Decision Makers in Australia,aus,Oceania,eng,B2B,other


In [11]:
#Calculate match-prediction for sentence_2, for given req parameters
query_sentence_2 = "{'AGE': ['30-45']}"

best_tfidf_index_2 = get_recommendations_tfidf(tfidf_vectorizer_2, query_sentence_2, tfidf_embedding_2)
matched_df_2 = df_original[['rl','projects__name','sample_pulls__country','continent','sample_pulls__language','projects__study_types_ids','projects__study_types_subject_ids','projects__target_groups_qualifications_combine']].iloc[best_tfidf_index_2]
display(matched_df_2)

['{', "'AGE", "'", ':', '[', "'30-45", "'", ']', '}']
['{']
["'age"]
["'"]
[':']
['[']
["'30-45"]
["'"]
[']']
['}']
(9, 13344)


Unnamed: 0,rl,projects__name,sample_pulls__country,continent,sample_pulls__language,projects__study_types_ids,projects__study_types_subject_ids,projects__target_groups_qualifications_combine
0,601,Type 1 Diabetes Patients across UK,gbr,Europe,eng,Healthcare,patient,{'AGE': ['18-99']}
2698,271,Workspace Migrate,jpn,Asia,eng,B2B,it_decision_maker,{'AGE': ['18-99']}
8216,271,T220041 US Cellular SBO,usa,North America,eng,B2B,operations,{'AGE': ['18-99']}
8217,271,T220041 US Cellular SBO,usa,North America,eng,B2B,technology,{'AGE': ['18-99']}
8218,537,T220041 US Cellular SBO,usa,North America,eng,B2B,operations,{'AGE': ['18-99']}
8219,537,T220041 US Cellular SBO,usa,North America,eng,B2B,technology,{'AGE': ['18-99']}
8220,537,"Survey for kids, teens and adults (2022.D1.051322)",usa,North America,eng,Consumer Study,household,{'AGE': ['18-99']}
8221,271,"Survey for kids, teens and adults (2022.D1.051322)",usa,North America,eng,Consumer Study,household,{'AGE': ['18-99']}
8228,524,DMs in Shipping Sector,nor,Europe,eng,B2B,other,{'AGE': ['18-99']}
8229,592,DMs in Shipping Sector,nor,Europe,eng,B2B,other,{'AGE': ['18-99']}


In [None]:
#combining 1 & 2 list and dropping duplicate supplier rl values.
rl_list = []
rl_list = matched_df_1['rl'].tolist()
rl_list.extend(matched_df_2['rl'].tolist())
rl_list_unique = list(set(rl_list))

In [None]:
#prepare parameters to be sent to next recommendation pipeline.
request_param_dict = {}
request_param_dict['supplier_rl'] = rl_list_unique
request_param_dict['req_study_type'] = req_study_type
request_param_dict['req_subject'] = req_subject
request_param_dict['req_lang'] = req_lang
request_param_dict['req_country'] = req_country
request_param_dict['req_continent'] = req_continent

In [None]:
print(rl_list)
print(rl_list_unique)
print(request_param_dict)

In [None]:
#Calculate match-prediction for sentence_2
#query_sentence_2 = "{'GENDER': ['Female'], 'AGE': ['30-40']}"
query_sentence_2 = "{'AGE': ['18-34,45-65']}"
#query_sentence_2 = "{'AGE': ['25-65']}"
#query_sentence_2 = "{'GENDER': ['Female', 'Male'], 'AGE': ['18-54']}"
#query_sentence_2 = "{'GENDER': ['Female', 'Male'], 'AGE': ['18-30']}"
#query_sentence_2 = "STANDARD_COMPANY_DEPARTMENT': ['Technology Implementation', 'Finance\\/Accounting', 'Operations', 'Procurement'], 'STANDARD_EMPLOYMENT': ['Self-employed full-time', 'Employed part-time', 'Employed full-time'], 'GENDER': ['Female', 'Male'], 'AGE': ['18-99']}"
#query_sentence_2 = "{'STANDARD_INDUSTRY_PERSONAL': ['Carpenting\\/Electrical installations\\/VVS', 'Construction'], 'STANDARD_EMPLOYMENT': ['Self-employed part-time', 'Self-employed full-time', 'Employed part-time', 'Employed full-time'], 'GENDER': ['Female', 'Male'], 'AGE': ['18-70']}"
#query_sentence_2 = "{'STANDARD_HHI': ['$200,000 to $249,999', '$150,000 to $174,999', '$55,000 to $59,999', '$125,000 to $149,999', '$250,000 and above', '$100,000 to $124,999', '$95,000 to $99,999', '$70,000 to $74,999', '$85,000 to $89,999', '$80,000 to $84,999', '$65,000 to $69,999', '$40,000 to $44,999', '$45,000 to $49,999', '$75,000 to $79,999', '$90,000 to $94,999', '$50,000 to $54,999', '$175,000 to $199,999', '$35,000 to $39,999', '$30,000 to $34,999', '$60,000 to $64,999'], 'AGE': ['18-99']}"
#query_sentence_2 = "{'DMA': ['GREENVILLE-SPARTA-ASHEVILLE | 567', 'PARKERSBURG | 597', 'COLUMBIA, SC | 546', 'CHARLOTTESVILLE | 584', 'GREENWOOD-GREENVILLE | 647', 'PHOENIX-PRESCOTT | 753'], 'GENDER': ['Female', 'Male'], 'AGE': ['18-65']}"
#query_sentence_2 = "{'STANDARD_B2B_DECISION_MAKER': ['Other', 'Shipping', 'IT Software', 'Sales', 'Legal services', 'Marketing\\/Advertising', 'Office supplies', 'Auto leasing\\/purchasing', 'Food services', 'Corporate travel', 'Human Resources', 'Financial Department', 'IT Hardware', 'Telecommunications', 'Security', 'Operations', 'Printers and copiers'], 'STANDARD_JOB_TITLE': ['Manager (Group Manager, Sr. Manager, Manager, Program Manager)', 'Director (Group Director, Sr. Director, Director)', 'C-Level (e.g. CEO, CFO), Owner, Partner, President', 'Vice President (EVP, SVP, AVP, VP)'], 'STANDARD_EMPLOYMENT': ['Self-employed full-time', 'Employed part-time', 'Employed full-time'], 'GENDER': ['Female', 'Male'], 'AGE': ['25-99']}"
#query_sentence_2 = "{'STANDARD_PRIMARY_DECISION_MAKER': ['Yes', 'Share decisions equally'], 'AGE': ['18-99']}"
best_tfdif_index_2 = get_recommendations_tfidf(tfidf_vectorizer_2, query_sentence_2, tfidf_embedding_2)
df_suggestion_list = df_original[['rl','sentence_2']].iloc[best_tfdif_index_2]
df_suggestion_list = df_suggestion_list.drop_duplicates(subset=['rl'])
df_suggest_final = df_suggestion_list[['rl']]
df_suggest_final['projects__name'] = 'parents of children'
df_suggest_final['req_study_type'] = 'Consumer Study'
df_suggest_final['req_subject'] = 'other'
df_suggest_final['req_lang'] = 'eng'
df_suggest_final['req_country'] = 'gbr'
df_suggest_final['req_continent'] = 'Europe'
df_suggest_final.to_csv('dfs/suggested_suppliers.csv', index=False)
