# Success Rate Evaluation
Evaluate success rate using weighted KNN (k-nearest neighbors)

## Sections
1. visualization of nearest neighbors
2. analogy using vector arithmetic: if mit is not in massachusetts but in california, which school would it be?
[mit] - [massachusetts] + [california] = ?
3. evaluation of category success rate using KNN with successful & unsuccessful categories frequency as weights

## Inputs
1. Word2Vec model
2. successful and unsuccessful company data

## Outputs
1. scatter plots
2. analogy answer
3. evaluation of entry success rate

v1.0: Liren SONG, Oxford, Dec 17 2021

In [1]:
from sklearn.decomposition import PCA
from gensim import models
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
import re
import string
import collections
import matplotlib.cm as cm
# plot with webagg
import matplotlib
matplotlib.use('WebAgg')
import matplotlib.pyplot as plt
plt.ion()

<matplotlib.pyplot._IonContext at 0x11ffd0d00>

In [2]:
# load in model
word2vec_path = 'liren_model_better.bin'
model = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# perform simple test
model.most_similar('hawaii')

[('honolulu', 0.8657945394515991),
 ('oahu', 0.7792679071426392),
 ('florida', 0.7689505815505981),
 ('maui', 0.7519458532333374),
 ('anchorage', 0.749379575252533),
 ('miami', 0.7459452748298645),
 ('detroit', 0.7450781464576721),
 ('maine', 0.7429133653640747),
 ('alaska', 0.7380715608596802),
 ('southwest', 0.7372687458992004)]

## Section 1: visualization of nearest neighbors
please note that all the plots are suppressed into two dimension from high dimension, visual clue can be misleading

In [3]:
# all the plotting functions
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    plt.close('all')
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=color, alpha=a, label=label, linewidths=5)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=1, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=15)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


def similar_words_plot(keys):
    embedding_clusters = []
    word_clusters = []
    for word in keys:
        embeddings = []
        words = []
        if word in model:
            for similar_word, _ in model.most_similar(word, topn=15):
                words.append(similar_word)
                embeddings.append(model[similar_word])
            embedding_clusters.append(embeddings)
            word_clusters.append(words)

    embedding_clusters = np.array(embedding_clusters)
    n, m, k = embedding_clusters.shape
    tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
    embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

    tsne_plot_similar_words('Similar words plot', keys, embeddings_en_2d, word_clusters, 0.7)

def words_scatterplot(plot_model, input_words, label=True):
    plt.close('all')
    plt.figure(figsize=(6,6))
    word_vectors = np.array([model[w] for w in input_words if w in plot_model])
    twodim = PCA().fit_transform(word_vectors)[:,:2]
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    if label:
        for word, (x,y) in zip(input_words, twodim):
            plt.text(x+0.05, y+0.05, word)
    plt.show()

def successful_unsuccessful_plot(plot_model, words_successful, words_unsuccessful, label=False):
    plt.close('all')
    plt.figure(figsize=(6, 6))
    for words in words_successful, words_unsuccessful:
        word_vectors = np.array([plot_model[w] for w in words if w in plot_model])
        twodim = PCA().fit_transform(word_vectors)[:, :2]
        if words == words_successful:
            plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
            if label:
                for word, (x, y) in zip(words, twodim):
                    plt.text(x + 0.05, y + 0.05, f'{word}_successful')
        elif words == words_unsuccessful:
            plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='g')
            if label:
                for word, (x, y) in zip(words, twodim):
                    plt.text(x + 0.05, y + 0.05, f'{word}_unsuccessful')
    plt.show()


In [23]:
similar_words_plot(['stanford', 'cow', 'tesla', 'uber'])

*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or R

In [None]:
words_scatterplot(model,['stanford', 'hardware', 'mit', 'software'])

## Section 2: analogy using vector arithmetic

In [4]:
def analogy(x1, x2, x3):
    """
    with the rule:
    [x1] + [x2] - [x3] = result
    example:
    [king] + [women] - [men] = [queen]
    """
    result = model.most_similar(positive=[x1, x2], negative=[x3])
    return result[0][0]

In [11]:
analogy('mit', 'california', 'massachusetts')

'cmu'

## Section 3: evaluation of category success rate using KNN

In [5]:
# load in successful and unsuccessful data
df_successful = pd.read_csv('Moneyball_Successful_Companies.csv')
df_unsuccessful = pd.read_csv('Moneyball_UnsuccessfulCompanies.csv')

In [6]:
def clean_sentences(text):
    """Make text lowercase, remove punctuation and remove words containing numbers."""
    text = re.sub(r'[^\w]', ' ', text)  # clear all things except underscore and alphanumeric
    text = re.sub(" \d+", " ", text)  # clear all digits
    text = text.lower()  # lower all text

    # replace the word 'and'
    # todo: this part can be further fine tuned
    patterns = ['and']
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

def filter_list(df):
    df = df[df['country_code'] == 'USA']
    category_list = clean_sentences(str(list(df['category_list']))).split()
    filtered_category_list = []
    for word in category_list:
        if len(word) > 1:
            filtered_category_list += [word]
    return filtered_category_list

def generate_frequency_list(df_one, df_two):
    idx = 1
    for df in df_one, df_two:
        filtered_category_list = filter_list(df)
        counter = collections.Counter(filtered_category_list)
        if idx == 1:
            frequency_list_one = pd.DataFrame(counter.most_common())
        else:
            frequency_list_two = pd.DataFrame(counter.most_common())
        idx += 1
    return frequency_list_one, frequency_list_two


In [7]:
# generate frequency(count) dataframe for both successful and unsuccessful categories
df_successful_frequency, df_unsuccessful_frequency = generate_frequency_list(df_successful, df_unsuccessful)

In [10]:
df_successful_frequency["frequency"] = df_successful_frequency.iloc[:, 1]/sum(df_successful_frequency.iloc[:, 1])

In [11]:
df_successful_frequency

Unnamed: 0,0,1,frequency
0,software,519,0.102185
1,enterprise,205,0.040362
2,saas,165,0.032487
3,cloud,142,0.027958
4,data,141,0.027761
...,...,...,...
376,cycling,1,0.000197
377,garden,1,0.000197
378,improvement,1,0.000197
379,fleet,1,0.000197


In [12]:
df_unsuccessful_frequency["frequency"] = df_unsuccessful_frequency.iloc[:, 1]/sum(df_unsuccessful_frequency.iloc[:, 1])

In [13]:
df_unsuccessful_frequency

Unnamed: 0,0,1,frequency
0,software,1152,0.108322
1,mobile,430,0.040433
2,enterprise,317,0.029807
3,apps,284,0.026704
4,technology,284,0.026704
...,...,...,...
518,pet,1,0.000094
519,veterinary,1,0.000094
520,residential,1,0.000094
521,drone,1,0.000094


In [10]:
successful_unsuccessful_plot(model, list(df_successful_frequency[0]), list(df_unsuccessful_frequency[0]))

Press Ctrl+C to stop WebAgg server


RuntimeError: This event loop is already running

In [11]:
# initialize weight dataframe with keys
keys = list(model.index_to_key)
df_keys = pd.DataFrame({"keys": keys, 'successful_weights': 0, 'unsuccessful_weights': 0})
df_keys

Unnamed: 0,keys,successful_weights,unsuccessful_weights
0,and,0,0
1,the,0,0
2,to,0,0
3,of,0,0
4,a,0,0
...,...,...,...
61975,subvert,0,0
61976,homeshare,0,0
61977,sharpshooter,0,0
61978,overabundance,0,0


In [12]:
# find frequency weights
def find_weights(keys_dataframe, df_frequency):
    key_index = []
    weight_list = []
    for i in range(0, len(df_frequency[0])):
        index = keys_dataframe.index[keys_dataframe['keys'] == df_frequency.iloc[i, 0]].tolist()
        key_index += index
        # sometimes, depends on the training set, the category might not be in the keys set
        # check if the list is empty
        if len(index) == 1:
            weight = [df_frequency.iloc[i, 1]]
            weight_list += weight
        elif len(index) == 0:
            print(f'{df_frequency.iloc[i, 0]} not in key set')
        else:
            print(f'{df_frequency.iloc[i, 0]} is repeated in key set')
    return key_index, weight_list

# found how many keys are in the key set
def key_in_set_ratio(key_index, df_frequency):
    ratio = len(key_index)/len(df_frequency[0])
    print(ratio)


In [13]:
successful_key_index, successful_weight_list = find_weights(df_keys, df_successful_frequency)
unsuccessful_key_index, unsuccessful_weight_list = find_weights(df_keys, df_unsuccessful_frequency)

computing not in key set
computer not in key set
commerce not in key set
roid not in key set
telecommunications not in key set
communications not in key set
commercial not in key set
compliance not in key set
communication not in key set
comparison not in key set
commerce not in key set
computing not in key set
computer not in key set
roid not in key set
telecommunications not in key set
communities not in key set
communications not in key set
compliance not in key set
commercial not in key set
comparison not in key set
communication not in key set
homel not in key set
accommodations not in key set
hmade not in key set
lscaping not in key set


In [14]:
# check key in set ratio
[key_in_set_ratio(successful_key_index, df_successful_frequency), key_in_set_ratio(unsuccessful_key_index, df_unsuccessful_frequency)]

0.973753280839895
0.97131931166348


[None, None]

In [15]:
# assign weights to keys dataframe
df_keys.iloc[successful_key_index, 1] = successful_weight_list
df_keys.iloc[unsuccessful_key_index, 2] = unsuccessful_weight_list
df_keys["success_frequency"] = df_keys.successful_weights/sum(df_keys.successful_weights)
df_keys["unsuccess_frequency"] = df_keys.unsuccessful_weights/sum(df_keys.unsuccessful_weights)
df_keys

Unnamed: 0,keys,successful_weights,unsuccessful_weights,success_frequency,unsuccess_frequency
0,and,0,0,0.000000,0.000000
1,the,0,0,0.000000,0.000000
2,to,3,0,0.000610,0.000000
3,of,20,54,0.004069,0.005274
4,a,0,0,0.000000,0.000000
...,...,...,...,...,...
61975,subvert,0,0,0.000000,0.000000
61976,homeshare,0,0,0.000000,0.000000
61977,sharpshooter,0,0,0.000000,0.000000
61978,overabundance,0,0,0.000000,0.000000


In [16]:
success_sum = df_keys.successful_weights.sum()
success_sum
unsuccess_sum = df_keys.unsuccessful_weights.sum()
unsuccess_sum

10238

In [17]:
def catagory_evaluation(word):
    """
    using knn, in this case k=(the number of successful & unsuccessful weights)
    the more positive the result, the more likely the success is
    you can enter any words, but the closer the word to the data you train, the most accurate the result
    """
    try:
        value_list = model.most_similar(word, topn=None)
        # print(value_list)
        # score is given by the relative closeness to successful minus closeness to unsuccessful
        score = sum(value_list*(df_keys.success_frequency-df_keys.unsuccess_frequency))
    except:
        # if the entry is not in the key set, return 0
        score = 0
        print(f'{word} not in key set')
    return score

In [18]:
df_evalution = pd.DataFrame({
    "keys": df_keys["keys"]
})

In [19]:
# evaluation = []
# for key in df_evalution["keys"]:
#     evaluation += [catagory_evaluation(key)]

In [20]:
# df_evalution["evaluation"] = evaluation

In [21]:
# df_evalution

Unnamed: 0,keys,evaluation
0,and,-0.001323
1,the,-0.002725
2,to,0.002387
3,of,-0.001911
4,a,-0.007624
...,...,...
61975,subvert,-0.005514
61976,homeshare,-0.002248
61977,sharpshooter,0.018649
61978,overabundance,0.023037


In [45]:
# df_evalution.to_csv("df_evaluation.csv", index=False)

In [None]:
df_evalution = pd.read_csv("df_evaluation.csv")

In [29]:
evaluation_sorted = df_evalution.sort_values(by=["evaluation"], ascending=False).reset_index(drop=True)

In [30]:
evaluation_sorted

Unnamed: 0,keys,evaluation
0,cip,0.048574
1,operational,0.046773
2,quantum_ai,0.046374
3,cybersecurity,0.045827
4,grc,0.045162
...,...,...
61975,movie,-0.056665
61976,storybooks,-0.056700
61977,avn,-0.057348
61978,music,-0.057852


In [44]:
[catagory_evaluation('restaurant'), catagory_evaluation('casino'), catagory_evaluation('robotics'),catagory_evaluation('lidar'), catagory_evaluation("california"), catagory_evaluation("self"), catagory_evaluation("driving"), catagory_evaluation("autonomous")]

[-0.01026026690235683,
 -0.02260417298816972,
 0.02016395619139684,
 0.014828108225301381,
 0.007638495348504759,
 0.012965968718111063,
 0.0028437564015328593,
 0.033053576657560015]

In [222]:
sum([catagory_evaluation('nyu'), catagory_evaluation('hawaii'),catagory_evaluation('cow')])

-0.01943003850796389

In [223]:
[catagory_evaluation('stanford'),catagory_evaluation('california'), catagory_evaluation('quantum')]

[0.001420880246299806, 0.007638495348504759, 0.03203302220546852]

In [224]:
sum([catagory_evaluation('stanford'),catagory_evaluation('california'), catagory_evaluation('quantum')])

0.04109239780027309

In [26]:
all_category_list = list(set(df_successful_frequency[0]).union(set(df_unsuccessful_frequency[0])))

In [27]:
all_category_list

['wireless',
 'location',
 'big',
 'biotechnology',
 'distribution',
 'apis',
 'language',
 'casual',
 'baby',
 'machine',
 'guides',
 'veterinary',
 'battery',
 'sales',
 'news',
 'market',
 'public',
 'auctions',
 'saas',
 'cannabis',
 'accounting',
 'measurement',
 'peer',
 'esports',
 'broadcasting',
 'funds',
 'roid',
 'capital',
 'communication',
 'it',
 'benefits',
 'machinery',
 'search',
 'dental',
 'ehr',
 'games',
 'fuel',
 'erp',
 'retail',
 'risk',
 'blockchain',
 'agtech',
 'wholesale',
 'identity',
 'diagnostics',
 'ride',
 'privacy',
 'mmo',
 'higher',
 'document',
 'entertainment',
 'flash',
 'graphic',
 'registrar',
 'auto',
 'green',
 'event',
 'procurement',
 'outsourcing',
 'hosting',
 'toys',
 'plastics',
 'presentation',
 'pharmaceutical',
 'development',
 'primary',
 'furniture',
 'personal',
 'asset',
 'level',
 'craft',
 'hotel',
 'of',
 'creative',
 'augmented',
 'farming',
 'console',
 'wellness',
 'software',
 'advanced',
 'currency',
 'gift',
 'on',
 'base

In [31]:
all_score_list = []
for key in all_category_list:
    try:
        all_score_list += [float(evaluation_sorted[evaluation_sorted["keys"]==key]["evaluation"])]
    except:
        all_score_list += [0]
        print(f'{key} not in key set')



roid not in key set
communication not in key set
communities not in key set
communications not in key set
lscaping not in key set
comparison not in key set
hmade not in key set
telecommunications not in key set
computer not in key set
compliance not in key set
accommodations not in key set
commerce not in key set
commercial not in key set
homel not in key set
computing not in key set


In [32]:
all_score_list

[0.007721769959419898,
 0.0010528173198953968,
 0.006292957347824427,
 0.00641111285430538,
 -0.009576994246236798,
 0.012504154841549023,
 -0.006652288945519947,
 -0.04164769116005356,
 -0.018255121452753587,
 0.02555270917095597,
 -0.023746533481577677,
 0.007957409388194754,
 0.01363244361415712,
 -0.010233407078634908,
 -0.029160778020045918,
 0.005589017766664015,
 0.00500536202060606,
 -0.023178511027342554,
 0.018497293902657853,
 0.004116784514782831,
 0.03278968772890906,
 0.016617523222675748,
 -0.0003213493658301582,
 -0.03649100181628036,
 -0.03929979439665975,
 0.013139591592911995,
 0,
 0.02368777479428534,
 0,
 0.0012638066610177149,
 0.015427174235192713,
 0.019124277012386275,
 -0.028572976438055304,
 0.005786313436830734,
 0.019777410917648553,
 -0.04914141711026971,
 0.0169913330964469,
 0.030289416293253164,
 -0.002690797181646145,
 0.03616593893641917,
 0.021858360494559588,
 0.015301289476278145,
 -0.001998045996291723,
 0.0030673660855742814,
 0.01536488344580937

In [33]:
all_category_score = pd.DataFrame({
    "category" : all_category_list,
    "score": all_score_list
})

In [35]:
all_category_sorted = all_category_score.sort_values(by=["score"], ascending=False).reset_index(drop=True)

In [36]:
all_category_sorted

Unnamed: 0,category,score
0,cyber,0.039050
1,infrastructure,0.038430
2,security,0.038391
3,virtualization,0.036239
4,risk,0.036166
...,...,...
553,podcast,-0.050552
554,classifieds,-0.052490
555,celebrity,-0.052540
556,tv,-0.053900


In [225]:
def top_n_average_calculation(df_frequency, n_start,  n_end):
    rate_list = []
    for catagory in df_frequency[0][n_start:n_end]:
        rate = catagory_evaluation(catagory)
        rate_list += [rate]
    average = sum(rate_list) / len(rate_list)
    print(rate_list)
    return average

def fact_check(df_frequency, entry_number_start, entry_number_end):
    """check if the mean score of successful is bigger than that of unsuccessful"""
    average = top_n_average_calculation(df_frequency, entry_number_start,  entry_number_end)
    print(f"top {entry_number_start} to top {entry_number_end}  category mean score: {average}")


In [228]:
fact_check(df_successful_frequency, 30, 40)

computer not in key set
commerce not in key set
[0.039050185923267325, 0.02016395619139684, 0, -0.002524729037488264, -0.00022314029811629368, 0, -0.042734279784211474, 0.012538207332085693, -0.023014147457046566, 0.021282276125108333]
top 30 to top 40  category mean score: 0.0024538328994995594


In [229]:
fact_check(df_unsuccessful_frequency, 30, 40)

[0.021282276125108333, 0.02555270917095597, -0.023014147457046566, -0.02848568383780327, -0.022432221058642843, -0.008495065566788418, 0.029901948873985635, 0.009152329915628473, 0.011955514276397742, -0.0019110766531534658]
top 30 to top 40  category mean score: 0.0013506583788641595
