# Success Rate Evaluation
Evaluate success rate using weighted KNN (k-nearest neighbors)

## Sections
1. visualization of nearest neighbors
2. analogy using vector arithmetic: if mit is not in massachusetts but in california, which school would it be?
[mit] - [massachusetts] + [california] = ?
3. evaluation of category success rate using KNN with successful & unsuccessful categories frequency as weights

## Inputs
1. Word2Vec model
2. successful and unsuccessful company data

## Outputs
1. scatter plots
2. analogy answer
3. evaluation of entry success rate

v1.0: Liren SONG, Oxford, Dec 17 2021

In [1]:
from sklearn.decomposition import PCA
from gensim import models
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
import re
import string
import collections
import matplotlib.cm as cm
# plot with webagg
import matplotlib
matplotlib.use('WebAgg')
import matplotlib.pyplot as plt
plt.ion()

<matplotlib.pyplot._IonContext at 0x105b82790>

In [2]:
# load in model
word2vec_path = 'liren_model_better.bin'
model = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# perform simple test
model.most_similar('stanford')

[('harvard', 0.8769738078117371),
 ('cornell', 0.8724834322929382),
 ('cmu', 0.8669784069061279),
 ('yale', 0.8644719123840332),
 ('carnegie', 0.8591278791427612),
 ('iit', 0.8286830186843872),
 ('mit', 0.8273409008979797),
 ('tsinghua', 0.8253138661384583),
 ('purdue', 0.8185825943946838),
 ('ucsd', 0.8040269017219543)]

## Section 1: visualization of nearest neighbors
please note that all the plots are suppressed into two dimension from high dimension, visual clue can be misleading

In [3]:
# all the plotting functions
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    plt.close('all')
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


def similar_words_plot(keys):
    embedding_clusters = []
    word_clusters = []
    for word in keys:
        embeddings = []
        words = []
        if word in model:
            for similar_word, _ in model.most_similar(word, topn=30):
                words.append(similar_word)
                embeddings.append(model[similar_word])
            embedding_clusters.append(embeddings)
            word_clusters.append(words)

    embedding_clusters = np.array(embedding_clusters)
    n, m, k = embedding_clusters.shape
    tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
    embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

    tsne_plot_similar_words('Similar words plot', keys, embeddings_en_2d, word_clusters, 0.7)

def words_scatterplot(plot_model, input_words, label=True):
    plt.close('all')
    plt.figure(figsize=(6,6))
    word_vectors = np.array([model[w] for w in input_words if w in plot_model])
    twodim = PCA().fit_transform(word_vectors)[:,:2]
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    if label:
        for word, (x,y) in zip(input_words, twodim):
            plt.text(x+0.05, y+0.05, word)
    plt.show()

def successful_unsuccessful_plot(plot_model, words_successful, words_unsuccessful, label=False):
    plt.close('all')
    plt.figure(figsize=(6, 6))
    for words in words_successful, words_unsuccessful:
        word_vectors = np.array([plot_model[w] for w in words if w in plot_model])
        twodim = PCA().fit_transform(word_vectors)[:, :2]
        if words == words_successful:
            plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
            if label:
                for word, (x, y) in zip(words, twodim):
                    plt.text(x + 0.05, y + 0.05, f'{word}_successful')
        elif words == words_unsuccessful:
            plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='g')
            if label:
                for word, (x, y) in zip(words, twodim):
                    plt.text(x + 0.05, y + 0.05, f'{word}_unsuccessful')
    plt.show()


In [16]:
similar_words_plot(['stanford', 'hardware'])

*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.


In [5]:
words_scatterplot(model,['stanford', 'hardware', 'mit', 'software'])

## Section 2: analogy using vector arithmetic

In [7]:
def analogy(x1, x2, x3):
    """
    with the rule:
    [x1] + [x2] - [x3] = result
    example:
    [king] + [women] - [men] = [queen]
    """
    result = model.most_similar(positive=[x1, x2], negative=[x3])
    return result[0][0]

In [15]:
analogy('mit', 'california', 'massachusetts')

'cmu'

## Section 3: evaluation of category success rate using KNN

In [None]:
# load in successful and unsuccessful data
df_successful = pd.read_csv('Moneyball_Successful_Companies.csv')
df_unsuccessful = pd.read_csv('Moneyball_UnsuccessfulCompanies.csv')

In [None]:
def clean_sentences(text):
    """Make text lowercase, remove punctuation and remove words containing numbers."""
    text = re.sub(r'[^\w]', ' ', text)  # clear all things except underscore and alphanumeric
    text = re.sub(" \d+", " ", text)  # clear all digits
    text = text.lower()  # lower all text

    # replace the word 'and'
    # todo: this part can be further fine tuned
    patterns = ['and']
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

def filter_list(df):
    df = df[df['country_code'] == 'USA']
    category_list = clean_sentences(str(list(df['category_list']))).split()
    filtered_category_list = []
    for word in category_list:
        if len(word) > 1:
            filtered_category_list += [word]
    return filtered_category_list

def generate_frequency_list(df_one, df_two):
    idx = 1
    for df in df_one, df_two:
        filtered_category_list = filter_list(df)
        counter = collections.Counter(filtered_category_list)
        if idx == 1:
            frequency_list_one = pd.DataFrame(counter.most_common())
        else:
            frequency_list_two = pd.DataFrame(counter.most_common())
        idx += 1
    return frequency_list_one, frequency_list_two


In [None]:
# generate frequency(count) dataframe for both successful and unsuccessful categories
df_successful_frequency, df_unsuccessful_frequency = generate_frequency_list(df_successful, df_unsuccessful)

In [None]:
df_successful_frequency

In [None]:
df_unsuccessful_frequency

In [None]:
successful_unsuccessful_plot(model, list(df_successful_frequency[0]), list(df_unsuccessful_frequency[0]))

In [None]:
# initialize weight dataframe with keys
keys = list(model.index_to_key)
df_keys = pd.DataFrame({"keys": keys, 'successful_weights': 0, 'unsuccessful_weights': 0})
df_keys

In [None]:
# find frequency weights
def find_weights(keys_dataframe, df_frequency):
    key_index = []
    weight_list = []
    for i in range(0, len(df_frequency[0])):
        index = keys_dataframe.index[keys_dataframe['keys'] == df_frequency.iloc[i, 0]].tolist()
        key_index += index
        # sometimes, depends on the training set, the category might not be in the keys set
        # check if the list is empty
        if len(index) == 1:
            weight = [df_frequency.iloc[i, 1]]
            weight_list += weight
        elif len(index) == 0:
            print(f'{df_frequency.iloc[i, 0]} not in key set')
        else:
            print(f'{df_frequency.iloc[i, 0]} is repeated in key set')
    return key_index, weight_list

# found how many keys are in the key set
def key_in_set_ratio(key_index, df_frequency):
    ratio = len(key_index)/len(df_frequency[0])
    print(ratio)


In [None]:
successful_key_index, successful_weight_list = find_weights(df_keys, df_successful_frequency)
unsuccessful_key_index, unsuccessful_weight_list = find_weights(df_keys, df_unsuccessful_frequency)

In [None]:
# check key in set ratio
[key_in_set_ratio(successful_key_index, df_successful_frequency), key_in_set_ratio(unsuccessful_key_index, df_unsuccessful_frequency)]

In [None]:
# assign weights to keys dataframe
df_keys.iloc[successful_key_index, 1] = successful_weight_list
df_keys.iloc[unsuccessful_key_index, 2] = unsuccessful_weight_list
df_keys

In [None]:
def catagory_evaluation(word):
    """
    using knn, in this case k=(the number of successful & unsuccessful weights)
    the more positive the result, the more likely the success is
    you can enter any words, but the closer the word to the data you train, the most accurate the result
    """
    try:
        value_list = list(model.most_similar(word, topn=None))
        # score is given by the relative closeness to successful minus closeness to unsuccessful
        score = (sum(value_list * df_keys.successful_weights) / sum(df_keys.successful_weights)
                 - sum(value_list * df_keys.unsuccessful_weights) / sum(df_keys.unsuccessful_weights))
    except:
        # if the entry is not in the key set, return 0
        score = 0
        print(f'{word} not in key set')
    return score

In [None]:
[catagory_evaluation('saas'), catagory_evaluation('satellite'),catagory_evaluation('cow')]

In [None]:
[catagory_evaluation('mit'), catagory_evaluation('stanford'), catagory_evaluation('caltech')]

In [None]:
[catagory_evaluation('california'),catagory_evaluation('hawaii'), catagory_evaluation('newyork')]

In [None]:
def top_n_average_calculation(df_frequency, n):
    rate_list = []
    for catagory in df_frequency[0][0:n]:
        rate = catagory_evaluation(catagory)
        rate_list += [rate]
    average = sum(rate_list) / len(rate_list)
    return average

def fact_check(df_frequency, top_entry_number):
    """check if the mean score of successful is bigger than that of unsuccessful"""
    average = top_n_average_calculation(df_frequency, top_entry_number)
    print(f"top {top_entry_number} category mean score: {average}")


In [None]:
fact_check(df_successful_frequency, 15)

In [None]:
fact_check(df_unsuccessful_frequency, 15)