In [1]:
import numpy as np
import pandas as pd

# VARS

In [2]:
# Seed NumPy's global random generator first so later random draws are repeatable.
np.random.seed(0)

# Dataset location and column roles.
PATH_TO_DATA_FILE = "high_diamond_ranked_10min.csv"  # CSV file holding the data
CLASS_COLNAME = "blueWins"  # name of the class (label) column
OMIT_FEATURES = ["gameId"]  # columns to leave out of the feature list

# Train/test split.
TRAIN_SIZE = 0.9  # ratio of training rows to all rows

# FUNS

In [3]:
# GLOBAL FUNS

def likelihood(value, mu, sigma):
    """Evaluate the normal (Gaussian) probability density at ``value``.

    Formula image: https://wikimedia.org/api/rest_v1/media/math/render/svg/c9167a4f19898b676d4d1831530a8ff1246d33ab

    Args:
        value (float): Point at which the density is evaluated.
        mu (float): Mean of the distribution.
        sigma (float): Standard deviation of the distribution.

    Returns:
        float: Density of N(mu, sigma) at ``value``.
    """
    # Normalising constant 1 / sqrt(2 * pi * sigma^2).
    normaliser = 1 / np.sqrt(2 * np.pi * sigma ** 2)
    # Distance from the mean measured in standard deviations.
    z_score = (value - mu) / sigma
    return normaliser * np.exp(-0.5 * z_score ** 2)


def validate_result(guess, index, test, classname="blueWins"):
    """Check a prediction against the true label stored in ``test``.

    Args:
        guess: Predicted class label.
        index: Index label of the row in ``test`` to compare against.
        test (pd.DataFrame): Data containing the true labels.
        classname (str): Name of the label column. Defaults to "blueWins",
            the column the original implementation hard-coded.

    Returns:
        int: 1 if ``guess`` equals the stored label, otherwise 0.
    """
    # .at performs a single label-based scalar lookup instead of chained
    # indexing (test[col][index]).
    return int(guess == test.at[index, classname])

# NB OBJECT SPECS

In [12]:
class NB_model:
    """Gaussian naive Bayes classifier.

    Notation:
        Y = (y1, y2, ..., yk) classes
        X = (x1, x2, ..., xn) features

    After ``train``:
        * ``P_Y[j]`` is the prior P(yj) of the j-th entry of ``classes``.
        * ``distributions_XGivenY`` is a DataFrame with one row per feature
          (indexed by feature name) and one column per class (positional
          labels 0..k-1, in the order of ``classes``); each cell holds
          ``[mean, std]`` of that feature within that class.
    """

    def __init__(self):
        self.distributions_XGivenY = []  # P(X|Y)
        self.P_Y = []  # P(Y)
        self.classes = []  # class labels, in training order
        self.features = []  # feature column names, in training order
        self.classname = None  # name of the label column used for training

    def train(self, traindata, classname, classes, features):
        """Estimate class priors and per-class feature distributions.

        Args:
            traindata (pd.DataFrame): Training rows, including the label column.
            classname (str): Name of the label column.
            classes (iterable): Class labels to model.
            features (iterable): Names of the feature columns.
        """
        # Reset state so calling train() again does not append to old results.
        self.classname = classname
        self.classes = list(classes)
        self.features = list(features)
        self.P_Y = []
        per_class_distributions = []

        for classtype in self.classes:
            class_data = traindata[traindata[classname] == classtype]  # grab data for yj
            self.P_Y.append(len(class_data) / len(traindata))  # get P(yj)

            # [mean, std] of every feature, restricted to rows of this class
            per_class_distributions.append(
                [[np.mean(class_data[feature]), np.std(class_data[feature])]
                 for feature in self.features]
            )

        # Rows are classes here, so transpose to get rows=features, columns=classes.
        # columns=self.features (not [self.features]) keeps a flat feature index
        # instead of building a MultiIndex.
        self.distributions_XGivenY = pd.DataFrame(
            per_class_distributions, columns=self.features
        ).transpose()

    @staticmethod
    def _log_likelihood(value, mu, sigma):
        """Return the log of the normal density N(mu, sigma) at ``value``.

        Computed directly in log space so tiny densities do not underflow
        to zero before the logarithm is taken. The expression divides by
        ``sigma``, so ``sigma`` must be non-zero for a finite result.
        """
        return -0.5 * np.log(2 * np.pi * sigma ** 2) - 0.5 * ((value - mu) / sigma) ** 2

    def predict(self, instance):
        """Predict the class of a single instance.

        Picks argmax over j of log P(yj) + SUM_i log P(xi|yj).

        Args:
            instance: Mapping from feature name to value (e.g. a dict or a
                DataFrame row as a Series).

        Returns:
            The entry of ``classes`` with the highest score.

        Raises:
            ValueError: If the model has not been trained.
        """
        if not self.classes:
            raise ValueError("NB_model.predict called before train")

        best_score = -np.inf
        prediction = self.classes[0]

        for position, classtype in enumerate(self.classes):
            class_distribution = self.distributions_XGivenY[position]
            score = np.log(self.P_Y[position])
            for feature in self.features:
                mu, sigma = class_distribution[feature]
                score += self._log_likelihood(instance[feature], mu, sigma)

            if score > best_score:
                best_score = score
                prediction = classtype

        return prediction

    def test_accuracy(self, testdata):
        """Return the fraction of rows in ``testdata`` predicted correctly.

        Args:
            testdata (pd.DataFrame): Rows containing the feature columns and
                the label column that was used in ``train``.

        Returns:
            float: Mean of the per-row 0/1 correctness flags; NaN when
            ``testdata`` has no rows.
        """
        validation_array = [
            int(self.predict(row) == row[self.classname])
            for _, row in testdata.iterrows()
        ]
        if not validation_array:
            return float("nan")
        return np.mean(validation_array)

# PROGRAM START

In [13]:
# Load the dataset named in the configuration cell rather than repeating the literal path.
df = pd.read_csv(PATH_TO_DATA_FILE)

# GET CLASSES FEATURES
# Sorted distinct labels found in the class column.
classes = np.sort(df[CLASS_COLNAME].unique().tolist())
# Every column except the class column and the explicitly omitted ones.
features = [
    feature for feature in df.columns.values.tolist()
    if feature != CLASS_COLNAME and feature not in OMIT_FEATURES
]

# SPLIT TEST TRAIN
# Re-seed immediately before drawing so re-running this cell yields the same split.
np.random.seed(0)
# Each row lands in the training set with probability TRAIN_SIZE.
mask = np.random.rand(len(df)) < TRAIN_SIZE
train = df[mask]
test = df[~mask]

# TRAIN MODEL
mymodel = NB_model()
mymodel.train(train, CLASS_COLNAME, classes, features)
mymodel.distributions_XGivenY

Unnamed: 0,0,1
blueWardsPlaced,"[22.379766711529832, 18.55817943636256]","[22.33348365276212, 17.589402990430433]"
blueWardsDestroyed,"[2.71624046657694, 2.1962917986737454]","[2.9298759864712514, 2.155990654088501]"
blueFirstBlood,"[0.4037685060565276, 0.4906521166502925]","[0.6081172491544532, 0.48817072877762496]"
blueKills,"[5.183266038582324, 2.616220567875615]","[7.195039458850056, 3.054200016045882]"
blueDeaths,"[7.135935397039031, 2.92272324375351]","[5.1521984216459975, 2.581534727250838]"
blueAssists,"[5.5459847465231045, 3.5936822438487916]","[7.76076662908681, 4.2327752520644815]"
blueEliteMonsters,"[0.411395244504262, 0.5638864691651136]","[0.6879368658399098, 0.653859346969575]"
blueDragons,"[0.2613279497532526, 0.43935822790864204]","[0.46516347237880495, 0.49878493997243023]"
blueHeralds,"[0.1500672947510094, 0.3571373710452633]","[0.22277339346110483, 0.4161074484155716]"
blueTowersDestroyed,"[0.02265589950650516, 0.15030383028203634]","[0.07756482525366404, 0.30305386653721955]"


In [19]:
# Evaluate the trained model on the held-out rows.
# likelihood() and validate_result() are defined once in the functions cell;
# they are intentionally not redefined here, so no definition is silently shadowed.
validation_array = []  # store if model was correct (1) or incorrect (0) per test row

for ind in test.index:

    # initiate
    P_yjGivenX = -np.inf  # best log-score so far (np.NINF was removed in NumPy 2.0)
    prediction = 0  # argmax(j=1,k) [ P(yj) * PROD(i=1,n)(P(xi|yj) ]

    for classtype in classes:

        class_distribution_data = mymodel.distributions_XGivenY[classtype]  # grab feature distribution data for yj

        P_xiGivenyj_array = []  # temp array to store log(P(xi|yj)) values

        for feature in features:
            feature_mu, feature_sigma = class_distribution_data[feature]  # grab xi distribution data
            feature_value = test.at[ind, feature]  # grab instance xi data (single label-based lookup)

            # calculate log(P(xi|yj))
            P_xiGivenyj_array.append(np.log(likelihood(feature_value, feature_mu, feature_sigma)))

        # calculate log( P(yj)*PROD(i=1,n)(P(xi|yj) )
        temp_P = np.log(mymodel.P_Y[classtype]) + np.sum(P_xiGivenyj_array)

        if temp_P > P_yjGivenX:
            # argmax(j=1,k) [ P(yj) * PROD(i=1,n)(P(xi|yj) ]
            P_yjGivenX = temp_P  # reuse the score instead of recomputing it
            prediction = classtype

    validation_array.append(validate_result(prediction, ind, test))  # model validation

validation_score = np.mean(validation_array)  # prediction success rate. with np.rand seed = 0, success rate is ~75%
print(validation_score)

0.7454361054766734
