In [1]:
#import libraries and read in data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import naive_bayes, ensemble
from collections import defaultdict

def read_dataset(template, start_idx, end_idx):
    """Load a numbered run of JSON files into one DataFrame.

    template: format string with a single positional placeholder for the
        file number (e.g. "./dataset_{:02}.json").
    start_idx, end_idx: first and last file number to read, both inclusive.

    Returns the rows of all files concatenated in file order, re-indexed
    from 0.
    """
    frames = []
    for file_no in range(start_idx, end_idx + 1):
        path = template.format(file_no)
        frames.append(pd.read_json(path))
    return pd.concat(frames, ignore_index=True)

#load dataset files 00..02 into one frame
df = read_dataset("./dataset_{:02}.json", 0, 2)
#pull the label column out of the features and encode it: "CT" -> 0, anything else -> 1
y_df = df.pop('round_winner').ne("CT").astype("int64")
y_df

0        0
1        0
2        0
3        0
4        0
        ..
15225    1
15226    1
15227    1
15228    1
15229    1
Name: round_winner, Length: 15230, dtype: int64

In [2]:
#build the features: number of players still alive on each team, per row
t_counts = []
ct_counts = []

#every row is kept (no filtering); any player whose team is not "Terrorist" counts as CT
for players in df["alive_players"]:
    teams = [player["team"] for player in players]
    terrorists = teams.count("Terrorist")
    t_counts.append(terrorists)
    ct_counts.append(len(teams) - terrorists)

x_train = pd.DataFrame({"t_alive": t_counts, "ct_alive": ct_counts})
y_train = pd.Series(y_df.tolist())

#hold out the last 10% of rows (original order, no shuffling) as the test set
split_at = int(len(x_train) * .9)
x_test = x_train.iloc[split_at:].reset_index(drop=True)
y_test = y_train.iloc[split_at:].reset_index(drop=True)
x_train = x_train.iloc[:split_at]
y_train = y_train.iloc[:split_at]

In [3]:
class naive_baes:
    """Naive Bayes classifier for discrete (categorical) features and class labels.

    Class priors and per-feature conditional probabilities are estimated by
    counting. Class labels and feature values may be any hashable values
    (ints, strings, ...). Rows are always paired by position, so the index
    of the DataFrame/Series passed in does not matter.
    """

    def __init__(self, alpha=0.0):
        """Create an untrained model.

        alpha: additive (Laplace) smoothing strength. The default 0.0 gives
            plain relative-frequency estimates. A positive value keeps a
            feature value that never occurred with a class from ruling that
            class out entirely.

        Raises ValueError if alpha is negative.
        """
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        self.priors = {}                       # class label -> P(class)
        self.num_features = 0
        self.features = []                     # column names seen during training
        self.classes = []
        self.probabilities = defaultdict(int)  # (feature, value, class) -> P(value | class)
        self._class_counts = {}                # class label -> number of training rows
        self._num_values = {}                  # feature -> number of distinct training values

    def train(self, x, y):
        """Estimate priors and conditional probabilities from training data.

        x: DataFrame with one column per discrete feature.
        y: Series (or any sequence) of class labels, one per row of x.

        Prints the list of classes found, then stores the estimates on the
        instance. Any previous training state is replaced.
        """
        assert len(x) == len(y)
        labels = list(y)
        n = len(labels)

        # A fixed class order makes tie-breaking in predict() reproducible.
        try:
            self.classes = sorted(set(labels))
        except TypeError:
            # Labels that cannot be ordered: fall back to first-seen order.
            self.classes = list(dict.fromkeys(labels))
        print(self.classes)

        #computes the class priors
        class_counts = defaultdict(int)
        for label in labels:
            class_counts[label] += 1
        self._class_counts = dict(class_counts)
        self.priors = {c: class_counts[c] / n for c in self.classes}

        #computes the probabilities of x|y
        self.features = list(x.keys())
        self.num_features = len(self.features)
        self.probabilities = defaultdict(int)
        self._num_values = {}
        for feature in self.features:
            # tolist() gives positional access, independent of x's index.
            column = x[feature].tolist()
            joint = defaultdict(int)
            for value, label in zip(column, labels):
                joint[value, label] += 1
            values = set(column)
            self._num_values[feature] = len(values)
            for value in values:
                for c in self.classes:
                    numerator = joint[value, c] + self.alpha
                    denominator = class_counts[c] + self.alpha * len(values)
                    self.probabilities[feature, value, c] = numerator / denominator

    def _likelihood(self, feature, value, c):
        """Return P(feature == value | class c).

        A combination not seen in training gets probability 0 without
        smoothing, or the smoothed estimate for a zero count when alpha > 0.
        The lookup never adds entries to self.probabilities.
        """
        key = (feature, value, c)
        if key in self.probabilities:
            return self.probabilities[key]
        if self.alpha > 0:
            return self.alpha / (
                self._class_counts[c] + self.alpha * self._num_values.get(feature, 0)
            )
        return 0.0

    @staticmethod
    def _log(p):
        """Natural log that maps a zero probability to -inf without a NumPy warning."""
        return np.log(p) if p > 0 else -np.inf

    def predict(self, x):
        """Predict a class label for every row of x.

        x: DataFrame containing (at least) the feature columns used in train().

        Returns a list with one label per row, in row order. The label with
        the highest log-posterior wins; on a tie (including the case where
        every class has probability zero) the earliest entry of self.classes
        is returned.
        """
        n_rows = len(x)
        # Read the columns once, by name and position.
        columns = [x[feature].tolist() for feature in self.features]
        result = [None] * n_rows

        for row in range(n_rows):
            best_class = self.classes[0]
            best_score = -np.inf
            for c in self.classes:
                score = self._log(self.priors[c])
                for feature, column in zip(self.features, columns):
                    score += self._log(self._likelihood(feature, column[row], c))
                if score > best_score:
                    best_class = c
                    best_score = score
            result[row] = best_class

        return result
        

In [4]:
def accuracy(y1, y2):
    """Return the fraction of positions at which two label sequences agree.

    y1, y2: equal-length sequences of labels (list, NumPy array, pandas
        Series, ...). Elements are compared by position, so a Series index
        is ignored.

    Raises AssertionError if the lengths differ and ValueError if the
    inputs are empty (the ratio would be undefined).
    """
    assert len(y1) == len(y2)
    if len(y1) == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    # zip() walks both inputs positionally; y[i] on a Series is a label lookup.
    matches = sum(1 for a, b in zip(y1, y2) if a == b)
    return matches / len(y1)

In [5]:
#fit the hand-written Naive Bayes model on the training split
mod = naive_baes()
mod.train(x=x_train, y=y_train)

[0, 1]


In [6]:
#score the hand-written model on the held-out rows
result = mod.predict(x=x_test)
accuracy(y1=result, y2=y_test)

1
1


0.808641975308642

In [13]:
#reference model from scikit-learn, trained on the same two features
#(alternatives tried here: naive_bayes.MultinomialNB(), tree.DecisionTreeClassifier())
mod2 = ensemble.GradientBoostingClassifier()
mod2.fit(X=x_train, y=y_train)

GradientBoostingClassifier()

In [14]:
#score the scikit-learn reference model on the same held-out rows
r = mod2.predict(X=x_test)
accuracy(y1=r, y2=y_test)

1
1


0.808641975308642