In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import names
from nltk.metrics.scores import precision, recall
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit
from matplotlib import pyplot as plt
import seaborn as sns

### Feature extractor

In [2]:
def team_features(team):
    """Build the feature dictionary for a single team row.

    Parameters
    ----------
    team : row-like object
        Must expose ``SEED``, ``W`` and ``G`` as attributes and the keys
        ``"2P_O"`` and ``"3P_O"`` via subscript (for example a pandas Series).

    Returns
    -------
    dict
        Sixteen boolean flags ``"seed: 1"`` .. ``"seed: 16"`` (exactly one is
        True when the seed lies in 1-16), followed by ``"Win/Loss ratio"``
        (``W / (G - W)``), ``"2-point rate"`` (the ``2P_O`` value) and
        ``"3-point rate"`` (the ``3P_O`` value), in that insertion order.
    """
    seed_number = int(team.SEED)

    # One-hot encode the seed over the 16 possible slots.
    features = {f"seed: {slot}": slot == seed_number for slot in range(1, 17)}

    losses = team.G - team.W
    features["Win/Loss ratio"] = team.W / losses
    features.update({"2-point rate": team["2P_O"], "3-point rate": team["3P_O"]})
    return features

In [3]:
# Load the team table; dropna() removes every row that has a missing value in
# any column.
df = pd.read_csv("cbb.csv").dropna()

# Strip any literal "$" from these columns and store them as floats.
# regex=False is required: as a regular expression "$" is the end-of-string
# anchor, so on pandas >= 2.0 the previous regex=True call matched an empty
# string and removed nothing.
for column in ("SEED", "3P_O", "2P_O", "W"):
    df[column] = (
        df[column].astype(str).str.replace("$", "", regex=False).astype("float")
    )

In [4]:
# Turn every row of df into a (feature dict, label) pair for the classifiers.
data = []
for row_position in range(len(df.TEAM)):
    team_row = df.iloc[row_position]

    # Skip rows whose POSTSEASON or SEED is the literal string "NA".
    if team_row.POSTSEASON == "NA" or team_row.SEED == "NA":
        continue

    # "R68" and "R64" are labelled as a first-round exit; anything else counts
    # as having advanced.
    if team_row.POSTSEASON in ["R68", "R64"]:
        outcome = "First round out"
    else:
        outcome = "Past first round"

    data.append((team_features(team_row), outcome))

In [5]:
def get_classifier(test_size, random_state):
    """Split the module-level ``data`` list and train an NLTK Naive Bayes model.

    Parameters
    ----------
    test_size
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    list
        ``[train_set, test_set, classifier]`` where ``classifier`` was trained
        on ``train_set`` only.
    """
    training_examples, held_out_examples = train_test_split(
        data, random_state=random_state, test_size=test_size
    )
    model = nltk.NaiveBayesClassifier.train(training_examples)
    return [training_examples, held_out_examples, model]

### Naive Bayes classification

In [None]:
# Try test fractions 0.1-0.9 and shuffle seeds 1-49, remembering the run with
# the highest precision at index 0 of the per-class precision array.
# Layout: [best precision, test_size, random_state, (p, r, f, s), classifier]
final_value = [0, 0, 0, 0, 0]
for num in range(1, 10):
    test_size = num / 10.0
    for random_state in range(1, 50):
        train_set, test_set, classifier = get_classifier(test_size, random_state)

        # Unzip the (features, label) pairs once into two parallel lists.
        test_features, y_true = map(list, zip(*test_set))
        y_predict = list(map(classifier.classify, test_features))

        scores = precision_recall_fscore_support(y_true, y_predict)
        candidate_precision = scores[0][0]
        if candidate_precision > final_value[0]:
            final_value[:] = [
                candidate_precision,
                test_size,
                random_state,
                scores,
                classifier,
            ]

In [None]:
# Print the best configuration stored in final_value, then list the 15 most
# informative features of the stored classifier.
print(
    "best precision_recall_fscore_support value (p, r, f, s) with test size of "
    f"{final_value[1]!s} random state of {final_value[2]!s} is {final_value[3]!s}"
)

final_value[4].show_most_informative_features(15)

### Gaussian Naive Bayes classification

In [None]:
# Try test fractions 0.1-0.9 and shuffle seeds 1-49 for a Gaussian Naive Bayes
# model, remembering the run with the highest precision at index 0 of the
# per-class precision array.
# Layout: [best precision, test_size, random_state, (p, r, f, s)]
final_value1 = [0, 0, 0, 0]
for num in range(1, 10):
    test_size = num / 10.0
    for random_state in range(1, 50):
        # Split with the candidate values being searched. The split used to be
        # hard-coded to test_size=0.2 / random_state=36, so every iteration
        # trained the same model and the reported "best" settings were not the
        # ones actually used.
        train_set, test_set = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        # GaussianNB needs a 2D matrix: one row per example, with columns in
        # the insertion order of the feature dictionaries.
        X_train = [list(features.values()) for features, _ in train_set]
        y_train = [label for _, label in train_set]
        X_test = [list(features.values()) for features, _ in test_set]
        y_test = [label for _, label in test_set]

        clf = GaussianNB()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        p, r, f, s = precision_recall_fscore_support(y_test, y_pred)
        if p[0] > final_value1[0]:
            final_value1[0] = p[0]
            final_value1[1] = test_size
            final_value1[2] = random_state
            final_value1[3] = (p, r, f, s)

In [None]:
# Print the best configuration stored in final_value1.
print(
    "best precision_recall_fscore_support value with test size of "
    f"{final_value1[1]!s} random state of {final_value1[2]!s} is {final_value1[3]!s}"
)

### Analysis techniques

In [None]:
# Add a win/loss column to df (W divided by G minus W), then display the first
# ten rows whose SEED equals 16.
df['win/loss'] = df.W.div(df.G.sub(df.W))
df1 = df.loc[df['SEED'].eq(16)]
top_10 = df1.iloc[:10]
top_10

In [None]:
# Two stacked bar charts for the rows in top_10: the 3P_O column on top and the
# win/loss column underneath.
fig, (ax_top, ax_bottom) = plt.subplots(2, 1, figsize=(15, 15))

# Draw on the axes returned by plt.subplots rather than re-selecting them with
# plt.subplot, so the target of each plot is explicit.
sns.barplot(data=top_10, x="TEAM", y="3P_O", ax=ax_top)
ax_top.set_title("3P_O by team")

sns.barplot(data=top_10, x="TEAM", y="win/loss", ax=ax_bottom)
ax_bottom.set_title("Win/loss ratio by team")

plt.show()

In [None]:
# Count the rows of top_10 per POSTSEASON value and per YEAR. The count is
# taken over the W column, so the resulting count column is still named "W".
a = top_10.groupby('POSTSEASON').agg({'W': 'count'}).reset_index()
b = top_10.groupby('YEAR').agg({'W': 'count'}).reset_index()

fig, (ax_top, ax_bottom) = plt.subplots(2, 1, figsize=(15, 15))

# Draw on the axes returned by plt.subplots rather than re-selecting them with
# plt.subplot, and label the y axis as a count (the column name "W" would be
# misleading here).
sns.barplot(data=a, x="POSTSEASON", y="W", ax=ax_top)
ax_top.set_title("Rows per POSTSEASON value")
ax_top.set_ylabel("Number of rows")

sns.barplot(data=b, x="YEAR", y="W", ax=ax_bottom)
ax_bottom.set_title("Rows per YEAR")
ax_bottom.set_ylabel("Number of rows")

plt.show()

In [None]:
# Keep only the rows whose 3P_O value equals 32.6, then display the first ten.
df2 = df.loc[df['3P_O'].eq(32.6)]
top_10_2 = df2.iloc[:10]
top_10_2

In [None]:
# Count the rows of top_10_2 per POSTSEASON value and per YEAR. The count is
# taken over the W column, so the resulting count column is still named "W".
a1 = top_10_2.groupby('POSTSEASON').agg({'W': 'count'}).reset_index()
b1 = top_10_2.groupby('YEAR').agg({'W': 'count'}).reset_index()

fig, (ax_top, ax_bottom) = plt.subplots(2, 1, figsize=(15, 15))

# Draw on the axes returned by plt.subplots rather than re-selecting them with
# plt.subplot, and label the y axis as a count (the column name "W" would be
# misleading here).
sns.barplot(data=a1, x="POSTSEASON", y="W", ax=ax_top)
ax_top.set_title("Rows per POSTSEASON value")
ax_top.set_ylabel("Number of rows")

sns.barplot(data=b1, x="YEAR", y="W", ax=ax_bottom)
ax_bottom.set_title("Rows per YEAR")
ax_bottom.set_ylabel("Number of rows")

plt.show()