# Loading data

In [1]:
import pandas as pd
df = pd.read_csv(r'C:\Users\zzg\Desktop\data.csv', sep = ',')

In [2]:
from sklearn import datasets
import numpy as np
y = df.iloc[:, 11]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Unnamed: 11, dtype: int64

In [3]:
X = df.iloc[:, [0,1,2,3,4,5,6,7,8,9]]
X.head()

Unnamed: 0,red team 1,red team 2,red team 3,red team 4,red team 5,blue team 1,blue team 2,blue team 3,blue team 4,blue team 5
0,96,69,21,5,119,74,34,80,31,63
1,97,120,22,101,125,64,107,72,10,96
2,21,84,127,112,86,80,111,64,69,122
3,8,31,96,48,60,3,138,130,73,30
4,124,32,107,64,137,54,96,10,82,114


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

# Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=0.01, random_state=42)
lr.fit(X_train, y_train)



LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(penalty='l2', 
                                           random_state=1, 
                                           C=100.0))
from sklearn.model_selection import validation_curve


param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
                estimator=pipe_lr, 
                X=X_train, 
                y=y_train, 
                param_name='logisticregression__C', 
                param_range=param_range,
                cv=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean, 
         color='blue', marker='o', 
         markersize=5, label='training accuracy')

plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='blue')

plt.plot(param_range, test_mean, 
         color='green', linestyle='--', 
         marker='s', markersize=5, 
         label='validation accuracy')

plt.fill_between(param_range, 
                 test_mean + test_std,
                 test_mean - test_std, 
                 alpha=0.15, color='green')

plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.4, 0.9])
plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve


pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(penalty='l2', random_state=1))

train_sizes, train_scores, test_scores =\
                learning_curve(estimator=pipe_lr,
                               X=X_train,
                               y=y_train,
                               train_sizes=np.linspace(0.1, 1.0, 10),
                               cv=10,
                               n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.4, 0.9])
plt.tight_layout()
plt.show()

# Evaluation

In [None]:
from sklearn.metrics import accuracy_score
y_pred = lr.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))


In [None]:
from sklearn.metrics import confusion_matrix

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)


In [None]:
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('Predicted label')
plt.ylabel('True label')

plt.tight_layout()

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
y_pred = lr.predict(X_test)
print('ROC AUC: %.3f' % roc_auc_score(y_test,lr.predict_proba(X_test)[:,1]))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='gini', 
                              max_depth=4, 
                              random_state=1)
tree.fit(X_train, y_train)

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve


train_sizes, train_scores, test_scores =\
                learning_curve(estimator=tree,
                               X=X_train,
                               y=y_train,
                               train_sizes=np.linspace(0.1, 1.0, 10),
                               cv=10,
                               n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.4, 0.9])
plt.tight_layout()
plt.show()

# Evaluation

In [None]:
from sklearn.metrics import accuracy_score
y_tree_pred = tree.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_tree_pred))


In [None]:
from sklearn.metrics import roc_auc_score
y_tree_pred = tree.predict(X_test)
print('ROC AUC: %.3f' % roc_auc_score(y_test,tree.predict_proba(X_test)[:,1]))

In [None]:
from sklearn.metrics import confusion_matrix

confmat = confusion_matrix(y_true=y_test, y_pred=y_tree_pred)
print(confmat)


In [None]:
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('Predicted label')
plt.ylabel('True label')

plt.tight_layout()

plt.show()

# Applying bagging to classify samples

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=None,
                              random_state=1)

bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500, 
                        max_samples=1.0, 
                        max_features=1.0, 
                        bootstrap=True, 
                        bootstrap_features=False, 
                        n_jobs=1, 
                        random_state=1)

In [None]:
from sklearn.metrics import accuracy_score

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)

bag_train = accuracy_score(y_train, y_train_pred) 
bag_test = accuracy_score(y_test, y_test_pred) 
print('Bagging train/test accuracies %.3f/%.3f'
      % (bag_train, bag_test))

Decision tree train/test accuracies 1.000/0.543


# Which hero to choose

For given number, for example [67,78,23,7,8],[4,89,62,73,i], I need to find the i to make the blue team win rate largest, use grammar:

In [12]:
df1 = pd.read_csv(r'C:\Users\zzg\Desktop\test data.csv', sep = ',')

In [13]:
df1

Unnamed: 0,red team 1,red team 2,red team 3,red team 4,red team 5,blue team 1,blue team 2,blue team 3,blue team 4,blue team 5
0,67,78,23,7,8,4,89,62,73,1
1,67,78,23,7,8,4,89,62,73,2
2,67,78,23,7,8,4,89,62,73,3
3,67,78,23,7,8,4,89,62,73,4
4,67,78,23,7,8,4,89,62,73,5
5,67,78,23,7,8,4,89,62,73,6
6,67,78,23,7,8,4,89,62,73,7
7,67,78,23,7,8,4,89,62,73,8
8,67,78,23,7,8,4,89,62,73,9
9,67,78,23,7,8,4,89,62,73,10


In [14]:
lr.predict_proba(df1)

array([[ 0.63025166,  0.36974834],
       [ 0.63024752,  0.36975248],
       [ 0.63024338,  0.36975662],
       [ 0.63023925,  0.36976075],
       [ 0.63023511,  0.36976489],
       [ 0.63023097,  0.36976903],
       [ 0.63022683,  0.36977317],
       [ 0.63022269,  0.36977731],
       [ 0.63021855,  0.36978145],
       [ 0.63021441,  0.36978559],
       [ 0.63021027,  0.36978973],
       [ 0.63020614,  0.36979386],
       [ 0.630202  ,  0.369798  ],
       [ 0.63019786,  0.36980214],
       [ 0.63019372,  0.36980628],
       [ 0.63018958,  0.36981042],
       [ 0.63018544,  0.36981456],
       [ 0.6301813 ,  0.3698187 ],
       [ 0.63017716,  0.36982284],
       [ 0.63017302,  0.36982698],
       [ 0.63016888,  0.36983112],
       [ 0.63016475,  0.36983525],
       [ 0.63016061,  0.36983939],
       [ 0.63015647,  0.36984353],
       [ 0.63015233,  0.36984767],
       [ 0.63014819,  0.36985181],
       [ 0.63014405,  0.36985595],
       [ 0.63013991,  0.36986009],
       [ 0.63013577,

we find that choosing one has the highest win rate, so we should choose Jax