In [1]:
import pandas as pd
import numpy as np
import pickle
# import the modeling function and libraries from the accompanying functions.py file
from functions import *
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA

## Pistol Round Model
First checking to see if a model can be created for both rounds 1 and 13 together

In [2]:
# Load the combined pistol-round data (rounds 1 and 13); the first CSV column is the index.
round_p = pd.read_csv('../data/round_p.csv', index_col=0)
# Target is team1win; features are every remaining column except 'map'.
y = round_p['team1win']
X = round_p.drop(columns=['team1win', 'map'])

In [3]:
# Fit and score every candidate model via pipeline (imported from functions); it prints a baseline and one score per model.
# The result is indexed positionally below: [1] and [5] are used as the fitted XGBoost / Gradient Boosting models,
# and [9] / [10] as evaluation features / labels — presumably the held-out split; TODO confirm in functions.py.
# NOTE(review): the reports below show equal support for both classes while the printed baseline is ~0.56,
# so the evaluation set looks class-balanced while the baseline does not — confirm how pipeline computes each.
pistol = pipeline(X,y)

baseline: 0.5647113594040968

0: Logistic Regression Score: 0.5031602088485848

1: XGBoost Score: 0.580241824677109

2: Random Forest Score: 0.5489145369607035

3: KNN Score: 0.5075570211596593

4: Decision Tree Score: 0.544517724649629

5: Gradient Boosting Score: 0.5917834569936796

6: AdaBoost Score: 0.5482275350370981

7: Gaussian Naive Bayes Score: 0.5127782357790602

8: Quadratic Discriminant Analysis Score: 0.5145644407804342


Gradient Boosting (.59) and XGBoost (.58) beat the baseline (.56).

#### Pistol Round Classification Report 

In [4]:
# Print one classification report per shortlisted model, with a blank line between reports.
# pistol[9] / pistol[10] are used as evaluation features / labels — presumably the held-out split; TODO confirm.
shortlisted = [('XGBoost:', 1), ('Gradient Boosting', 5)]
for position, (title, model_idx) in enumerate(shortlisted):
    if position > 0:
        print()
    print(title)
    print(classification_report(pistol[10], pistol[model_idx].predict(pistol[9])))

XGBoost:
              precision    recall  f1-score   support

           0       0.64      0.38      0.47      3639
           1       0.56      0.79      0.65      3639

    accuracy                           0.58      7278
   macro avg       0.60      0.58      0.56      7278
weighted avg       0.60      0.58      0.56      7278


Gradient Boosting
              precision    recall  f1-score   support

           0       0.73      0.29      0.42      3639
           1       0.56      0.89      0.69      3639

    accuracy                           0.59      7278
   macro avg       0.64      0.59      0.55      7278
weighted avg       0.64      0.59      0.55      7278



Both models have low recall for 0. As a result, I split the model into two separate models for rounds 1 and 13.

## Round 1 Model

In [5]:
# Load the round 1 data; the first CSV column is the index.
round_1 = pd.read_csv('../data/round_1.csv', index_col=0)
# Target is team1win; features are every remaining column except 'map'.
y = round_1['team1win']
X = round_1.drop(columns=['team1win', 'map'])

#### Model Scores

In [6]:
# Fit and score every candidate model on the round 1 features.
# NOTE: this rebinds round_1 from the loaded DataFrame to the pipeline result (indexed positionally below).
round_1 = pipeline(X,y)

baseline: 0.5592799503414029

0: Logistic Regression Score: 0.49528301886792453

1: XGBoost Score: 0.5552164261931187

2: Random Forest Score: 0.5529966703662597

3: KNN Score: 0.5163706992230854

4: Decision Tree Score: 0.5518867924528302

5: Gradient Boosting Score: 0.5665926748057714

6: AdaBoost Score: 0.5288568257491676

7: Gaussian Naive Bayes Score: 0.5099889012208657

8: Quadratic Discriminant Analysis Score: 0.5061043285238623




Only Gradient Boosting (.567) beat the baseline (.559). Other models which performed slightly worse than the baseline will also be investigated.

#### Classification Report

In [7]:
# Print one classification report per shortlisted model, with a blank line between reports.
# round_1[9] / round_1[10] are used as evaluation features / labels — presumably the held-out split; TODO confirm.
shortlisted = [('XGBoost:', 1), ('Random Forest:', 2), ('Gradient Boosting', 5)]
for position, (title, model_idx) in enumerate(shortlisted):
    if position > 0:
        print()
    print(title)
    print(classification_report(round_1[10], round_1[model_idx].predict(round_1[9])))

XGBoost:
              precision    recall  f1-score   support

           0       0.56      0.48      0.52      1802
           1       0.55      0.63      0.59      1802

    accuracy                           0.56      3604
   macro avg       0.56      0.56      0.55      3604
weighted avg       0.56      0.56      0.55      3604


Random Forest:
              precision    recall  f1-score   support

           0       0.56      0.53      0.54      1802
           1       0.55      0.57      0.56      1802

    accuracy                           0.55      3604
   macro avg       0.55      0.55      0.55      3604
weighted avg       0.55      0.55      0.55      3604


Gradient Boosting
              precision    recall  f1-score   support

           0       0.59      0.45      0.51      1802
           1       0.55      0.69      0.61      1802

    accuracy                           0.57      3604
   macro avg       0.57      0.57      0.56      3604
weighted avg       0.57      0

XGBoost has more balanced precision and recall for 0 and 1 which is better for the purpose of the models. 

### PCA for Round 1

In [8]:
# Fit an unrestricted PCA on the round 1 features (fit returns the estimator itself),
# then show how much variance the first k components explain cumulatively.
pca = PCA().fit(X)

pca.explained_variance_ratio_.cumsum()

array([0.99994494, 0.99994925, 0.99995302, 0.99995596, 0.99995883,
       0.99996164, 0.99996416, 0.9999666 , 0.99996878, 0.99997091,
       0.99997292, 0.99997477, 0.99997654, 0.99997817, 0.99997979,
       0.99998133, 0.99998284, 0.9999843 , 0.99998572, 0.99998707,
       0.99998833, 0.99998954, 0.99999072, 0.99999185, 0.99999291,
       0.99999395, 0.99999485, 0.99999565, 0.99999641, 0.99999712,
       0.99999776, 0.99999827, 0.99999874, 0.99999912, 0.99999945,
       0.99999967, 0.99999984, 0.99999997, 1.        , 1.        ])

In [9]:
# Number of principal components the unrestricted PCA produced (one explained-variance ratio per component).
len(pca.explained_variance_ratio_)

40

#### Model Scores

In [10]:
# Keep 39 of the 40 components: the cumulative explained variance printed above first reaches 1.0 at the 39th.
pca_1 = PCA(n_components=39, random_state=42)
X_pca = pca_1.fit_transform(X)

# Re-run the same model comparison on the PCA-transformed features.
round_1_pca = pipeline(X_pca,y)

baseline: 0.5592799503414029

0: Logistic Regression Score: 0.5008324084350722

1: XGBoost Score: 0.5563263041065483

2: Random Forest Score: 0.5335738068812431

3: KNN Score: 0.5258046614872364

4: Decision Tree Score: 0.5169256381798002

5: Gradient Boosting Score: 0.5341287458379578

6: AdaBoost Score: 0.5246947835738068

7: Gaussian Naive Bayes Score: 0.5047169811320755

8: Quadratic Discriminant Analysis Score: 0.5088790233074362


XGBoost after PCA (.556) gives a slightly better accuracy than the normal XGBoost model (.555). To confirm, the classification report is examined.

In [11]:
# Report for the XGBoost model trained on PCA features.
# The predictions come from round_1_pca[9], so they are scored against round_1_pca[10] — the labels returned by
# the same pipeline run — rather than labels taken from the separate non-PCA run.
# ([9] / [10] are presumably the held-out features / labels; TODO confirm in functions.py.)
print('XGBoost PCA:')
print(classification_report(round_1_pca[10], round_1_pca[1].predict(round_1_pca[9])))

XGBoost PCA:
              precision    recall  f1-score   support

           0       0.56      0.57      0.56      1802
           1       0.56      0.55      0.55      1802

    accuracy                           0.56      3604
   macro avg       0.56      0.56      0.56      3604
weighted avg       0.56      0.56      0.56      3604



The classification report confirms that XGBoost after PCA gives us a slightly better model. For the purpose of this project, the non-PCA XGBoost model (.555) is chosen since we can extract feature importance.

### Feature Importance

In [12]:
# Rank the original features by the importance assigned by the chosen model (round_1[1], the XGBoost entry).
round_1_feature = (
    pd.DataFrame({'Feature Importance': round_1[1].feature_importances_}, index=X.columns)
    .sort_values(by='Feature Importance', ascending=False)
)

round_1_feature.head()

Unnamed: 0,Feature Importance
bankdiff,0.045099
team1_only_sage,0.04093
team2_only_jett,0.038806
team1_only_raze,0.03533
team2_only_kayo,0.034369


## Round 13 Model

In [13]:
# Load the round 13 data; the first CSV column is the index.
round_13 = pd.read_csv('../data/round_13.csv', index_col=0)

# Target is team1win; features are every remaining column except 'map'.
y = round_13['team1win']
X = round_13.drop(columns=['team1win', 'map'])

In [14]:
# Fit and score every candidate model on the round 13 features.
# NOTE: this rebinds round_13 from the loaded DataFrame to the pipeline result (indexed positionally below).
round_13 = pipeline(X,y)

baseline: 0.5704531346989448

0: Logistic Regression Score: 0.516050054406964

1: XGBoost Score: 0.5924918389553863

2: Random Forest Score: 0.5753536452665942

3: KNN Score: 0.5111534276387377

4: Decision Tree Score: 0.5636561479869423

5: Gradient Boosting Score: 0.6009249183895539

6: AdaBoost Score: 0.5620239390642002

7: Gaussian Naive Bayes Score: 0.5182263329706203

8: Quadratic Discriminant Analysis Score: 0.5263873775843307


#### Classification Report

In [15]:
# Print one classification report per shortlisted model, with a blank line between reports.
# round_13[9] / round_13[10] are used as evaluation features / labels — presumably the held-out split; TODO confirm.
shortlisted = [('XGBoost:', 1), ('Random Forests:', 2), ('Gradient Boosting', 5)]
for position, (title, model_idx) in enumerate(shortlisted):
    if position > 0:
        print()
    print(title)
    print(classification_report(round_13[10], round_13[model_idx].predict(round_13[9])))

XGBoost:
              precision    recall  f1-score   support

           0       0.64      0.42      0.51      1838
           1       0.57      0.76      0.65      1838

    accuracy                           0.59      3676
   macro avg       0.60      0.59      0.58      3676
weighted avg       0.60      0.59      0.58      3676


Random Forests:
              precision    recall  f1-score   support

           0       0.59      0.50      0.54      1838
           1       0.57      0.65      0.60      1838

    accuracy                           0.58      3676
   macro avg       0.58      0.58      0.57      3676
weighted avg       0.58      0.58      0.57      3676


Gradient Boosting
              precision    recall  f1-score   support

           0       0.70      0.36      0.47      1838
           1       0.57      0.85      0.68      1838

    accuracy                           0.60      3676
   macro avg       0.63      0.60      0.58      3676
weighted avg       0.63      

XGBoost (.59), Gradient Boosting (.60) and Random Forest (.58) all have higher accuracies than the baseline (.57). While XGBoost and Gradient Boosting have higher accuracies than Random Forest, Random Forest is more balanced across classes 0 and 1 and produces fewer false positives, so it is chosen as a result.

### PCA

In [16]:
# Fit an unrestricted PCA on the round 13 features (fit returns the estimator itself),
# then show how much variance the first k components explain cumulatively.
pca = PCA().fit(X)

pca.explained_variance_ratio_.cumsum()

array([0.99946132, 0.9999342 , 0.99993934, 0.99994384, 0.99994737,
       0.99995079, 0.99995415, 0.99995716, 0.99996008, 0.99996269,
       0.99996523, 0.99996763, 0.99996983, 0.99997195, 0.9999739 ,
       0.99997584, 0.99997768, 0.99997948, 0.99998122, 0.99998293,
       0.99998453, 0.99998604, 0.99998749, 0.9999889 , 0.99999025,
       0.99999152, 0.99999276, 0.99999383, 0.9999948 , 0.99999571,
       0.99999655, 0.99999731, 0.99999793, 0.99999849, 0.99999894,
       0.99999934, 0.9999996 , 0.99999981, 0.99999996, 1.        ])

In [17]:
# Number of principal components the unrestricted PCA produced (one explained-variance ratio per component).
len(pca.explained_variance_ratio_)

40

#### PCA Model Scores

In [18]:
# n_components=40 keeps every component (the fitted PCA above has 40), so the features are re-expressed in the
# principal-component basis without dropping any dimension.
pca_13 = PCA(n_components=40, random_state=42)
X_pca = pca_13.fit_transform(X)

# Re-run the same model comparison on the PCA-transformed features.
round_13_pca = pipeline(X_pca,y)

baseline: 0.5704531346989448

0: Logistic Regression Score: 0.51550598476605

1: XGBoost Score: 0.5824265505984766

2: Random Forest Score: 0.5424374319912949

3: KNN Score: 0.5193144722524483

4: Decision Tree Score: 0.5410772578890098

5: Gradient Boosting Score: 0.5854189336235038

6: AdaBoost Score: 0.5359085963003264

7: Gaussian Naive Bayes Score: 0.5187704026115343

8: Quadratic Discriminant Analysis Score: 0.5266594124047879


None of the PCA models perform better than the previous models; as a result, the initial Random Forest model is used.

### Feature Importance

In [19]:
# Rank the original features by the importance assigned by the chosen model (round_13[2], the Random Forest entry).
round_13_feature = (
    pd.DataFrame({'Feature Importance': round_13[2].feature_importances_}, index=X.columns)
    .sort_values(by='Feature Importance', ascending=False)
)

round_13_feature.head(10)

Unnamed: 0,Feature Importance
rounddiff,0.227999
bankdiff,0.187156
team1_only_sage,0.022297
team2_only_jett,0.021708
team1_only_jett,0.021685
team2_only_raze,0.020985
team1_only_killjoy,0.020707
team2_only_skye,0.020541
team2_only_cypher,0.020215
team2_only_killjoy,0.019908


## Round 2 Model

In [20]:
# Load the round 2 data; the first CSV column is the index.
round_2 = pd.read_csv('../data/round_2.csv', index_col=0)

# Target is team1win; features are every remaining column except 'map'.
y = round_2['team1win']
X = round_2.drop(columns=['team1win', 'map'])

In [21]:
# Fit and score every candidate model on the round 2 features.
# NOTE: this rebinds round_2 from the loaded DataFrame to the pipeline result (indexed positionally below).
round_2 = pipeline(X,y)

baseline: 0.5712069500465405

0: Logistic Regression Score: 0.8719445953286258

1: XGBoost Score: 0.8837588267246062

2: Random Forest Score: 0.8704508419337317

3: KNN Score: 0.846279196089082

4: Decision Tree Score: 0.8032319391634981

5: Gradient Boosting Score: 0.8814502987506789

6: AdaBoost Score: 0.8773764258555133

7: Gaussian Naive Bayes Score: 0.850624660510592

8: Quadratic Discriminant Analysis Score: 0.5118142313959805




#### Classification Report

In [22]:
# Lag-1 baseline: use team1winlag1 (presumably the previous round's result) directly as the prediction.
# NOTE(review): the baseline is scored on the full data (y, X) while the models are scored on
# round_2[9] / round_2[10]; the supports in the output differ, so the reports do not cover the same samples.
print('Baseline:')
print(classification_report(y,X['team1winlag1']))
print('XGBoost:')
print(classification_report(round_2[10], round_2[1].predict(round_2[9])))
print()
print('Gradient Boosting')
print(classification_report(round_2[10], round_2[5].predict(round_2[9])))

Baseline:
              precision    recall  f1-score   support

           0       0.85      0.86      0.86     13818
           1       0.90      0.89      0.89     18408

    accuracy                           0.88     32226
   macro avg       0.87      0.87      0.87     32226
weighted avg       0.88      0.88      0.88     32226

XGBoost:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      3682
           1       0.88      0.89      0.88      3682

    accuracy                           0.88      7364
   macro avg       0.88      0.88      0.88      7364
weighted avg       0.88      0.88      0.88      7364


Gradient Boosting
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      3682
           1       0.88      0.88      0.88      3682

    accuracy                           0.88      7364
   macro avg       0.88      0.88      0.88      7364
weighted avg       0.88      0.88   

XGBoost (.88) and Gradient Boosting (.88) both perform the same as Win Lag 1 baseline (.88). Gradient Boosting has slightly worse recall for 1 when compared to XGBoost. While the accuracy of XGBoost is the same as Baseline, it is slightly more balanced for precision and recall for 0 and 1. As a result XGBoost is chosen.

The models all performed similarly, with the XGBoost model performing slightly better than the other two. The XGBoost model should be used because it is faster to train and has a higher accuracy.

XGBoost should be used as the preferred model: its accuracy matches the Win Lag 1 baseline and is marginally above the other models tested, and its precision, recall, and f1-score are the most balanced across classes 0 and 1.

### PCA for round 2

In [23]:
# Fit an unrestricted PCA on the round 2 features (fit returns the estimator itself),
# then show how much variance the first k components explain cumulatively.
pca = PCA().fit(X)

pca.explained_variance_ratio_.cumsum()

array([0.99855538, 0.99999954, 0.9999999 , 0.99999991, 0.99999992,
       0.99999992, 0.99999993, 0.99999993, 0.99999993, 0.99999994,
       0.99999994, 0.99999995, 0.99999995, 0.99999995, 0.99999996,
       0.99999996, 0.99999996, 0.99999996, 0.99999997, 0.99999997,
       0.99999997, 0.99999997, 0.99999998, 0.99999998, 0.99999998,
       0.99999998, 0.99999998, 0.99999998, 0.99999999, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 0.99999999, 0.99999999,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [24]:
# Number of principal components the unrestricted PCA produced (one explained-variance ratio per component).
len(pca.explained_variance_ratio_)

50

In [25]:
# Keep 36 of the 50 components: the cumulative explained variance printed above first displays 1.0 at the 36th.
pca_2 = PCA(n_components=36, random_state=42)
X_pca = pca_2.fit_transform(X)

# Re-run the same model comparison on the PCA-transformed features.
# Stored under its own name so the Round 1 PCA results held in round_1_pca are not overwritten.
round_2_pca = pipeline(X_pca, y)

baseline: 0.5712069500465405

0: Logistic Regression Score: 0.8738457360130364

1: XGBoost Score: 0.8699076588810429

2: Random Forest Score: 0.8625746876697447

3: KNN Score: 0.7730852797392721

4: Decision Tree Score: 0.7903313416621401

5: Gradient Boosting Score: 0.8734383487235198

6: AdaBoost Score: 0.8703150461705594

7: Gaussian Naive Bayes Score: 0.8675991309071157

8: Quadratic Discriminant Analysis Score: 0.8716730038022814


All the PCA models perform worse than the initial XGBoost model.

### Feature Importance

In [26]:
# Rank the original features by the importance assigned by the chosen model (round_2[1], the XGBoost entry).
round_2_feature = (
    pd.DataFrame({'Feature Importance': round_2[1].feature_importances_}, index=X.columns)
    .sort_values(by='Feature Importance', ascending=False)
)
round_2_feature.head(10)

Unnamed: 0,Feature Importance
team1winlag1,0.764004
team2_eco,0.071886
team1_eco,0.024643
team1_semi_buy,0.021836
team2_semi_buy,0.006184
bankdifflag1,0.004231
team1_only_skye,0.003688
bankdiff,0.003596
team1_only_sage,0.003484
team1_only_raze,0.003365


## Round 3 Model

In [27]:
# Load the round 3 data; the first CSV column is the index.
round_3 = pd.read_csv('../data/round_3.csv', index_col=0)

# Target is team1win; features are every remaining column except 'map'.
y = round_3['team1win']
X = round_3.drop(columns=['team1win', 'map'])

#### Baseline for Round Lag 1

In [28]:
# Share of rows with team1win == 1 among complete rows where team1winlag1 == 1.
(
    round_3[['team1win', 'team1winlag1']]
    .dropna()
    .groupby('team1winlag1')['team1win']
    .mean()
    .loc[1]
)

0.5396176373079898

The baseline taking the majority class for y is higher than the Lag 1 baseline so it is used instead.

#### Models

In [29]:
# Fit and score every candidate model on the round 3 features.
# NOTE: this rebinds round_3 from the loaded DataFrame to the pipeline result (indexed positionally below),
# so the lag-1 baseline cell above, which reads round_3 as a DataFrame, will likely fail if re-run after this
# cell without reloading the CSV.
round_3 = pipeline(X,y)

baseline: 0.5506041110936765

0: Logistic Regression Score: 0.658734682245654

1: XGBoost Score: 0.6749786263892847

2: Random Forest Score: 0.6550299230550014

3: KNN Score: 0.5909090909090909

4: Decision Tree Score: 0.5762325448845825

5: Gradient Boosting Score: 0.6822456540324879

6: AdaBoost Score: 0.6665716728412653

7: Gaussian Naive Bayes Score: 0.6477628954117982

8: Quadratic Discriminant Analysis Score: 0.5786548874323169




Both XGBoost (.67) and Gradient Boosting (.68) have higher accuracies than the baseline (.55). Their classification reports are looked at to determine the better model.

#### Classification Report

In [30]:
# Lag-1 baseline: use team1winlag1 (presumably the previous round's result) directly as the prediction.
# NOTE(review): the baseline is scored on the full data (y, X) while the models are scored on
# round_3[9] / round_3[10]; the supports in the output differ, so the reports do not cover the same samples.
print('Baseline(Lag 1):')
print(classification_report(y,X['team1winlag1']))
print('XGBoost:')
print(classification_report(round_3[10], round_3[1].predict(round_3[9])))
print()
print('Gradient Boosting')
print(classification_report(round_3[10], round_3[5].predict(round_3[9])))

Baseline(Lag 1):
              precision    recall  f1-score   support

           0       0.43      0.42      0.43     14317
           1       0.54      0.56      0.55     17545

    accuracy                           0.49     31862
   macro avg       0.49      0.49      0.49     31862
weighted avg       0.49      0.49      0.49     31862

XGBoost:
              precision    recall  f1-score   support

           0       0.68      0.66      0.67      3509
           1       0.67      0.69      0.68      3509

    accuracy                           0.67      7018
   macro avg       0.68      0.67      0.67      7018
weighted avg       0.68      0.67      0.67      7018


Gradient Boosting
              precision    recall  f1-score   support

           0       0.67      0.71      0.69      3509
           1       0.69      0.65      0.67      3509

    accuracy                           0.68      7018
   macro avg       0.68      0.68      0.68      7018
weighted avg       0.68      

XGBoost is chosen over Gradient Boosting since its precision and recall for 0 and 1 are more balanced than Gradient Boosting. 

### PCA for round 3

In [31]:
# Fit an unrestricted PCA on the round 3 features (fit returns the estimator itself),
# then show how much variance the first k components explain cumulatively.
pca = PCA().fit(X)

pca.explained_variance_ratio_.cumsum()

array([0.94403161, 0.99965416, 0.9999999 , 0.99999997, 0.99999998,
       0.99999998, 0.99999998, 0.99999998, 0.99999998, 0.99999998,
       0.99999998, 0.99999998, 0.99999999, 0.99999999, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 0.99999999, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 0.99999999, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        ])

In [32]:
# Number of principal components the unrestricted PCA produced (one explained-variance ratio per component).
len(pca.explained_variance_ratio_)

52

In [33]:
# Keep 29 of the 52 components: the cumulative explained variance printed above first displays 1.0 at the 29th.
pca_3 = PCA(n_components=29, random_state=42)
X_pca = pca_3.fit_transform(X)

# Re-run the same model comparison on the PCA-transformed features.
# Stored under its own name so the Round 1 PCA results held in round_1_pca are not overwritten.
round_3_pca = pipeline(X_pca, y)

baseline: 0.5506041110936765

0: Logistic Regression Score: 0.6595896266742661

1: XGBoost Score: 0.6598746081504702

2: Random Forest Score: 0.622399544029638

3: KNN Score: 0.5991735537190083

4: Decision Tree Score: 0.5635508691935024

5: Gradient Boosting Score: 0.6654317469364491

6: AdaBoost Score: 0.6500427472214306

7: Gaussian Naive Bayes Score: 0.6551724137931034

8: Quadratic Discriminant Analysis Score: 0.6407808492447991


All the PCA models perform worse than the initial XGBoost model.

### Feature Importance

In [34]:
# Rank the original features by the importance assigned by the chosen model (round_3[1], the XGBoost entry).
round_3_feature = (
    pd.DataFrame({'Feature Importance': round_3[1].feature_importances_}, index=X.columns)
    .sort_values(by='Feature Importance', ascending=False)
)
round_3_feature.head(10)

Unnamed: 0,Feature Importance
team1winlag1,0.275882
team1winlag2,0.158763
team1_semi_buy,0.035289
team1_full_buy,0.034515
team2_full_buy,0.027533
team2_semi_eco,0.02292
team1_semi_eco,0.021127
team2_eco,0.019873
bankdifflag2,0.017308
team2_semi_buy,0.016765


## Round 4 Model

In [35]:
# Load the round 4 data; the first CSV column is the index.
round_4 = pd.read_csv('../data/round_4.csv', index_col=0)

# Target is team1win; features are every remaining column except 'map'.
y = round_4['team1win']
X = round_4.drop(columns=['team1win', 'map'])

#### Model Scores

In [36]:
# Fit and score every candidate model on the round 4 features.
# NOTE: this rebinds round_4 from the loaded DataFrame to the pipeline result (indexed positionally below).
round_4 = pipeline(X,y)

baseline: 0.5775144138372837

0: Logistic Regression Score: 0.6347753743760399

1: XGBoost Score: 0.6831669439822518

2: Random Forest Score: 0.6683305601774819

3: KNN Score: 0.5901275651691625

4: Decision Tree Score: 0.6145313366611204

5: Gradient Boosting Score: 0.6859400998336106

6: AdaBoost Score: 0.6659733777038269

7: Gaussian Naive Bayes Score: 0.6285357737104825

8: Quadratic Discriminant Analysis Score: 0.620077648363838




XGBoost (.68) and Gradient Boosting (.69) perform better than the Time Lag 1 baseline (.60). Their classification reports are looked at to determine which model should be used.

#### Classification Report

In [37]:
# Lag-1 baseline: use team1winlag1 (presumably the previous round's result) directly as the prediction.
# NOTE(review): the baseline is scored on the full data (y, X) while the models are scored on
# round_4[9] / round_4[10]; the supports in the output differ, so the reports do not cover the same samples.
print('Baseline:')
print(classification_report(y,X['team1winlag1']))
print('XGBoost:')
print(classification_report(round_4[10], round_4[1].predict(round_4[9])))
print()
print('Gradient Boosting')
print(classification_report(round_4[10], round_4[5].predict(round_4[9])))

Baseline:
              precision    recall  f1-score   support

           0       0.53      0.57      0.55     13190
           1       0.67      0.63      0.65     18027

    accuracy                           0.60     31217
   macro avg       0.60      0.60      0.60     31217
weighted avg       0.61      0.60      0.61     31217

XGBoost:
              precision    recall  f1-score   support

           0       0.71      0.61      0.66      3606
           1       0.66      0.76      0.70      3606

    accuracy                           0.68      7212
   macro avg       0.69      0.68      0.68      7212
weighted avg       0.69      0.68      0.68      7212


Gradient Boosting
              precision    recall  f1-score   support

           0       0.72      0.61      0.66      3606
           1       0.66      0.77      0.71      3606

    accuracy                           0.69      7212
   macro avg       0.69      0.69      0.68      7212
weighted avg       0.69      0.69   

Both models have similar balance between precision and recall, so Gradient Boosting is chosen since it has a higher accuracy score.

### PCA for round 4

In [38]:
# Fit an unrestricted PCA on the round 4 features (fit returns the estimator itself),
# then show how much variance the first k components explain cumulatively.
pca = PCA().fit(X)

pca.explained_variance_ratio_.cumsum()

array([0.63489714, 0.96741416, 0.99977063, 0.99999994, 0.99999998,
       0.99999998, 0.99999998, 0.99999999, 0.99999999, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 0.99999999, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 0.99999999, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 0.99999999, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        ])

In [39]:
# Number of principal components the unrestricted PCA produced (one explained-variance ratio per component).
len(pca.explained_variance_ratio_)

54

#### PCA Scores

In [40]:
# Keep 36 of the 54 components.
# NOTE(review): the cumulative explained variance printed above first displays 1.0 at the 25th component, so 36
# does not follow the cut-off rule the other rounds appear to use (it matches the round 2 value) — confirm
# whether 25 was intended. The value is left unchanged so the recorded scores stay reproducible.
pca_4 = PCA(n_components=36, random_state=42)
X_pca = pca_4.fit_transform(X)

# Re-run the same model comparison on the PCA-transformed features.
# Stored under its own name so the Round 1 PCA results held in round_1_pca are not overwritten.
round_4_pca = pipeline(X_pca, y)

baseline: 0.5775144138372837

0: Logistic Regression Score: 0.6368552412645591

1: XGBoost Score: 0.656128674431503

2: Random Forest Score: 0.6405990016638935

3: KNN Score: 0.5908208541320022

4: Decision Tree Score: 0.5696062118691071

5: Gradient Boosting Score: 0.6540488075429839

6: AdaBoost Score: 0.6407376594564614

7: Gaussian Naive Bayes Score: 0.6328341652800887

8: Quadratic Discriminant Analysis Score: 0.641846921797005


None of the PCA models have higher accuracy, so the initial Gradient Boosting model is chosen.

### Feature Importance

In [41]:
# Rank the original features by the importance assigned by the chosen model (round_4[5], the Gradient Boosting entry).
round_4_feature = (
    pd.DataFrame({'Feature Importance': round_4[5].feature_importances_}, index=X.columns)
    .sort_values(by='Feature Importance', ascending=False)
)
round_4_feature.head(10)

Unnamed: 0,Feature Importance
team2_full_buy,0.281066
bankdifflag3,0.147573
team1_full_buy,0.095344
team2_semi_eco,0.084091
team2_semi_buy,0.068451
rounddiff,0.059613
team1winlag1,0.055397
team1_semi_eco,0.053198
team1_semi_buy,0.034222
team1winlag3,0.022998


## Overtime Rounds Model

In [42]:
# Load the overtime-round data; the first CSV column is the index.
ot_rounds = pd.read_csv('../data/round_ot.csv', index_col=0)

# Target is team1win; features are every remaining column except 'map'.
y = ot_rounds['team1win']
X = ot_rounds.drop(columns=['team1win', 'map'])

In [43]:
# Fit and score every candidate model on the overtime-round features.
# NOTE: this rebinds ot_rounds from the loaded DataFrame to the pipeline result (indexed positionally below).
ot_rounds = pipeline(X,y)

baseline: 0.584051724137931

0: Logistic Regression Score: 0.513530135301353

1: XGBoost Score: 0.6008610086100861

2: Random Forest Score: 0.5867158671586716

3: KNN Score: 0.515990159901599

4: Decision Tree Score: 0.541820418204182

5: Gradient Boosting Score: 0.5904059040590406

6: AdaBoost Score: 0.5645756457564576

7: Gaussian Naive Bayes Score: 0.4981549815498155

8: Quadratic Discriminant Analysis Score: 0.4981549815498155


XGBoost (.60) and Gradient Boosting (.59) have similar accuracies and are both above the baseline (.58).

### PCA for Overtime

In [44]:
# Fit an unrestricted PCA on the overtime features (fit returns the estimator itself),
# then show how much variance the first k components explain cumulatively.
pca = PCA().fit(X)

pca.explained_variance_ratio_.cumsum()

array([0.13923792, 0.20695447, 0.26430286, 0.31331388, 0.35966101,
       0.40358606, 0.4431776 , 0.48184371, 0.51678431, 0.55110993,
       0.58273126, 0.61233005, 0.64020331, 0.66618622, 0.6913782 ,
       0.71587452, 0.73919087, 0.76180976, 0.78359278, 0.80441086,
       0.82422546, 0.84330791, 0.8615512 , 0.87895079, 0.89461215,
       0.90958836, 0.92339657, 0.93493293, 0.94619237, 0.95688877,
       0.96755737, 0.97527466, 0.98207041, 0.98749338, 0.99185089,
       0.9952253 , 0.99753985, 0.9995185 , 1.        ])

In [45]:
# Number of principal components the unrestricted PCA produced (one explained-variance ratio per component).
len(pca.explained_variance_ratio_)

39

In [46]:
# n_components=39 keeps every component (the fitted PCA above has 39): the cumulative explained variance only
# reaches 1.0 at the last one, so no dimension is dropped here.
pca_ot = PCA(n_components=39, random_state=42)
X_pca = pca_ot.fit_transform(X)

# Re-run the same model comparison on the PCA-transformed features.
# Stored under its own name so the Round 1 PCA results held in round_1_pca are not overwritten.
ot_rounds_pca = pipeline(X_pca, y)

baseline: 0.584051724137931

0: Logistic Regression Score: 0.522140221402214

1: XGBoost Score: 0.5842558425584256

2: Random Forest Score: 0.5781057810578106

3: KNN Score: 0.518450184501845

4: Decision Tree Score: 0.5313653136531366

5: Gradient Boosting Score: 0.5713407134071341

6: AdaBoost Score: 0.5615006150061501

7: Gaussian Naive Bayes Score: 0.5264452644526445

8: Quadratic Discriminant Analysis Score: 0.502460024600246


In [47]:
# Print one classification report per shortlisted model, with a blank line between reports.
# ot_rounds[9] / ot_rounds[10] are used as evaluation features / labels — presumably the held-out split; TODO confirm.
shortlisted = [('XGBoost:', 1), ('Gradient Boosting', 5)]
for position, (title, model_idx) in enumerate(shortlisted):
    if position > 0:
        print()
    print(title)
    print(classification_report(ot_rounds[10], ot_rounds[model_idx].predict(ot_rounds[9])))

XGBoost:
              precision    recall  f1-score   support

           0       0.61      0.56      0.59       813
           1       0.59      0.64      0.61       813

    accuracy                           0.60      1626
   macro avg       0.60      0.60      0.60      1626
weighted avg       0.60      0.60      0.60      1626


Gradient Boosting
              precision    recall  f1-score   support

           0       0.65      0.40      0.49       813
           1       0.57      0.78      0.66       813

    accuracy                           0.59      1626
   macro avg       0.61      0.59      0.57      1626
weighted avg       0.61      0.59      0.57      1626



XGBoost is chosen since its accuracy is higher and its precision and recall are more balanced for 0 and 1.

In [48]:
# Rank the original features by the importance assigned by the chosen model (ot_rounds[1], the XGBoost entry).
round_ot_feature = (
    pd.DataFrame({'Feature Importance': ot_rounds[1].feature_importances_}, index=X.columns)
    .sort_values(by='Feature Importance', ascending=False)
)
round_ot_feature.head(10)

Unnamed: 0,Feature Importance
rounddiff,0.083379
team1_only_omen,0.041743
team1_only_killjoy,0.036296
team2_only_killjoy,0.033328
team1_only_reyna,0.03203
team1_only_raze,0.030697
team1_only_breach,0.030368
team1_only_cypher,0.03004
team1_only_sova,0.028799
team2_only_raze,0.028482


## Mid-Game Rounds Model

In [49]:
# Load the mid-game round data; the first CSV column is the index.
rounds = pd.read_csv('../data/rounds.csv', index_col=0)

# Target is team1win; features are every remaining column except 'map'.
y = rounds['team1win']
X = rounds.drop(columns=['team1win', 'map'])

#### Modeling

In [50]:
# Fit and score every candidate model on the mid-game round features.
# The result is indexed positionally below: [1] / [5] as fitted models and [9] / [10] as evaluation
# features / labels — presumably the held-out split; TODO confirm in functions.py.
mid_game = pipeline(X,y)

baseline: 0.571417986020204

0: Logistic Regression Score: 0.6019667170953101

1: XGBoost Score: 0.6659390533823212

2: Random Forest Score: 0.6347957639939485

3: KNN Score: 0.5662200129673655

4: Decision Tree Score: 0.5851523665441971

5: Gradient Boosting Score: 0.6630862329803329

6: AdaBoost Score: 0.6229522368705425

7: Gaussian Naive Bayes Score: 0.5993300194510482





8: Quadratic Discriminant Analysis Score: 0.5869029608817808


In [51]:
# Lag-1 baseline: use team1winlag1 (presumably the previous round's result) directly as the prediction.
# NOTE(review): the baseline is scored on the full data (y, X) while the models are scored on
# mid_game[9] / mid_game[10]; the supports in the output differ, so the reports do not cover the same samples.
print('Baseline:')
print(classification_report(y,X['team1winlag1']))
print('XGBoost:')
print(classification_report(mid_game[10], mid_game[1].predict(mid_game[9])))
print()
print('Gradient Boosting')
print(classification_report(mid_game[10], mid_game[5].predict(mid_game[9])))

Baseline:
              precision    recall  f1-score   support

           0       0.53      0.54      0.54     86762
           1       0.65      0.64      0.65    115672

    accuracy                           0.60    202434
   macro avg       0.59      0.59      0.59    202434
weighted avg       0.60      0.60      0.60    202434

XGBoost:
              precision    recall  f1-score   support

           0       0.74      0.51      0.60     23135
           1       0.63      0.83      0.71     23135

    accuracy                           0.67     46270
   macro avg       0.69      0.67      0.66     46270
weighted avg       0.69      0.67      0.66     46270


Gradient Boosting
              precision    recall  f1-score   support

           0       0.73      0.52      0.61     23135
           1       0.63      0.81      0.70     23135

    accuracy                           0.66     46270
   macro avg       0.68      0.66      0.66     46270
weighted avg       0.68      0.66   

XGBoost (.67) performs better than the baseline (.60) and Gradient Boosting (.66) so it is chosen.

### PCA for Mid-Game Rounds

In [52]:
# Fit an unrestricted PCA on the mid-game features (fit returns the estimator itself),
# then show how much variance the first k components explain cumulatively.
pca = PCA().fit(X)

pca.explained_variance_ratio_.cumsum()

array([0.63857317, 0.80637996, 0.91434997, 0.99999998, 0.99999999,
       0.99999999, 0.99999999, 0.99999999, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [53]:
# Number of principal components the unrestricted PCA produced (one explained-variance ratio per component).
len(pca.explained_variance_ratio_)

55

#### PCA Modeling

In [54]:
# pca_rounds = PCA(n_components=30, random_state=42)
# X_pca = pca_rounds.fit_transform(X)

In [55]:
# round_1_pca = pipeline(X_pca,y)

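None of the PCA models performed better than the initial XGBoost model, so it is chosen. (The PCA modeling cells above are commented out, so their scores are not reproduced in this notebook.)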

In [56]:
# Rank the original features by the importance assigned by the chosen model (mid_game[1], the XGBoost entry).
rounds_feature = (
    pd.DataFrame({'Feature Importance': mid_game[1].feature_importances_}, index=X.columns)
    .sort_values(by='Feature Importance', ascending=False)
)
rounds_feature.head(10)

Unnamed: 0,Feature Importance
team2_full_buy,0.210781
team1_full_buy,0.099001
team1winlag3,0.096463
team2_semi_buy,0.082458
team1winlag1,0.062545
team1winlag4,0.060533
team1winlag2,0.05633
team1_semi_buy,0.056209
rounddiff,0.035547
team2_semi_eco,0.027285


In every case the model performed similarly to the baseline or better, with higher precision and recall statistics. Tuning these models may increase their accuracies even more. These models show that Valorant round winners can be predicted based on very few metrics. Adding a few more metrics or switching buy type to loadout value may provide even greater results. Adding whether or not a team has ultimate abilities for the round may also improve accuracy. More investigation needs to be done in this matter. This modeling can also be seen as the beginning of an even more useful model that shows round win probabilities throughout each round rather than just at the beginning.