# Connect to Azure ML workspace

In [1]:
# Connect to the Azure ML workspace and print its basic properties.
# `Workspace` is imported in this cell because the cell runs before the
# notebook's import cell; without it a fresh kernel ("Restart & Run All")
# stops here with NameError.
from azureml.core import Workspace

# Identifiers of the target workspace.
workspace="league-ws"
subscription_id="79451499-b2c0-4513-8dea-ef7f37173fbb"
resource_grp="league"

# Names for the experiment and the persisted model; they are only defined
# here and are not used by the cells visible in this part of the notebook.
experiment_name = "league_predict_temp"
model_name = "leaguepredict.mml" # in case you want to change the name, keep the .mml extension

ws = Workspace(workspace_name = workspace,
               subscription_id = subscription_id,
               resource_group = resource_grp)

# The returned details are not used; presumably this call is here to confirm
# that the workspace is reachable before continuing -- TODO confirm.
ws.get_details()

# One property per line.
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

In [2]:
import azureml.core
from azureml.core import Workspace
from azureml.core.run import Run
from azureml.core.experiment import Experiment

# Report which release of the azureml-core SDK this kernel has loaded
print("SDK version: {}".format(azureml.core.VERSION))

In [3]:
import os
import numpy as np
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Read the final feature matrix from Cosmos DB

In [4]:
## Dataset
# Load the final feature matrix from the Cosmos DB collection named in
# readConfig["Collection"] into a Spark DataFrame.
#
# SECURITY: the account master key must never be stored in the notebook.
# It is read from the COSMOSDB_MASTERKEY environment variable instead. The key
# that used to be hard-coded in this cell should be treated as leaked and
# rotated in the Cosmos DB account.
cosmos_masterkey = os.environ.get("COSMOSDB_MASTERKEY")
if not cosmos_masterkey:
    raise RuntimeError(
        "COSMOSDB_MASTERKEY is not set; export the Cosmos DB account key "
        "in the environment before running this cell."
    )

readConfig = {
"Endpoint" : "https://lolpredict.documents.azure.com:443/",
"Masterkey" : cosmos_masterkey,
"Database" : "league_extraction",
"Collection" : "FINAL_FEATURES_MATRIX_9112", 
}
final_features_matrix = spark.read.format("com.microsoft.azure.cosmosdb.spark").options(**readConfig).load()

In [5]:
# Columns excluded from the model input: the underscore-prefixed document
# fields, the identifiers, the label, and -- for each team (100/200) and each
# role -- the champion_inv, name and summonerId_inv columns.
dropped = [
    '_attachments', '_etag', '_rid', '_self', '_ts',
    'id', 'label', 'match_id',
] + [
    '{}_{}_{}'.format(team, role, field)
    for team in ('100', '200')
    for role in ('CARRY', 'JUNGLE', 'MIDDLE', 'SUPPORT', 'TOP')
    for field in ('champion_inv', 'name', 'summonerId_inv')
]

# Every column name present in the loaded matrix
columns = final_features_matrix.schema.names
# Not referenced by the cells visible here; kept because it is a notebook global
not_col = ['id', 'match_id', 'label', '_attachments', '_etag', '_rid', '_self', '_ts']

# Whatever remains after the exclusions is used as a model feature
features = [name for name in columns if name not in dropped]
features

## Machine learning model

In [6]:
from pyspark.ml.feature import VectorAssembler

# Pack all selected feature columns into a single vector column 'features'
vectorAssembler = VectorAssembler(outputCol='features', inputCols=features)

# Keep only the assembled vector and the target column
class_df = (
    vectorAssembler
    .transform(final_features_matrix)
    .select(['features', 'label'])
)

# 70% / 30% train-test split with a fixed seed
train_df, test_df = class_df.randomSplit([0.7, 0.3], seed=42)

In [7]:
# Row count of the training split produced by randomSplit
train_df.count()

In [8]:
# Row count of the test split produced by randomSplit
test_df.count()

In [9]:
# Train a logistic-regression classifier (default hyper-parameters) on the
# training split, report its training metrics, then evaluate it on the test split.
lr = LogisticRegression()
model = lr.fit(train_df)
# The fitted parameters can be inspected via model.coefficients / model.intercept.

# Training-set results taken from the model's own summary
train_sum = model.summary
accuracy = train_sum.accuracy
print("Train Accuracy {}".format(accuracy))
train_sum.roc.show()
print("Train areaUnderROC: " + str(train_sum.areaUnderROC))

# Test prediction
predictions = model.transform(test_df)
# A bare select() in the middle of a cell is lazy and displays nothing;
# show() is what actually prints a sample of predicted vs. actual labels.
predictions.select("prediction","label").show(5)

# Test evaluation
# evaluate. note only 2 metrics are supported out of the box by Spark ML.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
au_roc = bce.setMetricName('areaUnderROC').evaluate(predictions)
au_prc = bce.setMetricName('areaUnderPR').evaluate(predictions)

# Confusion-matrix cells. The filter is applied to `predictions` directly so
# that both columns it references are still present when it is evaluated.
truePositive = predictions.filter("label = 1 and prediction = 1").count()
falsePositive = predictions.filter("label = 0 and prediction = 1").count()
trueNegative = predictions.filter("label = 0 and prediction = 0").count()
falseNegative = predictions.filter("label = 1 and prediction = 0").count()

# Precision / recall are undefined when their denominator is zero (no predicted
# positives / no actual positives); report NaN instead of raising ZeroDivisionError.
predictedPositive = truePositive + falsePositive
actualPositive = truePositive + falseNegative
precision = truePositive/predictedPositive if predictedPositive else float("nan")
recall = truePositive/actualPositive if actualPositive else float("nan")

print("truePositive: " + str(truePositive))
print("falsePositive: " + str(falsePositive))
print("trueNegative: " + str(trueNegative))
print("falseNegative: " + str(falseNegative))
print("precision: " + str(precision))
print("recall: " + str(recall))

# "Official" statistical measurement (the closer to 1, the better)
print("Area under ROC: {}".format(au_roc))
print("Area Under PR: {}".format(au_prc))
