In [1]:
import findspark
# Runs before the first `import pyspark` in the next cell.
# NOTE(review): presumably this is what makes pyspark importable in this
# environment (locating the local Spark install) - confirm before removing.
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession, functions, types
# Create the notebook-wide Spark session, or reuse one that already exists,
# under a fixed application name.
spark = (
    SparkSession.builder
    .appName('New Session')
    .getOrCreate()
)

In [3]:
from pyspark.sql import SparkSession, functions, types
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [35]:
# Raw FIFA 20 player table. Only `header=True` is passed (no schema or
# inferSchema), which is why later cells cast columns explicitly.
players_data = spark.read.csv(
    'Downloads/fifa-20-complete-player-dataset/players_20.csv',
    header=True,
)

# Country -> continent lookup, trimmed to the two columns used for the join.
cc = spark.read.csv(
    'Downloads/countryContinent.csv',
    header=True,
).select('country', 'continent')

In [36]:
# Columns that are treated as numbers further down the notebook. The CSV is
# loaded without a schema, so each one is cast to an integer here.
integer_columns = [
    'age', 'weight_kg', 'overall', 'pace', 'passing', 'physic',
    'movement_agility', 'power_stamina', 'mentality_aggression',
    'shooting', 'dribbling', 'defending',
    # value_eur is bucketed with numeric bounds and used as a sort key later;
    # left uncast it would be ordered as text and compared via implicit
    # coercion, so it is cast together with the other numeric columns.
    'value_eur',
]

# One loop instead of a copy-pasted withColumn line per column; withColumn
# on an existing name replaces it in place, so column order is unchanged.
for column_name in integer_columns:
    players_data = players_data.withColumn(
        column_name,
        players_data[column_name].cast(types.IntegerType()),
    )

In [37]:
# Working subset of the player table: demographics, market columns and the
# skill ratings. Later cells join, encode and drop from this frame.
selected_columns = [
    'age', 'weight_kg', 'nationality', 'club',
    'overall', 'potential', 'value_eur', 'wage_eur', 'body_type',
    'pace', 'physic', 'movement_agility', 'power_stamina',
    'mentality_aggression', 'passing', 'shooting', 'defending', 'dribbling',
]
players_a = players_data.select(*selected_columns)

In [38]:
# --- Attach a continent to every player --------------------------------
# Build a nationality -> continent lookup from the distinct nationalities.
# NOTE(review): both joins below are inner joins, so any player whose
# nationality has no exactly matching `country` in `cc` is dropped here -
# confirm the two files spell every country the same way.
nationalities_agg = players_data.select('nationality').distinct()
nationalities_agg = nationalities_agg.join(
    cc, nationalities_agg.nationality == cc.country
).drop('country')

# Both frames descend from players_data, so join on the shared column name
# rather than an `a.nationality == b.nationality` predicate; the result is
# the same set of columns once 'nationality' is dropped.
players_a = players_a.join(nationalities_agg, on='nationality').drop('nationality')

# --- Encode continent as a numeric-looking code --------------------------
# distinct() has no defined row order, so sort before numbering; otherwise
# the code given to each continent could differ from run to run.
nationalities_agg = players_a.select('continent').distinct().orderBy('continent')
country_list = {
    row[0]: str(code)
    for code, row in enumerate(nationalities_agg.collect(), start=1)
}

# --- Encode body type ------------------------------------------------------
# Lean/Normal/Stocky map to 3/2/1; every other value becomes 0.
players_a = players_a.withColumn(
    'body_type',
    functions.when(functions.col('body_type') == 'Lean', 3)
    .when(functions.col('body_type') == 'Normal', 2)
    .when(functions.col('body_type') == 'Stocky', 1)
    .otherwise(0),
)

# Swap each continent name for its code. With a dict as `to_replace` the
# separate `value` argument is not used, so it is omitted. The codes are
# strings, so 'continent' remains a string column.
players_a = players_a.replace(country_list, subset='continent')



In [39]:
# Discard every row that has a null in any column (the default 'any' mode,
# spelled out here for the reader).
players_a = players_a.dropna(how='any')

In [40]:
# Modelling frame: the player table without the club and wage columns.
unused_columns = ('club', 'wage_eur')
df1 = players_a.drop(*unused_columns)

In [41]:
# Label definition: (lower bound, upper bound, class) for value_eur.
# between() is inclusive at both ends, so a value sitting exactly on a shared
# boundary is claimed by the earlier (lower) class because the first matching
# condition wins. Anything outside 10,000..1,000,000 falls through to class 0.
value_buckets = [
    (10000, 200000, 1),
    (200000, 400000, 2),
    (400000, 600000, 3),
    (600000, 800000, 4),
    (800000, 1000000, 5),
]

value_eur = functions.col('value_eur')
first_low, first_high, first_label = value_buckets[0]
bucket_expr = functions.when(value_eur.between(first_low, first_high), first_label)
for low, high, label in value_buckets[1:]:
    bucket_expr = bucket_expr.when(value_eur.between(low, high), label)

df1 = df1.withColumn('value_range', bucket_expr.otherwise(0))

In [42]:
# Most valuable players first.
df1 = df1.orderBy('value_eur', ascending=False)

In [43]:
# 75/25 train/validation split. A fixed seed makes the split repeatable
# across runs on the same input; without it every run trains and evaluates
# on a different partition of the data.
SPLIT_SEED = 42
train, validation = df1.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Both halves are cached so repeated passes over them reuse the stored result.
train = train.cache()
validation = validation.cache()

In [None]:
# Model inputs. 'passing' used to appear twice in this list, which fed the
# same rating into the network as two separate features.
feature_columns = [
    'age', 'weight_kg', 'overall', 'pace', 'passing', 'physic',
    'movement_agility', 'power_stamina', 'mentality_aggression',
    'shooting', 'defending', 'dribbling',
]
feature_vector = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Layer sizes: one input per feature (derived from the list so the two cannot
# drift apart), one hidden layer of 130 units, and 6 outputs for the
# value_range classes 0-5. The seed fixes the initial weights so a rerun on
# the same training data is repeatable.
classifier = MultilayerPerceptronClassifier(
    layers=[len(feature_columns), 130, 6],
    featuresCol='features',
    labelCol='value_range',
    maxIter=500,
    seed=42,
)

ml_pipeline = Pipeline(stages=[feature_vector, classifier])
model = ml_pipeline.fit(train)

# NOTE(review): the model predicts value_range, not wages; the directory name
# is kept as-is in case other code loads the model from this path.
model.write().overwrite().save('wage_modeller')