In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import *
from functools import reduce

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("SystemsToolChains") \
    .getOrCreate()

file_paths = []
for year in range(15, 23):
    file_path = f"archive/players_{year}.csv"
    file_paths.append(file_path)

data_frames = []

for file_path in file_paths:
    df = spark.read.csv(file_path, header=True, inferSchema=True)
    
    # Add a new column for the year.
    year = int(file_path.split("_")[1].split(".")[0])
    df = df.withColumn('year', lit(year))
    
    # for test
    df.show(1, vertical = True)
    
    data_frames.append(df)

merged_df = reduce(lambda x, y: x.union(y), data_frames)

merged_df.show(5, vertical = True)

skill_feature1 = ['st', 'rs','lw','lf','cf', 'rf','rw',
              'lam','cam','ram','lm','lcm','cm','rcm',
              'rm','lwb','ldm','cdm','rdm','rwb','lb',
              'lcb','cb','rcb','rb','gk']
skill_feature2 = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'attacking_crossing',
                 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys',
                 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
                 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions',
                 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
                 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
                 'mentality_vision', 'mentality_penalties', 'defending_marking_awareness', 'defending_standing_tackle',
                 'defending_sliding_tackle', 'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
                 'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed']
feature_list = skill_feature1 + skill_feature2
overall_list = feature_list + ['overall']
player_df = merged_df[overall_list]

missing_count = player_df.select(*[(sum(col(c).isNull().cast("int")).alias(c + "_missing")) for c in player_df.columns])

# show the results
missing_count.show(vertical = True)

columns_to_drop = ['goalkeeping_speed']
columns_to_fill = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']

feature_list.remove('goalkeeping_speed')
overall_list.remove('goalkeeping_speed')


23/11/14 22:41:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/11/14 22:41:34 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


-RECORD 0-------------------------------------------
 sofifa_id                   | 158023               
 player_url                  | https://sofifa.co... 
 short_name                  | L. Messi             
 long_name                   | Lionel Andrés Mes... 
 player_positions            | CF                   
 overall                     | 93                   
 potential                   | 95                   
 value_eur                   | 1.005E8              
 wage_eur                    | 550000.0             
 age                         | 27                   
 dob                         | 1987-06-24           
 height_cm                   | 169                  
 weight_kg                   | 67                   
 club_team_id                | 241.0                
 club_name                   | FC Barcelona         
 league_name                 | Spain Primera Div... 
 league_level                | 1                    
 club_position               | CF             



-RECORD 0-------------------------------------
 st_missing                          | 0      
 rs_missing                          | 0      
 lw_missing                          | 0      
 lf_missing                          | 0      
 cf_missing                          | 0      
 rf_missing                          | 0      
 rw_missing                          | 0      
 lam_missing                         | 0      
 cam_missing                         | 0      
 ram_missing                         | 0      
 lm_missing                          | 0      
 lcm_missing                         | 0      
 cm_missing                          | 0      
 rcm_missing                         | 0      
 rm_missing                          | 0      
 lwb_missing                         | 0      
 ldm_missing                         | 0      
 cdm_missing                         | 0      
 rdm_missing                         | 0      
 rwb_missing                         | 0      
 lb_missing  

                                                                                

#### process data

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch

# 1. Feature Type Casting

#print(data.head())
data_o = player_df

data = data_o.toPandas()

for i in skill_feature1:
    # Convert to string type first
    data[i] = data[i].astype(str)
    
    # Then apply the string operations
    data[i] = data[i].str.split('+').str[0]
    data[i] = data[i].str.split('-').str[0]
    
    # Convert back to numeric type, handling errors for non-numeric values
    data[i] = pd.to_numeric(data[i], errors='coerce')

# 2. Column Dropping
for col_name in columns_to_fill:
    mean_value = data[col_name].mean()
    data[col_name].fillna(mean_value, inplace=True)

# 3. Vector Assembling
feature_data = data[feature_list]

# 4. Feature Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_data)
scaled_features = pd.DataFrame(scaled_features, columns=feature_list)

# 5. Convert to PyTorch Tensor
tensor_data = torch.tensor(scaled_features.to_numpy(), dtype=torch.float32)



                                                                                

In [15]:
# show data
print(feature_data.iloc[0])

st                         89
rs                         89
lw                         92
lf                         90
cf                         90
                           ..
goalkeeping_diving          6
goalkeeping_handling       11
goalkeeping_kicking        15
goalkeeping_positioning    14
goalkeeping_reflexes        8
Name: 0, Length: 65, dtype: int64


In [17]:
true_values = torch.tensor(data['overall'].values, dtype=torch.float32)

In [19]:
print(true_values)

tensor([93., 92., 90.,  ..., 47., 47., 47.])


In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Assuming tensor_data is your input features and true_values is your target values
# Convert them to tensors if they are not already
tensor_data = torch.tensor(tensor_data)
true_values = torch.tensor(true_values)

# Create the custom dataset
dataset = CustomDataset(tensor_data, true_values)

# Split the dataset into train, validation, and test sets (if needed)
torch.manual_seed(2023)

# Assuming 'dataset' is your PyTorch Dataset
total_size = len(dataset)
train_size = int(0.6 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


  tensor_data = torch.tensor(tensor_data)
  true_values = torch.tensor(true_values)


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define the linear regression model
class LinearRegressionModel(nn.Module):
    def __init__(self, input_size):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)

# Assuming input_size is the number of features
input_size = train_dataset[0][0].shape[0]
model = LinearRegressionModel(input_size)

# Define RMSE Loss Function
def rmse_loss(output, target):
    return torch.sqrt(torch.mean((output - target) ** 2))

# Grid search parameters
reg_params = [0.01, 0.1, 1.0]
max_iters = [10, 30, 50]

best_rmse = float('inf')
best_model = None
best_params = None

for reg_param in reg_params:
    for max_iter in max_iters:
        # Initialize model and optimizer
        model = LinearRegressionModel(input_size)
        optimizer = optim.SGD(model.parameters(), lr=reg_param)

        # Training loop
        for epoch in range(max_iter):
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = rmse_loss(outputs, labels)
                loss.backward()
                optimizer.step()

        # Validation
        model.eval()
        total_rmse = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                total_rmse += rmse_loss(outputs, labels).item()

        avg_rmse = total_rmse / len(val_loader)
        print(f'Reg Param: {reg_param}, Max Iter: {max_iter}, Validation RMSE: {avg_rmse}')

        # Update best model
        if avg_rmse < best_rmse:
            best_rmse = avg_rmse
            best_model = model
            best_params = {'regParam': reg_param, 'maxIter': max_iter}

print(f'Best Params: {best_params}, Best Validation RMSE: {best_rmse}')


Reg Param: 0.01, Max Iter: 10, Validation RMSE: 7.052768276618408
Reg Param: 0.01, Max Iter: 30, Validation RMSE: 7.052531012543687
Reg Param: 0.01, Max Iter: 50, Validation RMSE: 7.052517952145757
Reg Param: 0.1, Max Iter: 10, Validation RMSE: 7.052557768048467
Reg Param: 0.1, Max Iter: 30, Validation RMSE: 7.053355112806097
Reg Param: 0.1, Max Iter: 50, Validation RMSE: 7.052695629832981
Reg Param: 1.0, Max Iter: 10, Validation RMSE: 9.337242136130461
Reg Param: 1.0, Max Iter: 30, Validation RMSE: 16.89485637561695
Reg Param: 1.0, Max Iter: 50, Validation RMSE: 28.164455955092972
Best Params: {'regParam': 0.01, 'maxIter': 50}, Best Validation RMSE: 7.052517952145757


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define a simple neural network for regression
class RegressionNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RegressionNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Define RMSE Loss Function
def rmse_loss(output, target):
    return torch.sqrt(torch.mean((output - target) ** 2))

# Hyperparameters to tune
hidden_sizes = [64, 128, 256]
learning_rates = [0.001, 0.01, 0.1]

best_rmse = float('inf')
best_model = None
best_params = None
num_epochs = 10

for hidden_size in hidden_sizes:
    for lr in learning_rates:
        # Initialize model, loss function, and optimizer
        model = RegressionNN(input_size, hidden_size)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        loss_function = nn.MSELoss()

        # Training loop
        for epoch in range(num_epochs):
            model.train()
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = loss_function(outputs, labels)
                loss.backward()
                optimizer.step()

        # Validation
        model.eval()
        total_rmse = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                total_rmse += rmse_loss(outputs, labels).item()

        avg_rmse = total_rmse / len(val_loader)
        print(f'Hidden Size: {hidden_size}, Learning Rate: {lr}, Validation RMSE: {avg_rmse}')

        # Update best model
        if avg_rmse < best_rmse:
            best_rmse = avg_rmse
            best_model = model
            best_params = {'hidden_size': hidden_size, 'learning_rate': lr}

print(f'Best Params: {best_params}, Best Validation RMSE: {best_rmse}')


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Hidden Size: 64, Learning Rate: 0.001, Validation RMSE: 7.175284040940775
Hidden Size: 64, Learning Rate: 0.01, Validation RMSE: 7.087015567599116
Hidden Size: 64, Learning Rate: 0.1, Validation RMSE: 7.2106020289498405
Hidden Size: 128, Learning Rate: 0.001, Validation RMSE: 7.200681208490251
Hidden Size: 128, Learning Rate: 0.01, Validation RMSE: 7.320838622144751
Hidden Size: 128, Learning Rate: 0.1, Validation RMSE: 7.130534578014064
Hidden Size: 256, Learning Rate: 0.001, Validation RMSE: 7.122406677082852
Hidden Size: 256, Learning Rate: 0.01, Validation RMSE: 7.137579348710206
Hidden Size: 256, Learning Rate: 0.1, Validation RMSE: 7.521046274417156
Best Params: {'hidden_size': 64, 'learning_rate': 0.01}, Best Validation RMSE: 7.087015567599116


In [32]:
import torch

# Define RMSE Loss Function (same as before)
def rmse_loss(output, target):
    return torch.sqrt(torch.mean((output - target) ** 2))

# Evaluate on Test Data
best_model.eval()  # Set the model to evaluation mode
total_rmse = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = best_model(inputs)
        total_rmse += rmse_loss(outputs, labels).item()

test_rmse = total_rmse / len(test_loader)
print(f"Test RMSE: {test_rmse}")

Test RMSE: 7.040229406249657
